x264 patch series (PATCH 01/10 – 06/10), Jason Garrett-Glaser, May 2010 — pastebin export captured May 28, 2017. Note: each line below carries a fused pastebin line number that is not part of the original patches; the patches begin at the first "From <sha>" line.
1From f3677c61bc31dbe79d69dee092cba504c3f6f523 Mon Sep 17 00:00:00 2001
2From: Jason Garrett-Glaser <darkshikari@gmail.com>
3Date: Mon, 31 May 2010 11:14:22 -0700
4Subject: [PATCH 01/10] Fix cavlc+deblock+8x8dct (regression in r1612)
5 Add cavlc+8x8dct munging to new deblock system.
6 May have caused minor visual artifacts.
7
8---
9 common/deblock.c | 47 -----------------------------------------------
10 common/macroblock.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
11 2 files changed, 44 insertions(+), 49 deletions(-)
12
13diff --git a/common/deblock.c b/common/deblock.c
14index fc039c5..27c73ae 100644
15--- a/common/deblock.c
16+++ b/common/deblock.c
17@@ -24,46 +24,6 @@
18
19 #include "common.h"
20
21-/* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
22- * entropy coding, but per 64 coeffs for the purpose of deblocking */
23-static void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
24-{
25- uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
26- int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
27- for( int x = 0; x<h->sps->i_mb_width; x++ )
28- {
29- memcpy( buf+x, src+x, 16 );
30- if( transform[x] )
31- {
32- int nnz = src[x][0] | src[x][1];
33- src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
34- nnz = src[x][2] | src[x][3];
35- src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
36- }
37- }
38-}
39-
40-static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
41-{
42- uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
43- for( int x = 0; x < h->sps->i_mb_width; x++ )
44- memcpy( dst+x, buf+x, 16 );
45-}
46-
47-static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
48-{
49- func( h, mb_y, buf );
50- if( mb_y > 0 )
51- func( h, mb_y-1, buf + h->sps->i_mb_width );
52- if( h->sh.b_mbaff )
53- {
54- func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
55- if( mb_y > 0 )
56- func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
57- }
58-}
59-
60-
61 /* Deblocking filter */
62 static const uint8_t i_alpha_table[52+12*2] =
63 {
64@@ -344,10 +304,6 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
65 int stride2y = stridey << b_interlaced;
66 int strideuv = h->fdec->i_stride[1];
67 int stride2uv = strideuv << b_interlaced;
68- uint8_t (*nnz_backup)[16] = h->scratch_buffer;
69-
70- if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
71- munge_cavlc_nnz( h, mb_y, nnz_backup, munge_cavlc_nnz_row );
72
73 for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
74 {
75@@ -427,9 +383,6 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
76 if( !transform_8x8 ) FILTER( , 1, 3, qp, qpc );
77 }
78 }
79-
80- if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
81- munge_cavlc_nnz( h, mb_y, nnz_backup, restore_cavlc_nnz_row );
82 }
83
84 #ifdef HAVE_MMX
85diff --git a/common/macroblock.c b/common/macroblock.c
86index ce510e9..1b2d37b 100644
87--- a/common/macroblock.c
88+++ b/common/macroblock.c
89@@ -344,8 +344,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
90 int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
91 int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
92 ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
93- int buf_nnz = !h->param.b_cabac * h->pps->b_transform_8x8_mode * (h->sps->i_mb_width * 4 * 16 * sizeof(uint8_t));
94- scratch_size = X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_nnz );
95+ scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
96 }
97 int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
98 scratch_size = X264_MAX( scratch_size, buf_mbtree );
99@@ -1013,6 +1012,49 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
100 M32( &h->mb.cache.ref[0][x264_scan8[0]+8*2] ) = refbot;
101 M32( &h->mb.cache.ref[0][x264_scan8[0]+8*3] ) = refbot;
102 }
103+
104+ /* Munge NNZ for cavlc + 8x8dct */
105+ if( !h->param.b_cabac && h->pps->b_transform_8x8_mode )
106+ {
107+ uint8_t (*nnz)[24] = h->mb.non_zero_count;
108+ int top = h->mb.i_mb_top_xy;
109+ int left = h->mb.i_mb_left_xy;
110+
111+ if( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top] )
112+ {
113+ int i8 = x264_scan8[0] - 8;
114+ int nnz_top0 = M16( &nnz[top][8] ) | M16( &nnz[top][12] );
115+ int nnz_top1 = M16( &nnz[top][10] ) | M16( &nnz[top][14] );
116+ M16( &h->mb.cache.non_zero_count[i8+0] ) = nnz_top0 ? 0x0101 : 0;
117+ M16( &h->mb.cache.non_zero_count[i8+2] ) = nnz_top1 ? 0x0101 : 0;
118+ }
119+
120+ if( h->mb.i_neighbour & MB_LEFT && h->mb.mb_transform_size[left] )
121+ {
122+ int i8 = x264_scan8[0] - 1;
123+ int nnz_left0 = M16( &nnz[left][2] ) | M16( &nnz[left][6] );
124+ int nnz_left1 = M16( &nnz[left][10] ) | M16( &nnz[left][14] );
125+ h->mb.cache.non_zero_count[i8+8*0] = !!nnz_left0;
126+ h->mb.cache.non_zero_count[i8+8*1] = !!nnz_left0;
127+ h->mb.cache.non_zero_count[i8+8*2] = !!nnz_left1;
128+ h->mb.cache.non_zero_count[i8+8*3] = !!nnz_left1;
129+ }
130+
131+ if( h->mb.mb_transform_size[h->mb.i_mb_xy] )
132+ {
133+ int nnz0 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
134+ int nnz1 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 4]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 6]] );
135+ int nnz2 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[10]] );
136+ int nnz3 = M16( &h->mb.cache.non_zero_count[x264_scan8[12]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[14]] );
137+ uint32_t nnztop = pack16to32( !!nnz0, !!nnz1 ) * 0x0101;
138+ uint32_t nnzbot = pack16to32( !!nnz2, !!nnz3 ) * 0x0101;
139+
140+ M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*0] ) = nnztop;
141+ M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*1] ) = nnztop;
142+ M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*2] ) = nnzbot;
143+ M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*3] ) = nnzbot;
144+ }
145+ }
146 }
147
148 static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i )
149--
1501.7.0.4
151
152
153From 925b5fd15ac24ccbce54f5e2ff6119f8f4f4710c Mon Sep 17 00:00:00 2001
154From: Jason Garrett-Glaser <darkshikari@gmail.com>
155Date: Sun, 30 May 2010 09:42:53 -0700
156Subject: [PATCH 02/10] Fix ultrafast to actually turn off weightb
157
158---
159 common/common.c | 1 +
160 1 files changed, 1 insertions(+), 0 deletions(-)
161
162diff --git a/common/common.c b/common/common.c
163index 62bef99..fccf2b0 100644
164--- a/common/common.c
165+++ b/common/common.c
166@@ -183,6 +183,7 @@ static int x264_param_apply_preset( x264_param_t *param, const char *preset )
167 param->i_bframe_adaptive = X264_B_ADAPT_NONE;
168 param->rc.b_mb_tree = 0;
169 param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
170+ param->analyse.b_weighted_bipred = 0;
171 }
172 else if( !strcasecmp( preset, "superfast" ) )
173 {
174--
1751.7.0.4
176
177
178From 49a832188629fdea4269977a48102029a6300b8b Mon Sep 17 00:00:00 2001
179From: Jason Garrett-Glaser <darkshikari@gmail.com>
180Date: Thu, 27 May 2010 12:31:41 -0700
181Subject: [PATCH 03/10] Fix omission in libx264 tuning documentation
182
183---
184 x264.h | 2 +-
185 1 files changed, 1 insertions(+), 1 deletions(-)
186
187diff --git a/x264.h b/x264.h
188index 6d7b703..95efd88 100644
189--- a/x264.h
190+++ b/x264.h
191@@ -446,7 +446,7 @@ static const char * const x264_tune_names[] = { "film", "animation", "grain", "s
192
193 /* Multiple tunings can be used if separated by a delimiter in ",./-+",
194 * however multiple psy tunings cannot be used.
195- * film, animation, grain, psnr, and ssim are psy tunings.
196+ * film, animation, grain, stillimage, psnr, and ssim are psy tunings.
197 *
198 * returns 0 on success, negative on failure (e.g. invalid preset/tune name). */
199 int x264_param_default_preset( x264_param_t *, const char *preset, const char *tune );
200--
2011.7.0.4
202
203
204From 69cda7770f3851d2c5785af74b82ba583794c7a6 Mon Sep 17 00:00:00 2001
205From: Jason Garrett-Glaser <darkshikari@gmail.com>
206Date: Wed, 26 May 2010 12:55:35 -0700
207Subject: [PATCH 04/10] Merge some of adaptive quant and weightp
208 Eliminate redundant work; both of them were calculating variance of the frame.
209
210---
211 common/frame.h | 4 +-
212 encoder/analyse.h | 1 -
213 encoder/encoder.c | 12 ++---
214 encoder/ratecontrol.c | 124 +++++++++++++++++++++++++++++++-----------------
215 encoder/slicetype.c | 31 ++----------
216 5 files changed, 92 insertions(+), 80 deletions(-)
217
218diff --git a/common/frame.h b/common/frame.h
219index 91d27b5..ca5cb7a 100644
220--- a/common/frame.h
221+++ b/common/frame.h
222@@ -118,8 +118,8 @@ typedef struct x264_frame
223 uint16_t *i_inv_qscale_factor;
224 int b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */
225 float f_weighted_cost_delta[X264_BFRAME_MAX+2];
226- uint32_t i_pixel_sum;
227- uint64_t i_pixel_ssd;
228+ uint32_t i_pixel_sum[3];
229+ uint64_t i_pixel_ssd[3];
230
231 /* hrd */
232 x264_hrd_t hrd_timing;
233diff --git a/encoder/analyse.h b/encoder/analyse.h
234index 7c2c22c..53e4c2e 100644
235--- a/encoder/analyse.h
236+++ b/encoder/analyse.h
237@@ -33,7 +33,6 @@ void x264_slicetype_decide( x264_t *h );
238 void x264_slicetype_analyse( x264_t *h, int keyframe );
239
240 int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
241-void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame );
242
243 int x264_lookahead_init( x264_t *h, int i_slicetype_length );
244 int x264_lookahead_is_empty( x264_t *h );
245diff --git a/encoder/encoder.c b/encoder/encoder.c
246index 52017ff..6e0dc54 100644
247--- a/encoder/encoder.c
248+++ b/encoder/encoder.c
249@@ -2246,21 +2246,17 @@ int x264_encoder_encode( x264_t *h,
250 fenc->i_pic_struct = PIC_STRUCT_PROGRESSIVE;
251 }
252
253- if( h->frames.b_have_lowres )
254- {
255- if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
256- x264_weight_plane_analyse( h, fenc );
257- x264_frame_init_lowres( h, fenc );
258- }
259-
260 if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
261 {
262 if( x264_macroblock_tree_read( h, fenc ) )
263 return -1;
264 }
265- else if( h->param.rc.i_aq_mode )
266+ else
267 x264_adaptive_quant_frame( h, fenc );
268
269+ if( h->frames.b_have_lowres )
270+ x264_frame_init_lowres( h, fenc );
271+
272 /* 2: Place the frame into the queue for its slice type decision */
273 x264_lookahead_put_frame( h, fenc );
274
275diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
276index a725a24..bf0a400 100644
277--- a/encoder/ratecontrol.c
278+++ b/encoder/ratecontrol.c
279@@ -215,12 +215,14 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
280 stride <<= h->mb.b_interlaced;
281 uint64_t res = h->pixf.var[pix]( frame->plane[i] + offset, stride );
282 uint32_t sum = (uint32_t)res;
283- uint32_t sqr = res >> 32;
284- return sqr - (sum * sum >> shift);
285+ uint32_t ssd = res >> 32;
286+ frame->i_pixel_sum[i] += sum;
287+ frame->i_pixel_ssd[i] += ssd;
288+ return ssd - (sum * sum >> shift);
289 }
290
291 // Find the total AC energy of the block in all planes.
292-static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
293+static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
294 {
295 /* This function contains annoying hacks because GCC has a habit of reordering emms
296 * and putting it after floating point ops. As a result, we put the emms at the end of the
297@@ -239,56 +241,90 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
298 * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
299 float strength;
300 float avg_adj = 0.f;
301- /* Need to init it anyways for MB tree. */
302- if( h->param.rc.f_aq_strength == 0 )
303- {
304- memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
305- memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
306- if( h->frames.b_have_lowres )
307- for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
308- frame->i_inv_qscale_factor[mb_xy] = 256;
309- return;
310+ int width = h->sps->i_mb_width;
311+ int height = h->sps->i_mb_height;
312+ /* Initialize frame stats */
313+ for( int i = 0; i < 3; i++ )
314+ {
315+ frame->i_pixel_sum[i] = 0;
316+ frame->i_pixel_ssd[i] = 0;
317 }
318
319- if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
320+ /* Degenerate cases */
321+ if( h->param.rc.i_aq_mode == X264_AQ_NONE || h->param.rc.f_aq_strength == 0 )
322 {
323- float avg_adj_pow2 = 0.f;
324- for( int mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
325- for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
326- {
327- uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
328- float qp_adj = powf( energy + 1, 0.125f );
329- frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
330- avg_adj += qp_adj;
331- avg_adj_pow2 += qp_adj * qp_adj;
332- }
333- avg_adj /= h->mb.i_mb_count;
334- avg_adj_pow2 /= h->mb.i_mb_count;
335- strength = h->param.rc.f_aq_strength * avg_adj;
336- avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
337+ /* Need to init it anyways for MB tree */
338+ if( h->param.rc.f_aq_strength == 0 )
339+ {
340+ memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
341+ memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
342+ if( h->frames.b_have_lowres )
343+ for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
344+ frame->i_inv_qscale_factor[mb_xy] = 256;
345+ }
346+ /* Need variance data for weighted prediction */
347+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
348+ {
349+ for( int mb_y = 0; mb_y < height; mb_y++ )
350+ for( int mb_x = 0; mb_x < width; mb_x++ )
351+ x264_ac_energy_mb( h, mb_x, mb_y, frame );
352+ }
353+ else
354+ return;
355 }
356+ /* Actual adaptive quantization */
357 else
358- strength = h->param.rc.f_aq_strength * 1.0397f;
359-
360- for( int mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
361- for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
362+ {
363+ if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
364 {
365- float qp_adj;
366- if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
367- {
368- qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
369- qp_adj = strength * (qp_adj - avg_adj);
370- }
371- else
372+ float avg_adj_pow2 = 0.f;
373+ for( int mb_y = 0; mb_y < height; mb_y++ )
374+ for( int mb_x = 0; mb_x < width; mb_x++ )
375+ {
376+ uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
377+ float qp_adj = powf( energy + 1, 0.125f );
378+ frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
379+ avg_adj += qp_adj;
380+ avg_adj_pow2 += qp_adj * qp_adj;
381+ }
382+ avg_adj /= h->mb.i_mb_count;
383+ avg_adj_pow2 /= h->mb.i_mb_count;
384+ strength = h->param.rc.f_aq_strength * avg_adj;
385+ avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
386+ }
387+ else
388+ strength = h->param.rc.f_aq_strength * 1.0397f;
389+
390+ for( int mb_y = 0; mb_y < height; mb_y++ )
391+ for( int mb_x = 0; mb_x < width; mb_x++ )
392 {
393- uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
394- qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
395+ float qp_adj;
396+ if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
397+ {
398+ qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
399+ qp_adj = strength * (qp_adj - avg_adj);
400+ }
401+ else
402+ {
403+ uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
404+ qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
405+ }
406+ frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
407+ frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
408+ if( h->frames.b_have_lowres )
409+ frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
410 }
411- frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
412- frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
413- if( h->frames.b_have_lowres )
414- frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
415- }
416+ }
417+
418+ /* Remove mean from SSD calculation */
419+ for( int i = 0; i < 3; i++ )
420+ {
421+ uint64_t ssd = frame->i_pixel_ssd[i];
422+ uint64_t sum = frame->i_pixel_sum[i];
423+ int w = width*16>>!!i;
424+ int h = height*16>>!!i;
425+ frame->i_pixel_ssd[i] = ssd - (sum * sum + w * h / 2) / (w * h);
426+ }
427 }
428
429 int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
430diff --git a/encoder/slicetype.c b/encoder/slicetype.c
431index 9352367..e454e12 100644
432--- a/encoder/slicetype.c
433+++ b/encoder/slicetype.c
434@@ -67,25 +67,6 @@ static void x264_weight_get_h264( unsigned int weight_nonh264, int offset, x264_
435 w->i_scale = X264_MIN( w->i_scale, 127 );
436 }
437
438-void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame )
439-{
440- uint32_t sad = 0;
441- uint64_t ssd = 0;
442- uint8_t *p = frame->plane[0];
443- int stride = frame->i_stride[0];
444- int width = frame->i_width[0];
445- int height = frame->i_lines[0];
446- for( int y = 0; y < height>>4; y++, p += stride*16 )
447- for( int x = 0; x < width; x += 16 )
448- {
449- uint64_t res = h->pixf.var[PIXEL_16x16]( p + x, stride );
450- sad += (uint32_t)res;
451- ssd += res >> 32;
452- }
453- frame->i_pixel_sum = sad;
454- frame->i_pixel_ssd = ssd - ((uint64_t)sad * sad + width * height / 2) / (width * height);
455-}
456-
457 static NOINLINE uint8_t *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, uint8_t *dest )
458 {
459 int ref0_distance = fenc->i_frame - ref->i_frame - 1;
460@@ -167,10 +148,10 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
461 int found;
462 x264_weight_t *weights = fenc->weight[0];
463
464- fenc_var = round( sqrt( fenc->i_pixel_ssd ) );
465- ref_var = round( sqrt( ref->i_pixel_ssd ) );
466- fenc_mean = (float)fenc->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
467- ref_mean = (float) ref->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
468+ fenc_var = round( sqrt( fenc->i_pixel_ssd[0] ) );
469+ ref_var = round( sqrt( ref->i_pixel_ssd[0] ) );
470+ fenc_mean = (float)fenc->i_pixel_sum[0] / (fenc->i_lines[0] * fenc->i_width[0]);
471+ ref_mean = (float) ref->i_pixel_sum[0] / (fenc->i_lines[0] * fenc->i_width[0]);
472
473 //early termination
474 if( fabs( ref_mean - fenc_mean ) < 0.5 && fabs( 1 - fenc_var / ref_var ) < epsilon )
475@@ -534,8 +515,8 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
476 do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
477 if( do_search[0] )
478 {
479- if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART
480- || h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
481+ if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART ||
482+ h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
483 {
484 x264_emms();
485 x264_weights_analyse( h, frames[b], frames[p0], 1 );
486--
4871.7.0.4
488
489
490From 0bf2d9e3e55fa6b1cda4ca2b1066c3034c575225 Mon Sep 17 00:00:00 2001
491From: Jason Garrett-Glaser <darkshikari@gmail.com>
492Date: Thu, 27 May 2010 10:42:15 -0700
493Subject: [PATCH 05/10] Add fast skip in lookahead motion search
494 Helps speed very significantly on motionless blocks.
495
496---
497 encoder/slicetype.c | 16 +++++++++++++++-
498 1 files changed, 15 insertions(+), 1 deletions(-)
499
500diff --git a/encoder/slicetype.c b/encoder/slicetype.c
501index e454e12..d7cfe5c 100644
502--- a/encoder/slicetype.c
503+++ b/encoder/slicetype.c
504@@ -379,11 +379,25 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
505 CP32( m[l].mvp, mvc[0] );
506 else
507 x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] );
508- x264_me_search( h, &m[l], mvc, i_mvc );
509
510+ /* Fast skip for cases of near-zero residual. Shortcut: don't bother except in the mv0 case,
511+ * since anything else is likely to have enough residual to not trigger the skip. */
512+ if( !M32( m[l].mvp ) )
513+ {
514+ m[l].cost = h->pixf.mbcmp[PIXEL_8x8]( m[l].p_fenc[0], FENC_STRIDE, m[l].p_fref[0], m[l].i_stride[0] );
515+ if( m[l].cost < 64 )
516+ {
517+ M32( m[l].mv ) = 0;
518+ goto skip_motionest;
519+ }
520+ }
521+
522+ x264_me_search( h, &m[l], mvc, i_mvc );
523 m[l].cost -= 2; // remove mvcost from skip mbs
524 if( M32( m[l].mv ) )
525 m[l].cost += 5;
526+
527+skip_motionest:
528 CP32( fenc_mvs[l], m[l].mv );
529 *fenc_costs[l] = m[l].cost;
530 }
531--
5321.7.0.4
533
534
535From f6abca2c4c0e582d522e135773b88f1ab3d459d2 Mon Sep 17 00:00:00 2001
536From: Jason Garrett-Glaser <darkshikari@gmail.com>
537Date: Thu, 27 May 2010 14:27:32 -0700
538Subject: [PATCH 06/10] x86 assembly code for NAL escaping
539 Up to ~10x faster than C depending on CPU.
540 Helps the most at very high bitrates (e.g. lossless).
541 Also make the C code faster and simpler.
542
543---
544 Makefile | 4 +-
545 common/bitstream.c | 92 ++++++++++++++
546 common/bitstream.h | 299 ++++++++++++++++++++++++++++++++++++++++++++
547 common/bs.h | 291 ------------------------------------------
548 common/common.c | 54 --------
549 common/common.h | 5 +-
550 common/x86/bitstream-a.asm | 112 +++++++++++++++++
551 common/x86/deblock-a.asm | 1 +
552 encoder/encoder.c | 3 +-
553 tools/checkasm.c | 52 ++++++++-
554 10 files changed, 561 insertions(+), 352 deletions(-)
555 create mode 100644 common/bitstream.c
556 create mode 100644 common/bitstream.h
557 delete mode 100644 common/bs.h
558 create mode 100644 common/x86/bitstream-a.asm
559
560diff --git a/Makefile b/Makefile
561index 0b43a3e..519e181 100644
562--- a/Makefile
563+++ b/Makefile
564@@ -8,7 +8,7 @@ SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
565 common/frame.c common/dct.c common/cpu.c common/cabac.c \
566 common/common.c common/mdate.c common/rectangle.c \
567 common/set.c common/quant.c common/deblock.c common/vlc.c \
568- common/mvpred.c \
569+ common/mvpred.c common/bitstream.c \
570 encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
571 encoder/set.c encoder/macroblock.c encoder/cabac.c \
572 encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
573@@ -52,7 +52,7 @@ endif
574 ifneq ($(AS),)
575 X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \
576 mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
577- cpu-a.asm dct-32.asm
578+ cpu-a.asm dct-32.asm bitstream-a.asm
579 X86SRC = $(X86SRC0:%=common/x86/%)
580
581 ifeq ($(ARCH),X86)
582diff --git a/common/bitstream.c b/common/bitstream.c
583new file mode 100644
584index 0000000..0aaac21
585--- /dev/null
586+++ b/common/bitstream.c
587@@ -0,0 +1,92 @@
588+/*****************************************************************************
589+ * bitstream.c: h264 encoder library
590+ *****************************************************************************
591+ * Copyright (C) 2010 x264 project
592+ *
593+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
594+ * Jason Garrett-Glaser <darkshikari@gmail.com>
595+ *
596+ * This program is free software; you can redistribute it and/or modify
597+ * it under the terms of the GNU General Public License as published by
598+ * the Free Software Foundation; either version 2 of the License, or
599+ * (at your option) any later version.
600+ *
601+ * This program is distributed in the hope that it will be useful,
602+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
603+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
604+ * GNU General Public License for more details.
605+ *
606+ * You should have received a copy of the GNU General Public License
607+ * along with this program; if not, write to the Free Software
608+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
609+ *****************************************************************************/
610+
611+#include "common.h"
612+
613+static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
614+{
615+ if( src < end ) *dst++ = *src++;
616+ if( src < end ) *dst++ = *src++;
617+ while( src < end )
618+ {
619+ if( src[0] <= 0x03 && !dst[-2] && !dst[-1] )
620+ *dst++ = 0x03;
621+ *dst++ = *src++;
622+ }
623+ return dst;
624+}
625+
626+#ifdef HAVE_MMX
627+uint8_t *x264_nal_escape_mmxext( uint8_t *dst, uint8_t *src, uint8_t *end );
628+uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
629+#endif
630+
631+/****************************************************************************
632+ * x264_nal_encode:
633+ ****************************************************************************/
634+int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode )
635+{
636+ uint8_t *src = nal->p_payload;
637+ uint8_t *end = nal->p_payload + nal->i_payload;
638+ uint8_t *orig_dst = dst;
639+
640+ if( h->param.b_annexb )
641+ {
642+ if( b_long_startcode )
643+ *dst++ = 0x00;
644+ *dst++ = 0x00;
645+ *dst++ = 0x00;
646+ *dst++ = 0x01;
647+ }
648+ else /* save room for size later */
649+ dst += 4;
650+
651+ /* nal header */
652+ *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
653+
654+ dst = h->bsf.nal_escape( dst, src, end );
655+ int size = (dst - orig_dst) - 4;
656+
657+ /* Write the size header for mp4/etc */
658+ if( !h->param.b_annexb )
659+ {
660+ /* Size doesn't include the size of the header we're writing now. */
661+ orig_dst[0] = size>>24;
662+ orig_dst[1] = size>>16;
663+ orig_dst[2] = size>> 8;
664+ orig_dst[3] = size>> 0;
665+ }
666+
667+ return size+4;
668+}
669+
670+void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
671+{
672+ pf->nal_escape = x264_nal_escape_c;
673+#ifdef HAVE_MMX
674+ if( cpu&X264_CPU_MMXEXT )
675+ pf->nal_escape = x264_nal_escape_mmxext;
676+ if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
677+ pf->nal_escape = x264_nal_escape_sse2;
678+#endif
679+}
680diff --git a/common/bitstream.h b/common/bitstream.h
681new file mode 100644
682index 0000000..d018c7d
683--- /dev/null
684+++ b/common/bitstream.h
685@@ -0,0 +1,299 @@
686+/*****************************************************************************
687+ * bitstream.h: h264 encoder library
688+ *****************************************************************************
689+ * Copyright (C) 2003-2008 x264 project
690+ *
691+ * Authors: Loren Merritt <lorenm@u.washington.edu>
692+ * Jason Garrett-Glaser <darkshikari@gmail.com>
693+ * Laurent Aimar <fenrir@via.ecp.fr>
694+ *
695+ * This program is free software; you can redistribute it and/or modify
696+ * it under the terms of the GNU General Public License as published by
697+ * the Free Software Foundation; either version 2 of the License, or
698+ * (at your option) any later version.
699+ *
700+ * This program is distributed in the hope that it will be useful,
701+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
702+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
703+ * GNU General Public License for more details.
704+ *
705+ * You should have received a copy of the GNU General Public License
706+ * along with this program; if not, write to the Free Software
707+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
708+ *****************************************************************************/
709+
710+#ifndef X264_BS_H
711+#define X264_BS_H
712+
713+typedef struct
714+{
715+ uint8_t i_bits;
716+ uint8_t i_size;
717+} vlc_t;
718+
719+typedef struct
720+{
721+ uint16_t i_bits;
722+ uint8_t i_size;
723+ /* Next level table to use */
724+ uint8_t i_next;
725+} vlc_large_t;
726+
727+typedef struct bs_s
728+{
729+ uint8_t *p_start;
730+ uint8_t *p;
731+ uint8_t *p_end;
732+
733+ intptr_t cur_bits;
734+ int i_left; /* i_count number of available bits */
735+ int i_bits_encoded; /* RD only */
736+} bs_t;
737+
738+typedef struct
739+{
740+ int last;
741+ int16_t level[16];
742+ uint8_t run[16];
743+} x264_run_level_t;
744+
745+extern const vlc_t x264_coeff0_token[5];
746+extern const vlc_t x264_coeff_token[5][16][4];
747+extern const vlc_t x264_total_zeros[15][16];
748+extern const vlc_t x264_total_zeros_dc[3][4];
749+extern const vlc_t x264_run_before[7][16];
750+
751+typedef struct
752+{
753+ uint8_t *(*nal_escape) ( uint8_t *dst, uint8_t *src, uint8_t *end );
754+} x264_bitstream_function_t;
755+
756+int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode );
757+void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );
758+
759+/* A larger level table size theoretically could help a bit at extremely
760+ * high bitrates, but the cost in cache is usually too high for it to be
761+ * useful.
762+ * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
763+ * FIXME: Do further testing? */
764+#define LEVEL_TABLE_SIZE 128
765+extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
766+
767+static inline void bs_init( bs_t *s, void *p_data, int i_data )
768+{
769+ int offset = ((intptr_t)p_data & 3);
770+ s->p = s->p_start = (uint8_t*)p_data - offset;
771+ s->p_end = (uint8_t*)p_data + i_data;
772+ s->i_left = (WORD_SIZE - offset)*8;
773+ s->cur_bits = endian_fix32( M32(s->p) );
774+ s->cur_bits >>= (4-offset)*8;
775+}
776+static inline int bs_pos( bs_t *s )
777+{
778+ return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
779+}
780+
781+/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
782+static inline void bs_flush( bs_t *s )
783+{
784+ M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
785+ s->p += WORD_SIZE - s->i_left / 8;
786+ s->i_left = WORD_SIZE*8;
787+}
788+/* The inverse of bs_flush: prepare the bitstream to be written to again. */
789+static inline void bs_realign( bs_t *s )
790+{
791+ int offset = ((intptr_t)s->p & 3);
792+ if( offset )
793+ {
794+ s->p = (uint8_t*)s->p - offset;
795+ s->i_left = (WORD_SIZE - offset)*8;
796+ s->cur_bits = endian_fix32( M32(s->p) );
797+ s->cur_bits >>= (4-offset)*8;
798+ }
799+}
800+
801+static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
802+{
803+ if( WORD_SIZE == 8 )
804+ {
805+ s->cur_bits = (s->cur_bits << i_count) | i_bits;
806+ s->i_left -= i_count;
807+ if( s->i_left <= 32 )
808+ {
809+#ifdef WORDS_BIGENDIAN
810+ M32( s->p ) = s->cur_bits >> (32 - s->i_left);
811+#else
812+ M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
813+#endif
814+ s->i_left += 32;
815+ s->p += 4;
816+ }
817+ }
818+ else
819+ {
820+ if( i_count < s->i_left )
821+ {
822+ s->cur_bits = (s->cur_bits << i_count) | i_bits;
823+ s->i_left -= i_count;
824+ }
825+ else
826+ {
827+ i_count -= s->i_left;
828+ s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
829+ M32( s->p ) = endian_fix( s->cur_bits );
830+ s->p += 4;
831+ s->cur_bits = i_bits;
832+ s->i_left = 32 - i_count;
833+ }
834+ }
835+}
836+
837+/* Special case to eliminate branch in normal bs_write. */
838+/* Golomb never writes an even-size code, so this is only used in slice headers. */
839+static inline void bs_write32( bs_t *s, uint32_t i_bits )
840+{
841+ bs_write( s, 16, i_bits >> 16 );
842+ bs_write( s, 16, i_bits );
843+}
844+
845+static inline void bs_write1( bs_t *s, uint32_t i_bit )
846+{
847+ s->cur_bits <<= 1;
848+ s->cur_bits |= i_bit;
849+ s->i_left--;
850+ if( s->i_left == WORD_SIZE*8-32 )
851+ {
852+ M32( s->p ) = endian_fix32( s->cur_bits );
853+ s->p += 4;
854+ s->i_left = WORD_SIZE*8;
855+ }
856+}
857+
858+static inline void bs_align_0( bs_t *s )
859+{
860+ bs_write( s, s->i_left&7, 0 );
861+ bs_flush( s );
862+}
863+static inline void bs_align_1( bs_t *s )
864+{
865+ bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 );
866+ bs_flush( s );
867+}
868+static inline void bs_align_10( bs_t *s )
869+{
870+ if( s->i_left&7 )
871+ bs_write( s, s->i_left&7, 1 << ( (s->i_left&7) - 1 ) );
872+}
873+
874+/* golomb functions */
875+
876+static const uint8_t x264_ue_size_tab[256] =
877+{
878+ 1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
879+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
880+ 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
881+ 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
882+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
883+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
884+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
885+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
886+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
887+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
888+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
889+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
890+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
891+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
892+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
893+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
894+};
895+
896+static inline void bs_write_ue_big( bs_t *s, unsigned int val )
897+{
898+ int size = 0;
899+ int tmp = ++val;
900+ if( tmp >= 0x10000 )
901+ {
902+ size = 32;
903+ tmp >>= 16;
904+ }
905+ if( tmp >= 0x100 )
906+ {
907+ size += 16;
908+ tmp >>= 8;
909+ }
910+ size += x264_ue_size_tab[tmp];
911+ bs_write( s, size>>1, 0 );
912+ bs_write( s, (size>>1)+1, val );
913+}
914+
915+/* Only works on values under 255. */
916+static inline void bs_write_ue( bs_t *s, int val )
917+{
918+ bs_write( s, x264_ue_size_tab[val+1], val+1 );
919+}
920+
921+static inline void bs_write_se( bs_t *s, int val )
922+{
923+ int size = 0;
924+ /* Faster than (val <= 0 ? -val*2+1 : val*2) */
925+ /* 4 instructions on x86, 3 on ARM */
926+ int tmp = 1 - val*2;
927+ if( tmp < 0 ) tmp = val*2;
928+ val = tmp;
929+
930+ if( tmp >= 0x100 )
931+ {
932+ size = 16;
933+ tmp >>= 8;
934+ }
935+ size += x264_ue_size_tab[tmp];
936+ bs_write( s, size, val );
937+}
938+
939+static inline void bs_write_te( bs_t *s, int x, int val )
940+{
941+ if( x == 1 )
942+ bs_write1( s, 1^val );
943+ else //if( x > 1 )
944+ bs_write_ue( s, val );
945+}
946+
947+static inline void bs_rbsp_trailing( bs_t *s )
948+{
949+ bs_write1( s, 1 );
950+ bs_write( s, s->i_left&7, 0 );
951+}
952+
953+static ALWAYS_INLINE int bs_size_ue( unsigned int val )
954+{
955+ return x264_ue_size_tab[val+1];
956+}
957+
958+static ALWAYS_INLINE int bs_size_ue_big( unsigned int val )
959+{
960+ if( val < 255 )
961+ return x264_ue_size_tab[val+1];
962+ else
963+ return x264_ue_size_tab[(val+1)>>8] + 16;
964+}
965+
966+static ALWAYS_INLINE int bs_size_se( int val )
967+{
968+ int tmp = 1 - val*2;
969+ if( tmp < 0 ) tmp = val*2;
970+ if( tmp < 256 )
971+ return x264_ue_size_tab[tmp];
972+ else
973+ return x264_ue_size_tab[tmp>>8]+16;
974+}
975+
976+static ALWAYS_INLINE int bs_size_te( int x, int val )
977+{
978+ if( x == 1 )
979+ return 1;
980+ else //if( x > 1 )
981+ return x264_ue_size_tab[val+1];
982+}
983+
984+#endif
985diff --git a/common/bs.h b/common/bs.h
986deleted file mode 100644
987index 343a3c9..0000000
988--- a/common/bs.h
989+++ /dev/null
990@@ -1,291 +0,0 @@
991-/*****************************************************************************
992- * bs.h :
993- *****************************************************************************
994- * Copyright (C) 2003-2008 x264 project
995- *
996- * Authors: Loren Merritt <lorenm@u.washington.edu>
997- * Jason Garrett-Glaser <darkshikari@gmail.com>
998- * Laurent Aimar <fenrir@via.ecp.fr>
999- *
1000- * This program is free software; you can redistribute it and/or modify
1001- * it under the terms of the GNU General Public License as published by
1002- * the Free Software Foundation; either version 2 of the License, or
1003- * (at your option) any later version.
1004- *
1005- * This program is distributed in the hope that it will be useful,
1006- * but WITHOUT ANY WARRANTY; without even the implied warranty of
1007- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1008- * GNU General Public License for more details.
1009- *
1010- * You should have received a copy of the GNU General Public License
1011- * along with this program; if not, write to the Free Software
1012- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
1013- *****************************************************************************/
1014-
1015-#ifndef X264_BS_H
1016-#define X264_BS_H
1017-
1018-typedef struct
1019-{
1020- uint8_t i_bits;
1021- uint8_t i_size;
1022-} vlc_t;
1023-
1024-typedef struct
1025-{
1026- uint16_t i_bits;
1027- uint8_t i_size;
1028- /* Next level table to use */
1029- uint8_t i_next;
1030-} vlc_large_t;
1031-
1032-typedef struct bs_s
1033-{
1034- uint8_t *p_start;
1035- uint8_t *p;
1036- uint8_t *p_end;
1037-
1038- intptr_t cur_bits;
1039- int i_left; /* i_count number of available bits */
1040- int i_bits_encoded; /* RD only */
1041-} bs_t;
1042-
1043-typedef struct
1044-{
1045- int last;
1046- int16_t level[16];
1047- uint8_t run[16];
1048-} x264_run_level_t;
1049-
1050-extern const vlc_t x264_coeff0_token[5];
1051-extern const vlc_t x264_coeff_token[5][16][4];
1052-extern const vlc_t x264_total_zeros[15][16];
1053-extern const vlc_t x264_total_zeros_dc[3][4];
1054-extern const vlc_t x264_run_before[7][16];
1055-
1056-/* A larger level table size theoretically could help a bit at extremely
1057- * high bitrates, but the cost in cache is usually too high for it to be
1058- * useful.
1059- * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
1060- * FIXME: Do further testing? */
1061-#define LEVEL_TABLE_SIZE 128
1062-extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
1063-
1064-static inline void bs_init( bs_t *s, void *p_data, int i_data )
1065-{
1066- int offset = ((intptr_t)p_data & 3);
1067- s->p = s->p_start = (uint8_t*)p_data - offset;
1068- s->p_end = (uint8_t*)p_data + i_data;
1069- s->i_left = (WORD_SIZE - offset)*8;
1070- s->cur_bits = endian_fix32( M32(s->p) );
1071- s->cur_bits >>= (4-offset)*8;
1072-}
1073-static inline int bs_pos( bs_t *s )
1074-{
1075- return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
1076-}
1077-
1078-/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
1079-static inline void bs_flush( bs_t *s )
1080-{
1081- M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
1082- s->p += WORD_SIZE - s->i_left / 8;
1083- s->i_left = WORD_SIZE*8;
1084-}
1085-/* The inverse of bs_flush: prepare the bitstream to be written to again. */
1086-static inline void bs_realign( bs_t *s )
1087-{
1088- int offset = ((intptr_t)s->p & 3);
1089- if( offset )
1090- {
1091- s->p = (uint8_t*)s->p - offset;
1092- s->i_left = (WORD_SIZE - offset)*8;
1093- s->cur_bits = endian_fix32( M32(s->p) );
1094- s->cur_bits >>= (4-offset)*8;
1095- }
1096-}
1097-
1098-static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
1099-{
1100- if( WORD_SIZE == 8 )
1101- {
1102- s->cur_bits = (s->cur_bits << i_count) | i_bits;
1103- s->i_left -= i_count;
1104- if( s->i_left <= 32 )
1105- {
1106-#ifdef WORDS_BIGENDIAN
1107- M32( s->p ) = s->cur_bits >> (32 - s->i_left);
1108-#else
1109- M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
1110-#endif
1111- s->i_left += 32;
1112- s->p += 4;
1113- }
1114- }
1115- else
1116- {
1117- if( i_count < s->i_left )
1118- {
1119- s->cur_bits = (s->cur_bits << i_count) | i_bits;
1120- s->i_left -= i_count;
1121- }
1122- else
1123- {
1124- i_count -= s->i_left;
1125- s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
1126- M32( s->p ) = endian_fix( s->cur_bits );
1127- s->p += 4;
1128- s->cur_bits = i_bits;
1129- s->i_left = 32 - i_count;
1130- }
1131- }
1132-}
1133-
1134-/* Special case to eliminate branch in normal bs_write. */
1135-/* Golomb never writes an even-size code, so this is only used in slice headers. */
1136-static inline void bs_write32( bs_t *s, uint32_t i_bits )
1137-{
1138- bs_write( s, 16, i_bits >> 16 );
1139- bs_write( s, 16, i_bits );
1140-}
1141-
1142-static inline void bs_write1( bs_t *s, uint32_t i_bit )
1143-{
1144- s->cur_bits <<= 1;
1145- s->cur_bits |= i_bit;
1146- s->i_left--;
1147- if( s->i_left == WORD_SIZE*8-32 )
1148- {
1149- M32( s->p ) = endian_fix32( s->cur_bits );
1150- s->p += 4;
1151- s->i_left = WORD_SIZE*8;
1152- }
1153-}
1154-
1155-static inline void bs_align_0( bs_t *s )
1156-{
1157- bs_write( s, s->i_left&7, 0 );
1158- bs_flush( s );
1159-}
1160-static inline void bs_align_1( bs_t *s )
1161-{
1162- bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 );
1163- bs_flush( s );
1164-}
1165-static inline void bs_align_10( bs_t *s )
1166-{
1167- if( s->i_left&7 )
1168- bs_write( s, s->i_left&7, 1 << ( (s->i_left&7) - 1 ) );
1169-}
1170-
1171-/* golomb functions */
1172-
1173-static const uint8_t x264_ue_size_tab[256] =
1174-{
1175- 1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
1176- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
1177- 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
1178- 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
1179- 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1180- 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1181- 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1182- 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1183- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1184- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1185- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1186- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1187- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1188- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1189- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1190- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1191-};
1192-
1193-static inline void bs_write_ue_big( bs_t *s, unsigned int val )
1194-{
1195- int size = 0;
1196- int tmp = ++val;
1197- if( tmp >= 0x10000 )
1198- {
1199- size = 32;
1200- tmp >>= 16;
1201- }
1202- if( tmp >= 0x100 )
1203- {
1204- size += 16;
1205- tmp >>= 8;
1206- }
1207- size += x264_ue_size_tab[tmp];
1208- bs_write( s, size>>1, 0 );
1209- bs_write( s, (size>>1)+1, val );
1210-}
1211-
1212-/* Only works on values under 255. */
1213-static inline void bs_write_ue( bs_t *s, int val )
1214-{
1215- bs_write( s, x264_ue_size_tab[val+1], val+1 );
1216-}
1217-
1218-static inline void bs_write_se( bs_t *s, int val )
1219-{
1220- int size = 0;
1221- /* Faster than (val <= 0 ? -val*2+1 : val*2) */
1222- /* 4 instructions on x86, 3 on ARM */
1223- int tmp = 1 - val*2;
1224- if( tmp < 0 ) tmp = val*2;
1225- val = tmp;
1226-
1227- if( tmp >= 0x100 )
1228- {
1229- size = 16;
1230- tmp >>= 8;
1231- }
1232- size += x264_ue_size_tab[tmp];
1233- bs_write( s, size, val );
1234-}
1235-
1236-static inline void bs_write_te( bs_t *s, int x, int val )
1237-{
1238- if( x == 1 )
1239- bs_write1( s, 1^val );
1240- else //if( x > 1 )
1241- bs_write_ue( s, val );
1242-}
1243-
1244-static inline void bs_rbsp_trailing( bs_t *s )
1245-{
1246- bs_write1( s, 1 );
1247- bs_write( s, s->i_left&7, 0 );
1248-}
1249-
1250-static ALWAYS_INLINE int bs_size_ue( unsigned int val )
1251-{
1252- return x264_ue_size_tab[val+1];
1253-}
1254-
1255-static ALWAYS_INLINE int bs_size_ue_big( unsigned int val )
1256-{
1257- if( val < 255 )
1258- return x264_ue_size_tab[val+1];
1259- else
1260- return x264_ue_size_tab[(val+1)>>8] + 16;
1261-}
1262-
1263-static ALWAYS_INLINE int bs_size_se( int val )
1264-{
1265- int tmp = 1 - val*2;
1266- if( tmp < 0 ) tmp = val*2;
1267- if( tmp < 256 )
1268- return x264_ue_size_tab[tmp];
1269- else
1270- return x264_ue_size_tab[tmp>>8]+16;
1271-}
1272-
1273-static ALWAYS_INLINE int bs_size_te( int x, int val )
1274-{
1275- if( x == 1 )
1276- return 1;
1277- else //if( x > 1 )
1278- return x264_ue_size_tab[val+1];
1279-}
1280-
1281-#endif
1282diff --git a/common/common.c b/common/common.c
1283index fccf2b0..2458f65 100644
1284--- a/common/common.c
1285+++ b/common/common.c
1286@@ -1027,60 +1027,6 @@ void x264_picture_clean( x264_picture_t *pic )
1287 }
1288
1289 /****************************************************************************
1290- * x264_nal_encode:
1291- ****************************************************************************/
1292-int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode )
1293-{
1294- uint8_t *src = nal->p_payload;
1295- uint8_t *end = nal->p_payload + nal->i_payload;
1296- uint8_t *orig_dst = dst;
1297- int i_count = 0, size;
1298-
1299- if( b_annexb )
1300- {
1301- if( b_long_startcode )
1302- *dst++ = 0x00;
1303- *dst++ = 0x00;
1304- *dst++ = 0x00;
1305- *dst++ = 0x01;
1306- }
1307- else /* save room for size later */
1308- dst += 4;
1309-
1310- /* nal header */
1311- *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
1312-
1313- while( src < end )
1314- {
1315- if( i_count == 2 && *src <= 0x03 )
1316- {
1317- *dst++ = 0x03;
1318- i_count = 0;
1319- }
1320- if( *src == 0 )
1321- i_count++;
1322- else
1323- i_count = 0;
1324- *dst++ = *src++;
1325- }
1326- size = (dst - orig_dst) - 4;
1327-
1328- /* Write the size header for mp4/etc */
1329- if( !b_annexb )
1330- {
1331- /* Size doesn't include the size of the header we're writing now. */
1332- orig_dst[0] = size>>24;
1333- orig_dst[1] = size>>16;
1334- orig_dst[2] = size>> 8;
1335- orig_dst[3] = size>> 0;
1336- }
1337-
1338- return size+4;
1339-}
1340-
1341-
1342-
1343-/****************************************************************************
1344 * x264_malloc:
1345 ****************************************************************************/
1346 void *x264_malloc( int i_size )
1347diff --git a/common/common.h b/common/common.h
1348index 539ea65..93712fe 100644
1349--- a/common/common.h
1350+++ b/common/common.h
1351@@ -137,7 +137,7 @@ static const int x264_scan8[16+2*4+3] =
1352 */
1353
1354 #include "x264.h"
1355-#include "bs.h"
1356+#include "bitstream.h"
1357 #include "set.h"
1358 #include "predict.h"
1359 #include "pixel.h"
1360@@ -166,8 +166,6 @@ int64_t x264_mdate( void );
1361 * the encoding options */
1362 char *x264_param2string( x264_param_t *p, int b_res );
1363
1364-int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode );
1365-
1366 /* log */
1367 void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
1368
1369@@ -796,6 +794,7 @@ struct x264_t
1370 x264_zigzag_function_t zigzagf;
1371 x264_quant_function_t quantf;
1372 x264_deblock_function_t loopf;
1373+ x264_bitstream_function_t bsf;
1374
1375 #ifdef HAVE_VISUALIZE
1376 struct visualize_t *visualize;
1377diff --git a/common/x86/bitstream-a.asm b/common/x86/bitstream-a.asm
1378new file mode 100644
1379index 0000000..1fb4cea
1380--- /dev/null
1381+++ b/common/x86/bitstream-a.asm
1382@@ -0,0 +1,112 @@
1383+;*****************************************************************************
1384+;* bitstream-a.asm: h264 encoder library
1385+;*****************************************************************************
1386+;* Copyright (C) 2010 x264 project
1387+;*
1388+;* Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
1389+;*
1390+;* This program is free software; you can redistribute it and/or modify
1391+;* it under the terms of the GNU General Public License as published by
1392+;* the Free Software Foundation; either version 2 of the License, or
1393+;* (at your option) any later version.
1394+;*
1395+;* This program is distributed in the hope that it will be useful,
1396+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
1397+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1398+;* GNU General Public License for more details.
1399+;*
1400+;* You should have received a copy of the GNU General Public License
1401+;* along with this program; if not, write to the Free Software
1402+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
1403+;*****************************************************************************
1404+
1405+%include "x86inc.asm"
1406+%include "x86util.asm"
1407+
1408+SECTION .text
1409+
1410+;-----------------------------------------------------------------------------
1411+; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
1412+;-----------------------------------------------------------------------------
1413+
1414+%macro NAL_LOOP 2
1415+ALIGN 16
1416+%1:
1417+ mova m0, [r1+r2]
1418+ mova m1, m0
1419+%if mmsize == 8
1420+ psrlq m0, 8
1421+%else
1422+ psrldq m0, 1
1423+%endif
1424+ %2 [r0+r1], m1
1425+ por m1, m0
1426+ pcmpeqb m1, m2
1427+ pmovmskb r3d, m1
1428+ test r3d, r3d
1429+ jnz .escape
1430+ add r1, mmsize
1431+ jl %1
1432+%endmacro
1433+
1434+%macro NAL_ESCAPE 1
1435+
1436+cglobal nal_escape_%1, 3,5
1437+ pxor m2, m2
1438+ sub r1, r2 ; r1 = offset of current src pointer from end of src
1439+ sub r0, r1 ; r0 = projected end of dst, assuming no more escapes
1440+
1441+ mov r3w, [r1+r2]
1442+ mov [r0+r1], r3w
1443+ add r1, 2
1444+ jge .ret
1445+
1446+ ; Start off by jumping into the escape loop in
1447+ ; case there's an escape at the start.
1448+ ; And do a few more in scalar until src is aligned again.
1449+ lea r4d, [r1+r2]
1450+ or r4d, -mmsize
1451+ neg r4d
1452+ jmp .escapeloop
1453+
1454+ NAL_LOOP .loop_aligned, mova
1455+%if mmsize==16
1456+ NAL_LOOP .loop_unaligned, movu
1457+%endif
1458+
1459+.ret:
1460+ movifnidn rax, r0
1461+ RET
1462+ALIGN 16
1463+.escape:
1464+ mov r4d, mmsize
1465+.escapeloop:
1466+ mov r3b, [r1+r2]
1467+ cmp r3b, 3
1468+ jna .escape_check
1469+.copy:
1470+ mov [r0+r1], r3b
1471+ inc r1
1472+ jge .ret
1473+ dec r4d
1474+ jg .escapeloop
1475+ cmp byte [r1+r2-1], 0 ; Don't go back to the main loop until we're out of a zero-run.
1476+ jz .escape
1477+%if mmsize==16
1478+ lea r4d, [r0+r1]
1479+ test r4d, mmsize-1
1480+ jnz .loop_unaligned
1481+%endif
1482+ jmp .loop_aligned
1483+.escape_check:
1484+ cmp word [r0+r1-2], 0
1485+ jnz .copy
1486+ mov byte [r0+r1], 3
1487+ inc r0
1488+ jmp .copy
1489+%endmacro
1490+
1491+INIT_MMX
1492+NAL_ESCAPE mmxext
1493+INIT_XMM
1494+NAL_ESCAPE sse2
1495diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
1496index aedd688..3a31e26 100644
1497--- a/common/x86/deblock-a.asm
1498+++ b/common/x86/deblock-a.asm
1499@@ -4,6 +4,7 @@
1500 ;* Copyright (C) 2005-2008 x264 project
1501 ;*
1502 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
1503+;* Jason Garrett-Glaser <darkshikari@gmail.com>
1504 ;*
1505 ;* This program is free software; you can redistribute it and/or modify
1506 ;* it under the terms of the GNU General Public License as published by
1507diff --git a/encoder/encoder.c b/encoder/encoder.c
1508index 6e0dc54..32db82a 100644
1509--- a/encoder/encoder.c
1510+++ b/encoder/encoder.c
1511@@ -986,6 +986,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
1512 x264_mc_init( h->param.cpu, &h->mc );
1513 x264_quant_init( h, h->param.cpu, &h->quantf );
1514 x264_deblock_init( h->param.cpu, &h->loopf );
1515+ x264_bitstream_init( h->param.cpu, &h->bsf );
1516 x264_dct_init_weights();
1517
1518 mbcmp_init( h );
1519@@ -1272,7 +1273,7 @@ static int x264_encoder_encapsulate_nals( x264_t *h, int start )
1520 for( int i = start; i < h->out.i_nal; i++ )
1521 {
1522 int long_startcode = !i || h->out.nal[i].i_type == NAL_SPS || h->out.nal[i].i_type == NAL_PPS;
1523- int size = x264_nal_encode( nal_buffer, &h->out.nal[i], h->param.b_annexb, long_startcode );
1524+ int size = x264_nal_encode( h, nal_buffer, &h->out.nal[i], long_startcode );
1525 h->out.nal[i].i_payload = size;
1526 h->out.nal[i].p_payload = nal_buffer;
1527 nal_buffer += size;
1528diff --git a/tools/checkasm.c b/tools/checkasm.c
1529index a0a9d54..ea6f209 100644
1530--- a/tools/checkasm.c
1531+++ b/tools/checkasm.c
1532@@ -1661,6 +1661,55 @@ static int check_cabac( int cpu_ref, int cpu_new )
1533 return ret;
1534 }
1535
1536+static int check_bitstream( int cpu_ref, int cpu_new )
1537+{
1538+ x264_bitstream_function_t bs_c;
1539+ x264_bitstream_function_t bs_ref;
1540+ x264_bitstream_function_t bs_a;
1541+
1542+ int ret = 0, ok = 1, used_asm = 0;
1543+
1544+ x264_bitstream_init( 0, &bs_c );
1545+ x264_bitstream_init( cpu_ref, &bs_ref );
1546+ x264_bitstream_init( cpu_new, &bs_a );
1547+ if( bs_a.nal_escape != bs_ref.nal_escape )
1548+ {
1549+ int size = 0x4000;
1550+ uint8_t *input = malloc(size+100);
1551+ uint8_t *output1 = malloc(size*2);
1552+ uint8_t *output2 = malloc(size*2);
1553+ used_asm = 1;
1554+ set_func_name( "nal_escape" );
1555+ for( int i = 0; i < 100; i++ )
1556+ {
1557+ /* Test corner-case sizes */
1558+ int test_size = i < 10 ? i+1 : rand() & 0x3fff;
1559+ for( int j = 0; j < test_size; j++ )
1560+ input[j] = (rand()&1) * rand();
1561+ uint8_t *end_c = (uint8_t*)call_c1( bs_c.nal_escape, output1, input, input+test_size );
1562+ uint8_t *end_a = (uint8_t*)call_a1( bs_a.nal_escape, output2, input, input+test_size );
1563+ int size_c = end_c-output1;
1564+ int size_a = end_a-output2;
1565+ if( size_c != size_a || memcmp( output1, output2, size_c ) )
1566+ {
1567+ fprintf( stderr, "nal_escape : [FAILED] %d %d\n", size_c, size_a );
1568+ ok = 0;
1569+ break;
1570+ }
1571+ }
1572+ for( int j = 0; j < size; j++ )
1573+ input[j] = rand();
1574+ call_c2( bs_c.nal_escape, output1, input, input+size );
1575+ call_a2( bs_a.nal_escape, output2, input, input+size );
1576+ free(input);
1577+ free(output1);
1578+ free(output2);
1579+ }
1580+ report( "nal escape:" );
1581+
1582+ return ret;
1583+}
1584+
1585 static int check_all_funcs( int cpu_ref, int cpu_new )
1586 {
1587 return check_pixel( cpu_ref, cpu_new )
1588@@ -1669,7 +1718,8 @@ static int check_all_funcs( int cpu_ref, int cpu_new )
1589 + check_intra( cpu_ref, cpu_new )
1590 + check_deblock( cpu_ref, cpu_new )
1591 + check_quant( cpu_ref, cpu_new )
1592- + check_cabac( cpu_ref, cpu_new );
1593+ + check_cabac( cpu_ref, cpu_new )
1594+ + check_bitstream( cpu_ref, cpu_new );
1595 }
1596
1597 static int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
1598--
15991.7.0.4
1600
1601
1602From 790c0bcb4d96894969ab3dab6df670eafcbbcd85 Mon Sep 17 00:00:00 2001
1603From: Jason Garrett-Glaser <darkshikari@gmail.com>
1604Date: Fri, 28 May 2010 14:30:07 -0700
1605Subject: [PATCH 07/10] Re-enable i8x8 merged SATD
1606 Accidentally got disabled when intra_sad_x3 was added.
1607
1608---
1609 encoder/encoder.c | 1 +
1610 1 files changed, 1 insertions(+), 0 deletions(-)
1611
1612diff --git a/encoder/encoder.c b/encoder/encoder.c
1613index 32db82a..2f9e7f6 100644
1614--- a/encoder/encoder.c
1615+++ b/encoder/encoder.c
1616@@ -810,6 +810,7 @@ static void mbcmp_init( x264_t *h )
1617 memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
1618 h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
1619 h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c;
1620+ h->pixf.intra_mbcmp_x3_8x8 = satd ? h->pixf.intra_sa8d_x3_8x8 : h->pixf.intra_sad_x3_8x8;
1621 h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4;
1622 satd &= h->param.analyse.i_me_method == X264_ME_TESA;
1623 memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
1624--
16251.7.0.4
1626
1627
1628From 6e549ed124a0a84d77c51baa39984fb36ab49123 Mon Sep 17 00:00:00 2001
1629From: Jason Garrett-Glaser <darkshikari@gmail.com>
1630Date: Fri, 28 May 2010 14:27:22 -0700
1631Subject: [PATCH 08/10] Add API tool to apply arbitrary quantizer offsets
1632 The calling application can now pass a "map" of quantizer offsets to apply to each frame.
1633 An optional callback to free the map can also be included.
1634 This allows all kinds of flexible region-of-interest coding and similar.
1635
1636---
1637 common/common.c | 2 +-
1638 encoder/encoder.c | 7 +++++--
1639 encoder/ratecontrol.c | 36 +++++++++++++++++++++++++-----------
1640 encoder/ratecontrol.h | 4 ++--
1641 x264.h | 20 +++++++++++++++++++-
1642 5 files changed, 52 insertions(+), 17 deletions(-)
1643
1644diff --git a/common/common.c b/common/common.c
1645index 2458f65..48e1bbc 100644
1646--- a/common/common.c
1647+++ b/common/common.c
1648@@ -998,6 +998,7 @@ static void x264_log_default( void *p_unused, int i_level, const char *psz_fmt,
1649 ****************************************************************************/
1650 int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
1651 {
1652+ memset( pic, 0, sizeof( x264_picture_t ) );
1653 pic->i_type = X264_TYPE_AUTO;
1654 pic->i_qpplus1 = 0;
1655 pic->img.i_csp = i_csp;
1656@@ -1010,7 +1011,6 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh
1657 pic->img.i_stride[0] = i_width;
1658 pic->img.i_stride[1] = i_width / 2;
1659 pic->img.i_stride[2] = i_width / 2;
1660- pic->param = NULL;
1661 pic->i_pic_struct = PIC_STRUCT_AUTO;
1662 return 0;
1663 }
1664diff --git a/encoder/encoder.c b/encoder/encoder.c
1665index 2f9e7f6..89107a3 100644
1666--- a/encoder/encoder.c
1667+++ b/encoder/encoder.c
1668@@ -2250,11 +2250,14 @@ int x264_encoder_encode( x264_t *h,
1669
1670 if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
1671 {
1672- if( x264_macroblock_tree_read( h, fenc ) )
1673+ if( x264_macroblock_tree_read( h, fenc, pic_in->prop.quant_offsets ) )
1674 return -1;
1675 }
1676 else
1677- x264_adaptive_quant_frame( h, fenc );
1678+ x264_adaptive_quant_frame( h, fenc, pic_in->prop.quant_offsets );
1679+
1680+ if( pic_in->prop.quant_offsets_free )
1681+ pic_in->prop.quant_offsets_free( pic_in->prop.quant_offsets );
1682
1683 if( h->frames.b_have_lowres )
1684 x264_frame_init_lowres( h, fenc );
1685diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
1686index bf0a400..d09de98 100644
1687--- a/encoder/ratecontrol.c
1688+++ b/encoder/ratecontrol.c
1689@@ -235,7 +235,7 @@ static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_
1690 return var;
1691 }
1692
1693-void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
1694+void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets )
1695 {
1696 /* constants chosen to result in approximately the same overall bitrate as without AQ.
1697 * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
1698@@ -256,11 +256,22 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
1699 /* Need to init it anyways for MB tree */
1700 if( h->param.rc.f_aq_strength == 0 )
1701 {
1702- memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
1703- memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
1704- if( h->frames.b_have_lowres )
1705+ if( quant_offsets )
1706+ {
1707 for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
1708- frame->i_inv_qscale_factor[mb_xy] = 256;
1709+ frame->f_qp_offset[mb_xy] = frame->f_qp_offset_aq[mb_xy] = quant_offsets[mb_xy];
1710+ if( h->frames.b_have_lowres )
1711+ for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
1712+ frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8( frame->f_qp_offset[mb_xy] );
1713+ }
1714+ else
1715+ {
1716+ memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
1717+ memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
1718+ if( h->frames.b_have_lowres )
1719+ for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
1720+ frame->i_inv_qscale_factor[mb_xy] = 256;
1721+ }
1722 }
1723 /* Need variance data for weighted prediction */
1724 if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
1725@@ -299,9 +310,10 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
1726 for( int mb_x = 0; mb_x < width; mb_x++ )
1727 {
1728 float qp_adj;
1729+ int mb_xy = mb_x + mb_y*h->mb.i_mb_stride;
1730 if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
1731 {
1732- qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
1733+ qp_adj = frame->f_qp_offset[mb_xy];
1734 qp_adj = strength * (qp_adj - avg_adj);
1735 }
1736 else
1737@@ -309,10 +321,12 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
1738 uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
1739 qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
1740 }
1741- frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
1742- frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
1743+ if( quant_offsets )
1744+ qp_adj += quant_offsets[mb_xy];
1745+ frame->f_qp_offset[mb_xy] =
1746+ frame->f_qp_offset_aq[mb_xy] = qp_adj;
1747 if( h->frames.b_have_lowres )
1748- frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
1749+ frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8(qp_adj);
1750 }
1751 }
1752
1753@@ -327,7 +341,7 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
1754 }
1755 }
1756
1757-int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
1758+int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets )
1759 {
1760 x264_ratecontrol_t *rc = h->rc;
1761 uint8_t i_type_actual = rc->entry[frame->i_frame].pict_type;
1762@@ -363,7 +377,7 @@ int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
1763 rc->qpbuf_pos--;
1764 }
1765 else
1766- x264_adaptive_quant_frame( h, frame );
1767+ x264_adaptive_quant_frame( h, frame, quant_offsets );
1768 return 0;
1769 fail:
1770 x264_log(h, X264_LOG_ERROR, "Incomplete MB-tree stats file.\n");
1771diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h
1772index e052b2a..dd139eb 100644
1773--- a/encoder/ratecontrol.h
1774+++ b/encoder/ratecontrol.h
1775@@ -29,8 +29,8 @@ void x264_ratecontrol_delete( x264_t * );
1776
1777 void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init );
1778
1779-void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame );
1780-int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame );
1781+void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets );
1782+int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets );
1783 int x264_reference_build_list_optimal( x264_t *h );
1784 void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
1785 void x264_ratecontrol_start( x264_t *, int i_force_qp, int overhead );
1786diff --git a/x264.h b/x264.h
1787index 95efd88..a4b3400 100644
1788--- a/x264.h
1789+++ b/x264.h
1790@@ -35,7 +35,7 @@
1791
1792 #include <stdarg.h>
1793
1794-#define X264_BUILD 96
1795+#define X264_BUILD 97
1796
1797 /* x264_t:
1798 * opaque handler for encoder */
1799@@ -508,6 +508,22 @@ typedef struct
1800
1801 typedef struct
1802 {
1803+ /* In: an array of quantizer offsets to be applied to this image during encoding.
1804+ * These are added on top of the decisions made by x264.
1805+ * Offsets can be fractional; they are added before QPs are rounded to integer.
1806+ * Adaptive quantization must be enabled to use this feature. Behavior if quant
1807+ * offsets differ between encoding passes is undefined.
1808+ *
1809+ * Array contains one offset per macroblock, in raster scan order. In interlaced
1810+ * mode, top-field MBs and bottom-field MBs are interleaved at the row level. */
1811+ float *quant_offsets;
1812+ /* In: optional callback to free quant_offsets when used.
1813+ * Useful if one wants to use a different quant_offset array for each frame. */
1814+ void (*quant_offsets_free)( void* );
1815+} x264_image_properties_t;
1816+
1817+typedef struct
1818+{
1819 /* In: force picture type (if not auto)
1820 * If x264 encoding parameters are violated in the forcing of picture types,
1821 * x264 will correct the input picture type and log a warning.
1822@@ -537,6 +553,8 @@ typedef struct
1823 x264_param_t *param;
1824 /* In: raw data */
1825 x264_image_t img;
1826+ /* In: optional information to modify encoder decisions for this frame */
1827+ x264_image_properties_t prop;
1828 /* Out: HRD timing information. Output only when i_nal_hrd is set. */
1829 x264_hrd_t hrd_timing;
1830 /* private user data. libx264 doesn't touch this,
1831--
18321.7.0.4
1833
1834
1835From ef05902684b7f2fdfcb07b900740b61248a097e1 Mon Sep 17 00:00:00 2001
1836From: Henrik Gramner <hengar-6@student.ltu.se>
1837Date: Thu, 27 May 2010 22:18:38 +0200
1838Subject: [PATCH 09/10] Optimize out some x264_scan8 reads
1839
1840---
1841 encoder/analyse.c | 15 ++++-----
1842 encoder/macroblock.c | 82 ++++++++++++++++++++++++++++++--------------------
1843 encoder/me.c | 25 ++++++++-------
1844 3 files changed, 70 insertions(+), 52 deletions(-)
1845
1846diff --git a/encoder/analyse.c b/encoder/analyse.c
1847index a128a70..9e85e89 100644
1848--- a/encoder/analyse.c
1849+++ b/encoder/analyse.c
1850@@ -907,8 +907,6 @@ static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
1851 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
1852 {
1853 uint8_t *p_dst = h->mb.pic.p_fdec[0];
1854-
1855- int x, y;
1856 uint64_t i_satd, i_best;
1857 h->mb.i_skip_intra = 0;
1858
1859@@ -1031,8 +1029,9 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
1860 int i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1861
1862 i_best = COST_MAX64;
1863- x = idx&1;
1864- y = idx>>1;
1865+ int x = idx&1;
1866+ int y = idx>>1;
1867+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
1868
1869 p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1870 predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
1871@@ -1061,8 +1060,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
1872 if( !(idx&1) )
1873 for( int j = 0; j < 7; j++ )
1874 pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1875- i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
1876- i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
1877+ i_nnz[0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8] );
1878+ i_nnz[1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8] );
1879 }
1880 }
1881 a->i_cbp_i8x8_luma = cbp_luma_new;
1882@@ -1070,8 +1069,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
1883 if( !(idx&1) )
1884 for( int j = 0; j < 7; j++ )
1885 p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1886- M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
1887- M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
1888+ M16( &h->mb.cache.non_zero_count[s8 + 0*8] ) = i_nnz[0];
1889+ M16( &h->mb.cache.non_zero_count[s8 + 1*8] ) = i_nnz[1];
1890
1891 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1892 }
1893diff --git a/encoder/macroblock.c b/encoder/macroblock.c
1894index 984f8a8..cdc4563 100644
1895--- a/encoder/macroblock.c
1896+++ b/encoder/macroblock.c
1897@@ -135,11 +135,12 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
1898 }
1899 }
1900
1901-#define STORE_8x8_NNZ(idx,nz)\
1902+#define STORE_8x8_NNZ( s8, nz )\
1903+do\
1904 {\
1905- M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] ) = nz * 0x0101;\
1906- M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] ) = nz * 0x0101;\
1907-}
1908+ M16( &h->mb.cache.non_zero_count[(s8) + 0*8] ) = (nz) * 0x0101;\
1909+ M16( &h->mb.cache.non_zero_count[(s8) + 1*8] ) = (nz) * 0x0101;\
1910+} while(0)
1911
1912 #define CLEAR_16x16_NNZ \
1913 {\
1914@@ -151,17 +152,18 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
1915
1916 void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
1917 {
1918- int x = 8 * (idx&1);
1919- int y = 8 * (idx>>1);
1920+ int x = idx&1;
1921+ int y = idx>>1;
1922+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
1923 int nz;
1924- uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
1925- uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
1926+ uint8_t *p_src = &h->mb.pic.p_fenc[0][8*x + 8*y*FENC_STRIDE];
1927+ uint8_t *p_dst = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
1928 ALIGNED_ARRAY_16( int16_t, dct8x8,[64] );
1929
1930 if( h->mb.b_lossless )
1931 {
1932 nz = h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
1933- STORE_8x8_NNZ(idx,nz);
1934+ STORE_8x8_NNZ( s8, nz );
1935 h->mb.i_cbp_luma |= nz<<idx;
1936 return;
1937 }
1938@@ -175,10 +177,10 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
1939 h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
1940 h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
1941 h->dctf.add8x8_idct8( p_dst, dct8x8 );
1942- STORE_8x8_NNZ(idx,1);
1943+ STORE_8x8_NNZ( s8, 1 );
1944 }
1945 else
1946- STORE_8x8_NNZ(idx,0);
1947+ STORE_8x8_NNZ( s8, 0 );
1948 }
1949
1950 static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
1951@@ -728,12 +730,13 @@ void x264_macroblock_encode( x264_t *h )
1952 if( h->mb.b_transform_8x8 )
1953 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
1954 {
1955- int x = 8*(i8x8&1);
1956- int y = 8*(i8x8>>1);
1957- nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8],
1958- h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
1959- h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
1960- STORE_8x8_NNZ(i8x8,nz);
1961+ int x = i8x8&1;
1962+ int y = i8x8>>1;
1963+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
1964+
1965+ nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8], h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE,
1966+ h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE );
1967+ STORE_8x8_NNZ( s8, nz );
1968 h->mb.i_cbp_luma |= nz << i8x8;
1969 }
1970 else
1971@@ -783,14 +786,18 @@ void x264_macroblock_encode( x264_t *h )
1972 {
1973 for( int idx = 0; idx < 4; idx++ )
1974 {
1975+ int x = idx&1;
1976+ int y = idx>>1;
1977+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
1978+
1979 if( h->mb.i_cbp_luma&(1<<idx) )
1980 {
1981 h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
1982- h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*FDEC_STRIDE], dct8x8[idx] );
1983- STORE_8x8_NNZ(idx,1);
1984+ h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], dct8x8[idx] );
1985+ STORE_8x8_NNZ( s8, 1 );
1986 }
1987 else
1988- STORE_8x8_NNZ(idx,0);
1989+ STORE_8x8_NNZ( s8, 0 );
1990 }
1991 }
1992 }
1993@@ -825,18 +832,24 @@ void x264_macroblock_encode( x264_t *h )
1994 }
1995 }
1996
1997+ int x = i8x8&1;
1998+ int y = i8x8>>1;
1999+
2000 /* decimate this 8x8 block */
2001 i_decimate_mb += i_decimate_8x8;
2002 if( b_decimate )
2003 {
2004 if( i_decimate_8x8 < 4 )
2005- STORE_8x8_NNZ(i8x8,0)
2006+ {
2007+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
2008+ STORE_8x8_NNZ( s8, 0 );
2009+ }
2010 else
2011 h->mb.i_cbp_luma |= 1<<i8x8;
2012 }
2013 else if( cbp )
2014 {
2015- h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
2016+ h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], &dct4x4[i8x8*4] );
2017 h->mb.i_cbp_luma |= 1<<i8x8;
2018 }
2019 }
2020@@ -1045,8 +1058,11 @@ void x264_noise_reduction_update( x264_t *h )
2021 void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
2022 {
2023 int i_qp = h->mb.i_qp;
2024- uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE;
2025- uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
2026+ int x = i8&1;
2027+ int y = i8>>1;
2028+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
2029+ uint8_t *p_fenc = h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE;
2030+ uint8_t *p_fdec = h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE;
2031 int b_decimate = h->mb.b_dct_decimate;
2032 int nnz8x8 = 0;
2033 int nz;
2034@@ -1059,7 +1075,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
2035 if( h->mb.b_transform_8x8 )
2036 {
2037 nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
2038- STORE_8x8_NNZ(i8,nnz8x8);
2039+ STORE_8x8_NNZ( s8, nnz8x8 );
2040 }
2041 else
2042 {
2043@@ -1075,8 +1091,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
2044 for( int ch = 0; ch < 2; ch++ )
2045 {
2046 int16_t dc;
2047- p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
2048- p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
2049+ p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
2050+ p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
2051 nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec, &dc );
2052 h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
2053 }
2054@@ -1099,13 +1115,13 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
2055 {
2056 h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
2057 h->dctf.add8x8_idct8( p_fdec, dct8x8 );
2058- STORE_8x8_NNZ(i8,1);
2059+ STORE_8x8_NNZ( s8, 1 );
2060 }
2061 else
2062- STORE_8x8_NNZ(i8,0);
2063+ STORE_8x8_NNZ( s8, 0 );
2064 }
2065 else
2066- STORE_8x8_NNZ(i8,0);
2067+ STORE_8x8_NNZ( s8, 0 );
2068 }
2069 else
2070 {
2071@@ -1132,7 +1148,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
2072 if( nnz8x8 )
2073 h->dctf.add8x8_idct( p_fdec, dct4x4 );
2074 else
2075- STORE_8x8_NNZ(i8,0);
2076+ STORE_8x8_NNZ( s8, 0 );
2077 }
2078
2079 i_qp = h->mb.i_chroma_qp;
2080@@ -1140,8 +1156,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
2081 for( int ch = 0; ch < 2; ch++ )
2082 {
2083 ALIGNED_ARRAY_16( int16_t, dct4x4,[16] );
2084- p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
2085- p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
2086+ p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
2087+ p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
2088
2089 h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
2090 dct4x4[0] = 0;
2091diff --git a/encoder/me.c b/encoder/me.c
2092index 77073cc..40d0650 100644
2093--- a/encoder/me.c
2094+++ b/encoder/me.c
2095@@ -937,8 +937,11 @@ int x264_iter_kludge = 0;
2096
2097 static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2, int rd )
2098 {
2099- int16_t *cache0_mv = h->mb.cache.mv[0][x264_scan8[i8*4]];
2100- int16_t *cache1_mv = h->mb.cache.mv[1][x264_scan8[i8*4]];
2101+ int x = i8&1;
2102+ int y = i8>>1;
2103+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
2104+ int16_t *cache0_mv = h->mb.cache.mv[0][s8];
2105+ int16_t *cache1_mv = h->mb.cache.mv[1][s8];
2106 const int i_pixel = m0->i_pixel;
2107 const int bw = x264_pixel_size[i_pixel].w;
2108 const int bh = x264_pixel_size[i_pixel].h;
2109@@ -946,11 +949,11 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
2110 ALIGNED_ARRAY_8( uint8_t, pixu_buf,[2],[9][8*8] );
2111 ALIGNED_ARRAY_8( uint8_t, pixv_buf,[2],[9][8*8] );
2112 uint8_t *src[2][9];
2113- uint8_t *pix = &h->mb.pic.p_fdec[0][(i8>>1)*8*FDEC_STRIDE+(i8&1)*8];
2114- uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
2115- uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
2116- const int ref0 = h->mb.cache.ref[0][x264_scan8[i8*4]];
2117- const int ref1 = h->mb.cache.ref[1][x264_scan8[i8*4]];
2118+ uint8_t *pix = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
2119+ uint8_t *pixu = &h->mb.pic.p_fdec[1][4*x + 4*y*FDEC_STRIDE];
2120+ uint8_t *pixv = &h->mb.pic.p_fdec[2][4*x + 4*y*FDEC_STRIDE];
2121+ int ref0 = h->mb.cache.ref[0][s8];
2122+ int ref1 = h->mb.cache.ref[1][s8];
2123 const int mv0y_offset = h->mb.b_interlaced & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
2124 const int mv1y_offset = h->mb.b_interlaced & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
2125 int stride[2][9];
2126@@ -1058,13 +1061,13 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
2127
2128 if( rd )
2129 {
2130- x264_macroblock_cache_mv ( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) );
2131+ x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) );
2132 amvd = pack8to16( X264_MIN(abs(bm0x - m0->mvp[0]),33), X264_MIN(abs(bm0y - m0->mvp[1]),33) );
2133- x264_macroblock_cache_mvd( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 0, amvd );
2134+ x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 0, amvd );
2135
2136- x264_macroblock_cache_mv ( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) );
2137+ x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) );
2138 amvd = pack8to16( X264_MIN(abs(bm1x - m1->mvp[0]),33), X264_MIN(abs(bm1y - m1->mvp[1]),33) );
2139- x264_macroblock_cache_mvd( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 1, amvd );
2140+ x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 1, amvd );
2141 }
2142
2143 m0->mv[0] = bm0x;
2144--
21451.7.0.4
2146
2147
2148From c949405e834a2cbe35f3fb460eae061447dc386b Mon Sep 17 00:00:00 2001
2149From: Henrik Gramner <hengar-6@student.ltu.se>
2150Date: Sun, 30 May 2010 22:45:14 +0200
2151Subject: [PATCH 10/10] Some deblocking-related optimizations
2152
2153---
2154 common/deblock.c | 8 ++++----
2155 common/macroblock.c | 43 +++++++++++++++++++++++--------------------
2156 2 files changed, 27 insertions(+), 24 deletions(-)
2157
2158diff --git a/common/deblock.c b/common/deblock.c
2159index 27c73ae..3296dbf 100644
2160--- a/common/deblock.c
2161+++ b/common/deblock.c
2162@@ -299,7 +299,7 @@ static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2,
2163 void x264_frame_deblock_row( x264_t *h, int mb_y )
2164 {
2165 int b_interlaced = h->sh.b_mbaff;
2166- int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
2167+ int qp_thresh = 15 - X264_MIN( h->sh.i_alpha_c0_offset, h->sh.i_beta_offset ) - X264_MAX( 0, h->param.analyse.i_chroma_qp_offset );
2168 int stridey = h->fdec->i_stride[0];
2169 int stride2y = stridey << b_interlaced;
2170 int strideuv = h->fdec->i_stride[1];
2171@@ -318,7 +318,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
2172 uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
2173 uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
2174 uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
2175- if( b_interlaced && (mb_y&1) )
2176+ if( mb_y & b_interlaced )
2177 {
2178 pixy -= 15*stridey;
2179 pixu -= 7*strideuv;
2180@@ -366,12 +366,12 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
2181 int qp_top = (qp + qpt + 1) >> 1;
2182 int qpc_top = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpt] + 1) >> 1;
2183 int intra_top = IS_INTRA( h->mb.type[h->mb.i_mb_top_xy] );
2184- if( !b_interlaced && (intra_cur || intra_top) )
2185+ if( ~b_interlaced & (intra_cur | intra_top) )
2186 FILTER( _intra, 1, 0, qp_top, qpc_top );
2187 else
2188 {
2189 if( intra_top )
2190- memset( bs[1][0], 3, sizeof(bs[1][0]) );
2191+ M32( bs[1][0] ) = 0x03030303;
2192 FILTER( , 1, 0, qp_top, qpc_top );
2193 }
2194 }
2195diff --git a/common/macroblock.c b/common/macroblock.c
2196index 1b2d37b..7180e8f 100644
2197--- a/common/macroblock.c
2198+++ b/common/macroblock.c
2199@@ -400,9 +400,27 @@ void x264_macroblock_slice_init( x264_t *h )
2200 }
2201 }
2202 }
2203- if( h->sh.i_type == SLICE_TYPE_P )
2204+ else if( h->sh.i_type == SLICE_TYPE_P )
2205+ {
2206 memset( h->mb.cache.skip, 0, sizeof( h->mb.cache.skip ) );
2207
2208+ if( h->sh.i_disable_deblocking_filter_idc != 1 && h->param.analyse.i_weighted_pred )
2209+ {
2210+ deblock_ref_table(-2) = -2;
2211+ deblock_ref_table(-1) = -1;
2212+ for( int i = 0; i < h->i_ref0 << h->sh.b_mbaff; i++ )
2213+ {
2214+ /* Mask off high bits to avoid frame num collisions with -1/-2.
2215+ * In current x264 frame num values don't cover a range of more
2216+ * than 32, so 6 bits is enough for uniqueness. */
2217+ if( !h->mb.b_interlaced )
2218+ deblock_ref_table(i) = h->fref0[i]->i_frame_num&63;
2219+ else
2220+ deblock_ref_table(i) = ((h->fref0[i>>1]->i_frame_num&63)<<1) + (i&1);
2221+ }
2222+ }
2223+ }
2224+
2225 /* init with not available (for top right idx=7,15) */
2226 memset( h->mb.cache.ref, -2, sizeof( h->mb.cache.ref ) );
2227
2228@@ -418,19 +436,6 @@ void x264_macroblock_slice_init( x264_t *h )
2229 h->fdec->inv_ref_poc[field] = (256 + delta/2) / delta;
2230 }
2231
2232- deblock_ref_table(-2) = -2;
2233- deblock_ref_table(-1) = -1;
2234- for( int i = 0; i < h->i_ref0 << h->sh.b_mbaff; i++ )
2235- {
2236- /* Mask off high bits to avoid frame num collisions with -1/-2.
2237- * In current x264 frame num values don't cover a range of more
2238- * than 32, so 6 bits is enough for uniqueness. */
2239- if( !h->mb.b_interlaced )
2240- deblock_ref_table(i) = h->fref0[i]->i_frame_num&63;
2241- else
2242- deblock_ref_table(i) = ((h->fref0[i>>1]->i_frame_num&63)<<1) + (i&1);
2243- }
2244-
2245 h->mb.i_neighbour4[6] =
2246 h->mb.i_neighbour4[9] =
2247 h->mb.i_neighbour4[12] =
2248@@ -894,7 +899,6 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
2249 void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
2250 {
2251 int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
2252- int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
2253
2254 h->mb.i_neighbour = 0;
2255 h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
2256@@ -906,9 +910,9 @@ void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_
2257 h->mb.i_neighbour |= MB_LEFT;
2258 }
2259
2260- if( top >= 0 )
2261+ if( mb_y > h->mb.b_interlaced )
2262 {
2263- h->mb.i_mb_top_xy = top;
2264+ h->mb.i_mb_top_xy = h->mb.i_mb_xy - (h->mb.i_mb_stride << h->mb.b_interlaced);
2265 if( deblock_on_slice_edges || h->mb.slice_table[h->mb.i_mb_top_xy] == h->mb.slice_table[h->mb.i_mb_xy] )
2266 h->mb.i_neighbour |= MB_TOP;
2267 }
2268@@ -930,8 +934,6 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
2269 h->mb.i_neighbour &= ~old_neighbour;
2270 if( h->mb.i_neighbour )
2271 {
2272- int left = h->mb.i_mb_left_xy;
2273- int top = h->mb.i_mb_top_xy;
2274 int top_y = mb_y - (1 << h->mb.b_interlaced);
2275 int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
2276 int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
2277@@ -941,10 +943,11 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
2278 uint8_t (*nnz)[24] = h->mb.non_zero_count;
2279
2280 if( h->mb.i_neighbour & MB_TOP )
2281- CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[top][12] );
2282+ CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[h->mb.i_mb_top_xy][12] );
2283
2284 if( h->mb.i_neighbour & MB_LEFT )
2285 {
2286+ int left = h->mb.i_mb_left_xy;
2287 h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
2288 h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
2289 h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
2290--
22911.7.0.4