1From 2bcbac357b714f468e0138f022e584ffdb42f6d2 Mon Sep 17 00:00:00 2001
2From: Jason Garrett-Glaser <darkshikari@gmail.com>
3Date: Mon, 31 May 2010 11:14:22 -0700
4Subject: [PATCH 01/11] Fix cavlc+deblock+8x8dct (regression in r1612)
5 Add cavlc+8x8dct munging to new deblock system.
6 May have caused minor visual artifacts.
7
8---
9 common/deblock.c | 47 -----------------------------------------------
10 common/macroblock.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
11 2 files changed, 44 insertions(+), 49 deletions(-)
12
13diff --git a/common/deblock.c b/common/deblock.c
14index fc039c5..27c73ae 100644
15--- a/common/deblock.c
16+++ b/common/deblock.c
17@@ -24,46 +24,6 @@
18
19 #include "common.h"
20
21-/* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
22- * entropy coding, but per 64 coeffs for the purpose of deblocking */
23-static void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
24-{
25- uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
26- int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
27- for( int x = 0; x<h->sps->i_mb_width; x++ )
28- {
29- memcpy( buf+x, src+x, 16 );
30- if( transform[x] )
31- {
32- int nnz = src[x][0] | src[x][1];
33- src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
34- nnz = src[x][2] | src[x][3];
35- src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
36- }
37- }
38-}
39-
40-static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
41-{
42- uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
43- for( int x = 0; x < h->sps->i_mb_width; x++ )
44- memcpy( dst+x, buf+x, 16 );
45-}
46-
47-static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
48-{
49- func( h, mb_y, buf );
50- if( mb_y > 0 )
51- func( h, mb_y-1, buf + h->sps->i_mb_width );
52- if( h->sh.b_mbaff )
53- {
54- func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
55- if( mb_y > 0 )
56- func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
57- }
58-}
59-
60-
61 /* Deblocking filter */
62 static const uint8_t i_alpha_table[52+12*2] =
63 {
64@@ -344,10 +304,6 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
65 int stride2y = stridey << b_interlaced;
66 int strideuv = h->fdec->i_stride[1];
67 int stride2uv = strideuv << b_interlaced;
68- uint8_t (*nnz_backup)[16] = h->scratch_buffer;
69-
70- if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
71- munge_cavlc_nnz( h, mb_y, nnz_backup, munge_cavlc_nnz_row );
72
73 for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
74 {
75@@ -427,9 +383,6 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
76 if( !transform_8x8 ) FILTER( , 1, 3, qp, qpc );
77 }
78 }
79-
80- if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
81- munge_cavlc_nnz( h, mb_y, nnz_backup, restore_cavlc_nnz_row );
82 }
83
84 #ifdef HAVE_MMX
85diff --git a/common/macroblock.c b/common/macroblock.c
86index ce510e9..01c90d2 100644
87--- a/common/macroblock.c
88+++ b/common/macroblock.c
89@@ -344,8 +344,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
90 int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
91 int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
92 ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
93- int buf_nnz = !h->param.b_cabac * h->pps->b_transform_8x8_mode * (h->sps->i_mb_width * 4 * 16 * sizeof(uint8_t));
94- scratch_size = X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_nnz );
95+ scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
96 }
97 int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
98 scratch_size = X264_MAX( scratch_size, buf_mbtree );
99@@ -1013,6 +1012,49 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
100 M32( &h->mb.cache.ref[0][x264_scan8[0]+8*2] ) = refbot;
101 M32( &h->mb.cache.ref[0][x264_scan8[0]+8*3] ) = refbot;
102 }
103+
104+ /* Munge NNZ for cavlc + 8x8dct */
105+ if( !h->param.b_cabac && h->pps->b_transform_8x8_mode )
106+ {
107+ uint8_t (*nnz)[24] = h->mb.non_zero_count;
108+ int top = h->mb.i_mb_top_xy;
109+ int left = h->mb.i_mb_left_xy;
110+
111+ if( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top] )
112+ {
113+ int i8 = x264_scan8[0] - 8;
114+ int nnz_top0 = M16( &nnz[top][8] ) | M16( &nnz[top][12] );
115+ int nnz_top1 = M16( &nnz[top][10] ) | M16( &nnz[top][14] );
116+ M16( &h->mb.cache.non_zero_count[i8+0] ) = nnz_top0 ? 0x0101 : 0;
117+ M16( &h->mb.cache.non_zero_count[i8+2] ) = nnz_top1 ? 0x0101 : 0;
118+ }
119+
120+ if( (h->mb.i_neighbour & MB_LEFT) && h->mb.mb_transform_size[left] )
121+ {
122+ int i8 = x264_scan8[0] - 1;
123+ int nnz_left0 = M16( &nnz[left][2] ) | M16( &nnz[left][6] );
124+ int nnz_left1 = M16( &nnz[left][10] ) | M16( &nnz[left][14] );
125+ h->mb.cache.non_zero_count[i8+8*0] = !!nnz_left0;
126+ h->mb.cache.non_zero_count[i8+8*1] = !!nnz_left0;
127+ h->mb.cache.non_zero_count[i8+8*2] = !!nnz_left1;
128+ h->mb.cache.non_zero_count[i8+8*3] = !!nnz_left1;
129+ }
130+
131+ if( h->mb.mb_transform_size[h->mb.i_mb_xy] )
132+ {
133+ int nnz0 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
134+ int nnz1 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 4]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 6]] );
135+ int nnz2 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[10]] );
136+ int nnz3 = M16( &h->mb.cache.non_zero_count[x264_scan8[12]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[14]] );
137+ uint32_t nnztop = pack16to32( !!nnz0, !!nnz1 ) * 0x0101;
138+ uint32_t nnzbot = pack16to32( !!nnz2, !!nnz3 ) * 0x0101;
139+
140+ M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*0] ) = nnztop;
141+ M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*1] ) = nnztop;
142+ M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*2] ) = nnzbot;
143+ M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*3] ) = nnzbot;
144+ }
145+ }
146 }
147
148 static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i )
149--
1501.7.0.4
151
152
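A note on the nnz bookkeeping the patch above touches: with CAVLC and the 8x8 transform, non-zero-coefficient counts are stored per 4x4 block for entropy coding, but the deblocker only cares whether the covering 8x8 block has any coefficients at all. The old munge_cavlc_nnz_row/restore_cavlc_nnz_row pair rewrote whole rows of the frame-wide array and copied them back; the new code derives the same values per macroblock while filling the deblock cache. The following is a minimal standalone sketch of that reduction, with an illustrative raster layout rather than x264's actual arrays:

#include <stdio.h>
#include <stdint.h>

/* nnz4x4: 16 per-4x4 non-zero coefficient counts of one macroblock, raster order.
 * out:    the "deblock view": every 4x4 position inside an 8x8 block becomes 1
 *         if that 8x8 block has any non-zero coefficient, else 0. */
static void nnz_deblock_view( const uint8_t nnz4x4[16], uint8_t out[16] )
{
    for( int b8 = 0; b8 < 4; b8++ )
    {
        int x = (b8&1)*2, y = (b8>>1)*2;
        int any = nnz4x4[y*4+x]     | nnz4x4[y*4+x+1]
                | nnz4x4[(y+1)*4+x] | nnz4x4[(y+1)*4+x+1];
        uint8_t v = any ? 1 : 0;
        out[y*4+x] = out[y*4+x+1] = out[(y+1)*4+x] = out[(y+1)*4+x+1] = v;
    }
}

int main(void)
{
    /* One coefficient in the top-right 8x8 and one in the bottom-left 8x8. */
    const uint8_t nnz[16] = { 0,0,0,3, 0,0,0,0, 1,0,0,0, 0,0,0,0 };
    uint8_t view[16];
    nnz_deblock_view( nnz, view );
    for( int i = 0; i < 16; i++ )
        printf( "%d%c", view[i], (i&3) == 3 ? '\n' : ' ' );
    return 0;
}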
153From d51fde592507649e22757a23f0ea0252ec35b5b6 Mon Sep 17 00:00:00 2001
154From: Anton Mitrofanov <BugMaster@narod.ru>
155Date: Mon, 31 May 2010 22:36:50 +0400
156Subject: [PATCH 02/11] Fix crash with MP4-muxing if zero frames were encoded
157
158---
159 output/mp4.c | 3 ++-
160 1 files changed, 2 insertions(+), 1 deletions(-)
161
162diff --git a/output/mp4.c b/output/mp4.c
163index f76541e..0aa5070 100644
164--- a/output/mp4.c
165+++ b/output/mp4.c
166@@ -112,6 +112,7 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
167 if( p_mp4->p_sample->data )
168 free( p_mp4->p_sample->data );
169
170+ p_mp4->p_sample->dataLength = 0;
171 gf_isom_sample_del( &p_mp4->p_sample );
172 }
173
174@@ -135,7 +136,7 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
175 * The reason is that an Edit Box maps the presentation time-line to the media time-line.
176 * Any demuxers should follow the Edit Box if it exists. */
177 GF_ISOSample *sample = gf_isom_get_sample_info( p_mp4->p_file, p_mp4->i_track, 1, NULL, NULL );
178- if( sample->CTS_Offset > 0 )
179+ if( sample && sample->CTS_Offset > 0 )
180 {
181 uint32_t mvhd_timescale = gf_isom_get_timescale( p_mp4->p_file );
182 uint64_t tkhd_duration = (uint64_t)( mdhd_duration * ( (double)mvhd_timescale / p_mp4->i_time_res ) );
183--
1841.7.0.4
185
186
187From 8098997dcba2602b22b43fdf26621d08d3f81333 Mon Sep 17 00:00:00 2001
188From: Jason Garrett-Glaser <darkshikari@gmail.com>
189Date: Sun, 30 May 2010 09:42:53 -0700
190Subject: [PATCH 03/11] Fix ultrafast to actually turn off weightb
191
192---
193 common/common.c | 1 +
194 1 files changed, 1 insertions(+), 0 deletions(-)
195
196diff --git a/common/common.c b/common/common.c
197index 62bef99..fccf2b0 100644
198--- a/common/common.c
199+++ b/common/common.c
200@@ -183,6 +183,7 @@ static int x264_param_apply_preset( x264_param_t *param, const char *preset )
201 param->i_bframe_adaptive = X264_B_ADAPT_NONE;
202 param->rc.b_mb_tree = 0;
203 param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
204+ param->analyse.b_weighted_bipred = 0;
205 }
206 else if( !strcasecmp( preset, "superfast" ) )
207 {
208--
2091.7.0.4
210
211
212From a7f870990af39a11f3bb883b9335baad91909ccb Mon Sep 17 00:00:00 2001
213From: Jason Garrett-Glaser <darkshikari@gmail.com>
214Date: Thu, 27 May 2010 12:31:41 -0700
215Subject: [PATCH 04/11] Fix omission in libx264 tuning documentation
216
217---
218 x264.h | 2 +-
219 1 files changed, 1 insertions(+), 1 deletions(-)
220
221diff --git a/x264.h b/x264.h
222index 6d7b703..95efd88 100644
223--- a/x264.h
224+++ b/x264.h
225@@ -446,7 +446,7 @@ static const char * const x264_tune_names[] = { "film", "animation", "grain", "s
226
227 /* Multiple tunings can be used if separated by a delimiter in ",./-+",
228 * however multiple psy tunings cannot be used.
229- * film, animation, grain, psnr, and ssim are psy tunings.
230+ * film, animation, grain, stillimage, psnr, and ssim are psy tunings.
231 *
232 * returns 0 on success, negative on failure (e.g. invalid preset/tune name). */
233 int x264_param_default_preset( x264_param_t *, const char *preset, const char *tune );
234--
2351.7.0.4
236
237
238From 5832bdfaed3bcce1b2823b6594386e0357d8ff31 Mon Sep 17 00:00:00 2001
239From: Jason Garrett-Glaser <darkshikari@gmail.com>
240Date: Wed, 26 May 2010 12:55:35 -0700
241Subject: [PATCH 05/11] Merge some of adaptive quant and weightp
242 Eliminate redundant work; both of them were calculating variance of the frame.
243
244---
245 common/frame.h | 4 +-
246 encoder/analyse.h | 1 -
247 encoder/encoder.c | 12 ++---
248 encoder/ratecontrol.c | 124 +++++++++++++++++++++++++++++++-----------------
249 encoder/slicetype.c | 31 ++----------
250 5 files changed, 92 insertions(+), 80 deletions(-)
251
252diff --git a/common/frame.h b/common/frame.h
253index 91d27b5..ca5cb7a 100644
254--- a/common/frame.h
255+++ b/common/frame.h
256@@ -118,8 +118,8 @@ typedef struct x264_frame
257 uint16_t *i_inv_qscale_factor;
258 int b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */
259 float f_weighted_cost_delta[X264_BFRAME_MAX+2];
260- uint32_t i_pixel_sum;
261- uint64_t i_pixel_ssd;
262+ uint32_t i_pixel_sum[3];
263+ uint64_t i_pixel_ssd[3];
264
265 /* hrd */
266 x264_hrd_t hrd_timing;
267diff --git a/encoder/analyse.h b/encoder/analyse.h
268index 7c2c22c..53e4c2e 100644
269--- a/encoder/analyse.h
270+++ b/encoder/analyse.h
271@@ -33,7 +33,6 @@ void x264_slicetype_decide( x264_t *h );
272 void x264_slicetype_analyse( x264_t *h, int keyframe );
273
274 int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
275-void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame );
276
277 int x264_lookahead_init( x264_t *h, int i_slicetype_length );
278 int x264_lookahead_is_empty( x264_t *h );
279diff --git a/encoder/encoder.c b/encoder/encoder.c
280index 52017ff..6e0dc54 100644
281--- a/encoder/encoder.c
282+++ b/encoder/encoder.c
283@@ -2246,21 +2246,17 @@ int x264_encoder_encode( x264_t *h,
284 fenc->i_pic_struct = PIC_STRUCT_PROGRESSIVE;
285 }
286
287- if( h->frames.b_have_lowres )
288- {
289- if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
290- x264_weight_plane_analyse( h, fenc );
291- x264_frame_init_lowres( h, fenc );
292- }
293-
294 if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
295 {
296 if( x264_macroblock_tree_read( h, fenc ) )
297 return -1;
298 }
299- else if( h->param.rc.i_aq_mode )
300+ else
301 x264_adaptive_quant_frame( h, fenc );
302
303+ if( h->frames.b_have_lowres )
304+ x264_frame_init_lowres( h, fenc );
305+
306 /* 2: Place the frame into the queue for its slice type decision */
307 x264_lookahead_put_frame( h, fenc );
308
309diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
310index a725a24..bf0a400 100644
311--- a/encoder/ratecontrol.c
312+++ b/encoder/ratecontrol.c
313@@ -215,12 +215,14 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
314 stride <<= h->mb.b_interlaced;
315 uint64_t res = h->pixf.var[pix]( frame->plane[i] + offset, stride );
316 uint32_t sum = (uint32_t)res;
317- uint32_t sqr = res >> 32;
318- return sqr - (sum * sum >> shift);
319+ uint32_t ssd = res >> 32;
320+ frame->i_pixel_sum[i] += sum;
321+ frame->i_pixel_ssd[i] += ssd;
322+ return ssd - (sum * sum >> shift);
323 }
324
325 // Find the total AC energy of the block in all planes.
326-static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
327+static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
328 {
329 /* This function contains annoying hacks because GCC has a habit of reordering emms
330 * and putting it after floating point ops. As a result, we put the emms at the end of the
331@@ -239,56 +241,90 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
332 * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
333 float strength;
334 float avg_adj = 0.f;
335- /* Need to init it anyways for MB tree. */
336- if( h->param.rc.f_aq_strength == 0 )
337- {
338- memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
339- memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
340- if( h->frames.b_have_lowres )
341- for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
342- frame->i_inv_qscale_factor[mb_xy] = 256;
343- return;
344+ int width = h->sps->i_mb_width;
345+ int height = h->sps->i_mb_height;
346+ /* Initialize frame stats */
347+ for( int i = 0; i < 3; i++ )
348+ {
349+ frame->i_pixel_sum[i] = 0;
350+ frame->i_pixel_ssd[i] = 0;
351 }
352
353- if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
354+ /* Degenerate cases */
355+ if( h->param.rc.i_aq_mode == X264_AQ_NONE || h->param.rc.f_aq_strength == 0 )
356 {
357- float avg_adj_pow2 = 0.f;
358- for( int mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
359- for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
360- {
361- uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
362- float qp_adj = powf( energy + 1, 0.125f );
363- frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
364- avg_adj += qp_adj;
365- avg_adj_pow2 += qp_adj * qp_adj;
366- }
367- avg_adj /= h->mb.i_mb_count;
368- avg_adj_pow2 /= h->mb.i_mb_count;
369- strength = h->param.rc.f_aq_strength * avg_adj;
370- avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
371+ /* Need to init it anyways for MB tree */
372+ if( h->param.rc.f_aq_strength == 0 )
373+ {
374+ memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
375+ memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
376+ if( h->frames.b_have_lowres )
377+ for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
378+ frame->i_inv_qscale_factor[mb_xy] = 256;
379+ }
380+ /* Need variance data for weighted prediction */
381+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
382+ {
383+ for( int mb_y = 0; mb_y < height; mb_y++ )
384+ for( int mb_x = 0; mb_x < width; mb_x++ )
385+ x264_ac_energy_mb( h, mb_x, mb_y, frame );
386+ }
387+ else
388+ return;
389 }
390+ /* Actual adaptive quantization */
391 else
392- strength = h->param.rc.f_aq_strength * 1.0397f;
393-
394- for( int mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
395- for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
396+ {
397+ if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
398 {
399- float qp_adj;
400- if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
401- {
402- qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
403- qp_adj = strength * (qp_adj - avg_adj);
404- }
405- else
406+ float avg_adj_pow2 = 0.f;
407+ for( int mb_y = 0; mb_y < height; mb_y++ )
408+ for( int mb_x = 0; mb_x < width; mb_x++ )
409+ {
410+ uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
411+ float qp_adj = powf( energy + 1, 0.125f );
412+ frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
413+ avg_adj += qp_adj;
414+ avg_adj_pow2 += qp_adj * qp_adj;
415+ }
416+ avg_adj /= h->mb.i_mb_count;
417+ avg_adj_pow2 /= h->mb.i_mb_count;
418+ strength = h->param.rc.f_aq_strength * avg_adj;
419+ avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
420+ }
421+ else
422+ strength = h->param.rc.f_aq_strength * 1.0397f;
423+
424+ for( int mb_y = 0; mb_y < height; mb_y++ )
425+ for( int mb_x = 0; mb_x < width; mb_x++ )
426 {
427- uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
428- qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
429+ float qp_adj;
430+ if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
431+ {
432+ qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
433+ qp_adj = strength * (qp_adj - avg_adj);
434+ }
435+ else
436+ {
437+ uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
438+ qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
439+ }
440+ frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
441+ frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
442+ if( h->frames.b_have_lowres )
443+ frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
444 }
445- frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
446- frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
447- if( h->frames.b_have_lowres )
448- frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
449- }
450+ }
451+
452+ /* Remove mean from SSD calculation */
453+ for( int i = 0; i < 3; i++ )
454+ {
455+ uint64_t ssd = frame->i_pixel_ssd[i];
456+ uint64_t sum = frame->i_pixel_sum[i];
457+ int w = width*16>>!!i;
458+ int h = height*16>>!!i;
459+ frame->i_pixel_ssd[i] = ssd - (sum * sum + w * h / 2) / (w * h);
460+ }
461 }
462
463 int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
464diff --git a/encoder/slicetype.c b/encoder/slicetype.c
465index 9352367..e454e12 100644
466--- a/encoder/slicetype.c
467+++ b/encoder/slicetype.c
468@@ -67,25 +67,6 @@ static void x264_weight_get_h264( unsigned int weight_nonh264, int offset, x264_
469 w->i_scale = X264_MIN( w->i_scale, 127 );
470 }
471
472-void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame )
473-{
474- uint32_t sad = 0;
475- uint64_t ssd = 0;
476- uint8_t *p = frame->plane[0];
477- int stride = frame->i_stride[0];
478- int width = frame->i_width[0];
479- int height = frame->i_lines[0];
480- for( int y = 0; y < height>>4; y++, p += stride*16 )
481- for( int x = 0; x < width; x += 16 )
482- {
483- uint64_t res = h->pixf.var[PIXEL_16x16]( p + x, stride );
484- sad += (uint32_t)res;
485- ssd += res >> 32;
486- }
487- frame->i_pixel_sum = sad;
488- frame->i_pixel_ssd = ssd - ((uint64_t)sad * sad + width * height / 2) / (width * height);
489-}
490-
491 static NOINLINE uint8_t *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, uint8_t *dest )
492 {
493 int ref0_distance = fenc->i_frame - ref->i_frame - 1;
494@@ -167,10 +148,10 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
495 int found;
496 x264_weight_t *weights = fenc->weight[0];
497
498- fenc_var = round( sqrt( fenc->i_pixel_ssd ) );
499- ref_var = round( sqrt( ref->i_pixel_ssd ) );
500- fenc_mean = (float)fenc->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
501- ref_mean = (float) ref->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
502+ fenc_var = round( sqrt( fenc->i_pixel_ssd[0] ) );
503+ ref_var = round( sqrt( ref->i_pixel_ssd[0] ) );
504+ fenc_mean = (float)fenc->i_pixel_sum[0] / (fenc->i_lines[0] * fenc->i_width[0]);
505+ ref_mean = (float) ref->i_pixel_sum[0] / (fenc->i_lines[0] * fenc->i_width[0]);
506
507 //early termination
508 if( fabs( ref_mean - fenc_mean ) < 0.5 && fabs( 1 - fenc_var / ref_var ) < epsilon )
509@@ -534,8 +515,8 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
510 do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
511 if( do_search[0] )
512 {
513- if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART
514- || h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
515+ if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART ||
516+ h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
517 {
518 x264_emms();
519 x264_weights_analyse( h, frames[b], frames[p0], 1 );
520--
5211.7.0.4
522
523
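The redundancy the patch above removes is easy to see in isolation: adaptive quantization and weighted prediction both need the frame's pixel sum and sum of squares, which the var[] primitive already produces per 16x16 block. A single-plane sketch of the shared statistic, simplified from what ac_energy_plane accumulates and assuming plain C arithmetic in place of the packed 64-bit sum/ssd return:

#include <stdio.h>
#include <stdint.h>

/* Accumulate sum and sum-of-squares over a w x h 8-bit plane. */
static void plane_stats( const uint8_t *pix, int stride, int w, int h,
                         uint64_t *sum, uint64_t *ssd )
{
    *sum = *ssd = 0;
    for( int y = 0; y < h; y++ )
        for( int x = 0; x < w; x++ )
        {
            uint64_t v = pix[y*stride + x];
            *sum += v;
            *ssd += v * v;
        }
}

int main(void)
{
    enum { W = 32, H = 32 };
    static uint8_t plane[W*H];
    for( int i = 0; i < W*H; i++ )
        plane[i] = (uint8_t)(i * 7);

    uint64_t sum, ssd;
    plane_stats( plane, W, W, H, &sum, &ssd );

    /* Same mean removal as the end of x264_adaptive_quant_frame in the patch:
     * i_pixel_ssd becomes ssd - round(sum^2 / N), i.e. N times the variance,
     * which is what x264_weights_analyse then takes sqrt() of. */
    uint64_t n = (uint64_t)W * H;
    uint64_t ssd_minus_mean = ssd - (sum * sum + n/2) / n;

    printf( "sum=%llu ssd=%llu ssd-mean=%llu\n",
            (unsigned long long)sum, (unsigned long long)ssd,
            (unsigned long long)ssd_minus_mean );
    return 0;
}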
524From 794713a35eadcd999d5aab4a50274ca43f29be93 Mon Sep 17 00:00:00 2001
525From: Jason Garrett-Glaser <darkshikari@gmail.com>
526Date: Thu, 27 May 2010 10:42:15 -0700
527Subject: [PATCH 06/11] Add fast skip in lookahead motion search
528 Helps speed very significantly on motionless blocks.
529
530---
531 encoder/slicetype.c | 16 +++++++++++++++-
532 1 files changed, 15 insertions(+), 1 deletions(-)
533
534diff --git a/encoder/slicetype.c b/encoder/slicetype.c
535index e454e12..d7cfe5c 100644
536--- a/encoder/slicetype.c
537+++ b/encoder/slicetype.c
538@@ -379,11 +379,25 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
539 CP32( m[l].mvp, mvc[0] );
540 else
541 x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] );
542- x264_me_search( h, &m[l], mvc, i_mvc );
543
544+ /* Fast skip for cases of near-zero residual. Shortcut: don't bother except in the mv0 case,
545+ * since anything else is likely to have enough residual to not trigger the skip. */
546+ if( !M32( m[l].mvp ) )
547+ {
548+ m[l].cost = h->pixf.mbcmp[PIXEL_8x8]( m[l].p_fenc[0], FENC_STRIDE, m[l].p_fref[0], m[l].i_stride[0] );
549+ if( m[l].cost < 64 )
550+ {
551+ M32( m[l].mv ) = 0;
552+ goto skip_motionest;
553+ }
554+ }
555+
556+ x264_me_search( h, &m[l], mvc, i_mvc );
557 m[l].cost -= 2; // remove mvcost from skip mbs
558 if( M32( m[l].mv ) )
559 m[l].cost += 5;
560+
561+skip_motionest:
562 CP32( fenc_mvs[l], m[l].mv );
563 *fenc_costs[l] = m[l].cost;
564 }
565--
5661.7.0.4
567
568
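The shortcut in the patch above is simply: when the MV predictor is zero, price the zero-MV candidate first and skip the full lowres search if the 8x8 cost is already below the threshold of 64. A self-contained sketch of that control flow, with hypothetical helper names and a plain C SAD standing in for h->pixf.mbcmp:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static int sad_8x8( const uint8_t *a, int ia, const uint8_t *b, int ib )
{
    int sad = 0;
    for( int y = 0; y < 8; y++ )
        for( int x = 0; x < 8; x++ )
            sad += abs( a[y*ia + x] - b[y*ib + x] );
    return sad;
}

/* Returns the block cost and writes the chosen MV; only falls through to the
 * (stubbed) full search when the zero-MV shortcut doesn't fire. */
static int lowres_me( const uint8_t *fenc, const uint8_t *fref, int stride,
                      const int16_t mvp[2], int16_t mv[2] )
{
    if( !mvp[0] && !mvp[1] )
    {
        int cost = sad_8x8( fenc, stride, fref, stride );
        if( cost < 64 )             /* same threshold as the patch */
        {
            mv[0] = mv[1] = 0;
            return cost;            /* the patch's skip_motionest path */
        }
    }
    /* ... the real code would run x264_me_search() here ... */
    mv[0] = mv[1] = 0;
    return sad_8x8( fenc, stride, fref, stride );
}

int main(void)
{
    uint8_t fenc[8*8], fref[8*8];
    memset( fenc, 128, sizeof(fenc) );
    memcpy( fref, fenc, sizeof(fref) );     /* motionless block */
    const int16_t mvp[2] = { 0, 0 };
    int16_t mv[2];
    int cost = lowres_me( fenc, fref, 8, mvp, mv );
    printf( "cost=%d mv=(%d,%d)\n", cost, mv[0], mv[1] );
    return 0;
}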
569From 77b568b22d42baa344dad050aef420de3b22e126 Mon Sep 17 00:00:00 2001
570From: Henrik Gramner <hengar-6@student.ltu.se>
571Date: Thu, 27 May 2010 22:18:38 +0200
572Subject: [PATCH 07/11] Optimize out some x264_scan8 reads
573
574---
575 encoder/analyse.c | 15 ++++-----
576 encoder/macroblock.c | 82 ++++++++++++++++++++++++++++++--------------------
577 encoder/me.c | 25 ++++++++-------
578 3 files changed, 70 insertions(+), 52 deletions(-)
579
580diff --git a/encoder/analyse.c b/encoder/analyse.c
581index a128a70..9e85e89 100644
582--- a/encoder/analyse.c
583+++ b/encoder/analyse.c
584@@ -907,8 +907,6 @@ static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
585 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
586 {
587 uint8_t *p_dst = h->mb.pic.p_fdec[0];
588-
589- int x, y;
590 uint64_t i_satd, i_best;
591 h->mb.i_skip_intra = 0;
592
593@@ -1031,8 +1029,9 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
594 int i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
595
596 i_best = COST_MAX64;
597- x = idx&1;
598- y = idx>>1;
599+ int x = idx&1;
600+ int y = idx>>1;
601+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
602
603 p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
604 predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
605@@ -1061,8 +1060,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
606 if( !(idx&1) )
607 for( int j = 0; j < 7; j++ )
608 pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
609- i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
610- i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
611+ i_nnz[0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8] );
612+ i_nnz[1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8] );
613 }
614 }
615 a->i_cbp_i8x8_luma = cbp_luma_new;
616@@ -1070,8 +1069,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
617 if( !(idx&1) )
618 for( int j = 0; j < 7; j++ )
619 p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
620- M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
621- M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
622+ M16( &h->mb.cache.non_zero_count[s8 + 0*8] ) = i_nnz[0];
623+ M16( &h->mb.cache.non_zero_count[s8 + 1*8] ) = i_nnz[1];
624
625 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
626 }
627diff --git a/encoder/macroblock.c b/encoder/macroblock.c
628index 984f8a8..cdc4563 100644
629--- a/encoder/macroblock.c
630+++ b/encoder/macroblock.c
631@@ -135,11 +135,12 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
632 }
633 }
634
635-#define STORE_8x8_NNZ(idx,nz)\
636+#define STORE_8x8_NNZ( s8, nz )\
637+do\
638 {\
639- M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] ) = nz * 0x0101;\
640- M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] ) = nz * 0x0101;\
641-}
642+ M16( &h->mb.cache.non_zero_count[(s8) + 0*8] ) = (nz) * 0x0101;\
643+ M16( &h->mb.cache.non_zero_count[(s8) + 1*8] ) = (nz) * 0x0101;\
644+} while(0)
645
646 #define CLEAR_16x16_NNZ \
647 {\
648@@ -151,17 +152,18 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
649
650 void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
651 {
652- int x = 8 * (idx&1);
653- int y = 8 * (idx>>1);
654+ int x = idx&1;
655+ int y = idx>>1;
656+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
657 int nz;
658- uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
659- uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
660+ uint8_t *p_src = &h->mb.pic.p_fenc[0][8*x + 8*y*FENC_STRIDE];
661+ uint8_t *p_dst = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
662 ALIGNED_ARRAY_16( int16_t, dct8x8,[64] );
663
664 if( h->mb.b_lossless )
665 {
666 nz = h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
667- STORE_8x8_NNZ(idx,nz);
668+ STORE_8x8_NNZ( s8, nz );
669 h->mb.i_cbp_luma |= nz<<idx;
670 return;
671 }
672@@ -175,10 +177,10 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
673 h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
674 h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
675 h->dctf.add8x8_idct8( p_dst, dct8x8 );
676- STORE_8x8_NNZ(idx,1);
677+ STORE_8x8_NNZ( s8, 1 );
678 }
679 else
680- STORE_8x8_NNZ(idx,0);
681+ STORE_8x8_NNZ( s8, 0 );
682 }
683
684 static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
685@@ -728,12 +730,13 @@ void x264_macroblock_encode( x264_t *h )
686 if( h->mb.b_transform_8x8 )
687 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
688 {
689- int x = 8*(i8x8&1);
690- int y = 8*(i8x8>>1);
691- nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8],
692- h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
693- h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
694- STORE_8x8_NNZ(i8x8,nz);
695+ int x = i8x8&1;
696+ int y = i8x8>>1;
697+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
698+
699+ nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8], h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE,
700+ h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE );
701+ STORE_8x8_NNZ( s8, nz );
702 h->mb.i_cbp_luma |= nz << i8x8;
703 }
704 else
705@@ -783,14 +786,18 @@ void x264_macroblock_encode( x264_t *h )
706 {
707 for( int idx = 0; idx < 4; idx++ )
708 {
709+ int x = idx&1;
710+ int y = idx>>1;
711+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
712+
713 if( h->mb.i_cbp_luma&(1<<idx) )
714 {
715 h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
716- h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*FDEC_STRIDE], dct8x8[idx] );
717- STORE_8x8_NNZ(idx,1);
718+ h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], dct8x8[idx] );
719+ STORE_8x8_NNZ( s8, 1 );
720 }
721 else
722- STORE_8x8_NNZ(idx,0);
723+ STORE_8x8_NNZ( s8, 0 );
724 }
725 }
726 }
727@@ -825,18 +832,24 @@ void x264_macroblock_encode( x264_t *h )
728 }
729 }
730
731+ int x = i8x8&1;
732+ int y = i8x8>>1;
733+
734 /* decimate this 8x8 block */
735 i_decimate_mb += i_decimate_8x8;
736 if( b_decimate )
737 {
738 if( i_decimate_8x8 < 4 )
739- STORE_8x8_NNZ(i8x8,0)
740+ {
741+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
742+ STORE_8x8_NNZ( s8, 0 );
743+ }
744 else
745 h->mb.i_cbp_luma |= 1<<i8x8;
746 }
747 else if( cbp )
748 {
749- h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
750+ h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], &dct4x4[i8x8*4] );
751 h->mb.i_cbp_luma |= 1<<i8x8;
752 }
753 }
754@@ -1045,8 +1058,11 @@ void x264_noise_reduction_update( x264_t *h )
755 void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
756 {
757 int i_qp = h->mb.i_qp;
758- uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE;
759- uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
760+ int x = i8&1;
761+ int y = i8>>1;
762+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
763+ uint8_t *p_fenc = h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE;
764+ uint8_t *p_fdec = h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE;
765 int b_decimate = h->mb.b_dct_decimate;
766 int nnz8x8 = 0;
767 int nz;
768@@ -1059,7 +1075,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
769 if( h->mb.b_transform_8x8 )
770 {
771 nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
772- STORE_8x8_NNZ(i8,nnz8x8);
773+ STORE_8x8_NNZ( s8, nnz8x8 );
774 }
775 else
776 {
777@@ -1075,8 +1091,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
778 for( int ch = 0; ch < 2; ch++ )
779 {
780 int16_t dc;
781- p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
782- p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
783+ p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
784+ p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
785 nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec, &dc );
786 h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
787 }
788@@ -1099,13 +1115,13 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
789 {
790 h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
791 h->dctf.add8x8_idct8( p_fdec, dct8x8 );
792- STORE_8x8_NNZ(i8,1);
793+ STORE_8x8_NNZ( s8, 1 );
794 }
795 else
796- STORE_8x8_NNZ(i8,0);
797+ STORE_8x8_NNZ( s8, 0 );
798 }
799 else
800- STORE_8x8_NNZ(i8,0);
801+ STORE_8x8_NNZ( s8, 0 );
802 }
803 else
804 {
805@@ -1132,7 +1148,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
806 if( nnz8x8 )
807 h->dctf.add8x8_idct( p_fdec, dct4x4 );
808 else
809- STORE_8x8_NNZ(i8,0);
810+ STORE_8x8_NNZ( s8, 0 );
811 }
812
813 i_qp = h->mb.i_chroma_qp;
814@@ -1140,8 +1156,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
815 for( int ch = 0; ch < 2; ch++ )
816 {
817 ALIGNED_ARRAY_16( int16_t, dct4x4,[16] );
818- p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
819- p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
820+ p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
821+ p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
822
823 h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
824 dct4x4[0] = 0;
825diff --git a/encoder/me.c b/encoder/me.c
826index 77073cc..40d0650 100644
827--- a/encoder/me.c
828+++ b/encoder/me.c
829@@ -937,8 +937,11 @@ int x264_iter_kludge = 0;
830
831 static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2, int rd )
832 {
833- int16_t *cache0_mv = h->mb.cache.mv[0][x264_scan8[i8*4]];
834- int16_t *cache1_mv = h->mb.cache.mv[1][x264_scan8[i8*4]];
835+ int x = i8&1;
836+ int y = i8>>1;
837+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
838+ int16_t *cache0_mv = h->mb.cache.mv[0][s8];
839+ int16_t *cache1_mv = h->mb.cache.mv[1][s8];
840 const int i_pixel = m0->i_pixel;
841 const int bw = x264_pixel_size[i_pixel].w;
842 const int bh = x264_pixel_size[i_pixel].h;
843@@ -946,11 +949,11 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
844 ALIGNED_ARRAY_8( uint8_t, pixu_buf,[2],[9][8*8] );
845 ALIGNED_ARRAY_8( uint8_t, pixv_buf,[2],[9][8*8] );
846 uint8_t *src[2][9];
847- uint8_t *pix = &h->mb.pic.p_fdec[0][(i8>>1)*8*FDEC_STRIDE+(i8&1)*8];
848- uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
849- uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
850- const int ref0 = h->mb.cache.ref[0][x264_scan8[i8*4]];
851- const int ref1 = h->mb.cache.ref[1][x264_scan8[i8*4]];
852+ uint8_t *pix = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
853+ uint8_t *pixu = &h->mb.pic.p_fdec[1][4*x + 4*y*FDEC_STRIDE];
854+ uint8_t *pixv = &h->mb.pic.p_fdec[2][4*x + 4*y*FDEC_STRIDE];
855+ int ref0 = h->mb.cache.ref[0][s8];
856+ int ref1 = h->mb.cache.ref[1][s8];
857 const int mv0y_offset = h->mb.b_interlaced & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
858 const int mv1y_offset = h->mb.b_interlaced & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
859 int stride[2][9];
860@@ -1058,13 +1061,13 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
861
862 if( rd )
863 {
864- x264_macroblock_cache_mv ( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) );
865+ x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) );
866 amvd = pack8to16( X264_MIN(abs(bm0x - m0->mvp[0]),33), X264_MIN(abs(bm0y - m0->mvp[1]),33) );
867- x264_macroblock_cache_mvd( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 0, amvd );
868+ x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 0, amvd );
869
870- x264_macroblock_cache_mv ( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) );
871+ x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) );
872 amvd = pack8to16( X264_MIN(abs(bm1x - m1->mvp[0]),33), X264_MIN(abs(bm1y - m1->mvp[1]),33) );
873- x264_macroblock_cache_mvd( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 1, amvd );
874+ x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 1, amvd );
875 }
876
877 m0->mv[0] = bm0x;
878--
8791.7.0.4
880
881
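The identity the patch above relies on: for 8x8 block index i8 with x = i8&1 and y = i8>>1, the cache offset X264_SCAN8_0 + 2*x + 16*y equals x264_scan8[4*i8], because the nnz/mv cache is laid out 8 entries per row with luma block 0 at offset 12. A quick standalone check against a copy of the contemporary luma scan8 layout (an assumption worth verifying against the tree you are reading):

#include <assert.h>
#include <stdio.h>

#define X264_SCAN8_0 (4+1*8)    /* == 12, offset of luma block 0 in the cache */

/* Luma portion of x264_scan8[]: 4x4 blocks in coding order, mapped into an
 * 8-entries-per-row cache. */
static const int scan8[16] =
{
    4+1*8, 5+1*8, 4+2*8, 5+2*8,
    6+1*8, 7+1*8, 6+2*8, 7+2*8,
    4+3*8, 5+3*8, 4+4*8, 5+4*8,
    6+3*8, 7+3*8, 6+4*8, 7+4*8,
};

int main(void)
{
    for( int i8 = 0; i8 < 4; i8++ )
    {
        int x = i8&1, y = i8>>1;
        int s8 = X264_SCAN8_0 + 2*x + 16*y;
        assert( s8     == scan8[4*i8]   );  /* top-left 4x4 of the 8x8 block   */
        assert( s8 + 1 == scan8[4*i8+1] );  /* its right neighbour             */
        assert( s8 + 8 == scan8[4*i8+2] );  /* the row below (cache is 8 wide) */
        assert( s8 + 9 == scan8[4*i8+3] );
        printf( "i8=%d -> s8=%d\n", i8, s8 );
    }
    return 0;
}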
882From 0c7cf0bfb1d30ee8e7f1b355fef5aa9e2db929d2 Mon Sep 17 00:00:00 2001
883From: Henrik Gramner <hengar-6@student.ltu.se>
884Date: Sun, 30 May 2010 22:45:14 +0200
885Subject: [PATCH 08/11] Some deblocking-related optimizations
886
887---
888 common/deblock.c | 8 ++++----
889 common/macroblock.c | 43 +++++++++++++++++++++++--------------------
890 2 files changed, 27 insertions(+), 24 deletions(-)
891
892diff --git a/common/deblock.c b/common/deblock.c
893index 27c73ae..3296dbf 100644
894--- a/common/deblock.c
895+++ b/common/deblock.c
896@@ -299,7 +299,7 @@ static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2,
897 void x264_frame_deblock_row( x264_t *h, int mb_y )
898 {
899 int b_interlaced = h->sh.b_mbaff;
900- int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
901+ int qp_thresh = 15 - X264_MIN( h->sh.i_alpha_c0_offset, h->sh.i_beta_offset ) - X264_MAX( 0, h->param.analyse.i_chroma_qp_offset );
902 int stridey = h->fdec->i_stride[0];
903 int stride2y = stridey << b_interlaced;
904 int strideuv = h->fdec->i_stride[1];
905@@ -318,7 +318,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
906 uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
907 uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
908 uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
909- if( b_interlaced && (mb_y&1) )
910+ if( mb_y & b_interlaced )
911 {
912 pixy -= 15*stridey;
913 pixu -= 7*strideuv;
914@@ -366,12 +366,12 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
915 int qp_top = (qp + qpt + 1) >> 1;
916 int qpc_top = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpt] + 1) >> 1;
917 int intra_top = IS_INTRA( h->mb.type[h->mb.i_mb_top_xy] );
918- if( !b_interlaced && (intra_cur || intra_top) )
919+ if( ~b_interlaced & (intra_cur | intra_top) )
920 FILTER( _intra, 1, 0, qp_top, qpc_top );
921 else
922 {
923 if( intra_top )
924- memset( bs[1][0], 3, sizeof(bs[1][0]) );
925+ M32( bs[1][0] ) = 0x03030303;
926 FILTER( , 1, 0, qp_top, qpc_top );
927 }
928 }
929diff --git a/common/macroblock.c b/common/macroblock.c
930index 01c90d2..26f63f5 100644
931--- a/common/macroblock.c
932+++ b/common/macroblock.c
933@@ -400,9 +400,27 @@ void x264_macroblock_slice_init( x264_t *h )
934 }
935 }
936 }
937- if( h->sh.i_type == SLICE_TYPE_P )
938+ else if( h->sh.i_type == SLICE_TYPE_P )
939+ {
940 memset( h->mb.cache.skip, 0, sizeof( h->mb.cache.skip ) );
941
942+ if( h->sh.i_disable_deblocking_filter_idc != 1 && h->param.analyse.i_weighted_pred )
943+ {
944+ deblock_ref_table(-2) = -2;
945+ deblock_ref_table(-1) = -1;
946+ for( int i = 0; i < h->i_ref0 << h->sh.b_mbaff; i++ )
947+ {
948+ /* Mask off high bits to avoid frame num collisions with -1/-2.
949+ * In current x264 frame num values don't cover a range of more
950+ * than 32, so 6 bits is enough for uniqueness. */
951+ if( !h->mb.b_interlaced )
952+ deblock_ref_table(i) = h->fref0[i]->i_frame_num&63;
953+ else
954+ deblock_ref_table(i) = ((h->fref0[i>>1]->i_frame_num&63)<<1) + (i&1);
955+ }
956+ }
957+ }
958+
959 /* init with not available (for top right idx=7,15) */
960 memset( h->mb.cache.ref, -2, sizeof( h->mb.cache.ref ) );
961
962@@ -418,19 +436,6 @@ void x264_macroblock_slice_init( x264_t *h )
963 h->fdec->inv_ref_poc[field] = (256 + delta/2) / delta;
964 }
965
966- deblock_ref_table(-2) = -2;
967- deblock_ref_table(-1) = -1;
968- for( int i = 0; i < h->i_ref0 << h->sh.b_mbaff; i++ )
969- {
970- /* Mask off high bits to avoid frame num collisions with -1/-2.
971- * In current x264 frame num values don't cover a range of more
972- * than 32, so 6 bits is enough for uniqueness. */
973- if( !h->mb.b_interlaced )
974- deblock_ref_table(i) = h->fref0[i]->i_frame_num&63;
975- else
976- deblock_ref_table(i) = ((h->fref0[i>>1]->i_frame_num&63)<<1) + (i&1);
977- }
978-
979 h->mb.i_neighbour4[6] =
980 h->mb.i_neighbour4[9] =
981 h->mb.i_neighbour4[12] =
982@@ -894,7 +899,6 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
983 void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
984 {
985 int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
986- int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
987
988 h->mb.i_neighbour = 0;
989 h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
990@@ -906,9 +910,9 @@ void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_
991 h->mb.i_neighbour |= MB_LEFT;
992 }
993
994- if( top >= 0 )
995+ if( mb_y > h->mb.b_interlaced )
996 {
997- h->mb.i_mb_top_xy = top;
998+ h->mb.i_mb_top_xy = h->mb.i_mb_xy - (h->mb.i_mb_stride << h->mb.b_interlaced);
999 if( deblock_on_slice_edges || h->mb.slice_table[h->mb.i_mb_top_xy] == h->mb.slice_table[h->mb.i_mb_xy] )
1000 h->mb.i_neighbour |= MB_TOP;
1001 }
1002@@ -930,8 +934,6 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
1003 h->mb.i_neighbour &= ~old_neighbour;
1004 if( h->mb.i_neighbour )
1005 {
1006- int left = h->mb.i_mb_left_xy;
1007- int top = h->mb.i_mb_top_xy;
1008 int top_y = mb_y - (1 << h->mb.b_interlaced);
1009 int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
1010 int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
1011@@ -941,10 +943,11 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
1012 uint8_t (*nnz)[24] = h->mb.non_zero_count;
1013
1014 if( h->mb.i_neighbour & MB_TOP )
1015- CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[top][12] );
1016+ CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[h->mb.i_mb_top_xy][12] );
1017
1018 if( h->mb.i_neighbour & MB_LEFT )
1019 {
1020+ int left = h->mb.i_mb_left_xy;
1021 h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
1022 h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
1023 h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
1024--
10251.7.0.4
1026
1027
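Two of the rewrites in the patch above trade short-circuit logic for bitwise ops, which is only valid because the operands are 0/1 flags (b_interlaced, mb_y&1, and the IS_INTRA() results); the memset-to-M32 change similarly writes the same four bytes of 0x03 in a single store. A minimal exhaustive check of the two logical identities:

#include <assert.h>
#include <stdio.h>

int main(void)
{
    /* !b_interlaced && (intra_cur || intra_top)
     *   <=>  ~b_interlaced & (intra_cur | intra_top), for 0/1 flags */
    for( int bi = 0; bi <= 1; bi++ )
        for( int cur = 0; cur <= 1; cur++ )
            for( int top = 0; top <= 1; top++ )
                assert( (!bi && (cur || top)) == ((~bi & (cur | top)) != 0) );

    /* b_interlaced && (mb_y & 1)  <=>  mb_y & b_interlaced, for b_interlaced in {0,1} */
    for( int bi = 0; bi <= 1; bi++ )
        for( int mb_y = 0; mb_y < 8; mb_y++ )
            assert( (bi && (mb_y & 1)) == ((mb_y & bi) != 0) );

    puts( "both rewrites are equivalent for 0/1 flags" );
    return 0;
}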
1028From bdc68d651db64045aecb28f27e0e05e027ab48eb Mon Sep 17 00:00:00 2001
1029From: Jason Garrett-Glaser <darkshikari@gmail.com>
1030Date: Fri, 28 May 2010 14:30:07 -0700
1031Subject: [PATCH 09/11] Re-enable i8x8 merged SATD
1032 Accidentally got disabled when intra_sad_x3 was added.
1033
1034---
1035 encoder/encoder.c | 1 +
1036 1 files changed, 1 insertions(+), 0 deletions(-)
1037
1038diff --git a/encoder/encoder.c b/encoder/encoder.c
1039index 6e0dc54..7717ea8 100644
1040--- a/encoder/encoder.c
1041+++ b/encoder/encoder.c
1042@@ -810,6 +810,7 @@ static void mbcmp_init( x264_t *h )
1043 memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
1044 h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
1045 h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c;
1046+ h->pixf.intra_mbcmp_x3_8x8 = satd ? h->pixf.intra_sa8d_x3_8x8 : h->pixf.intra_sad_x3_8x8;
1047 h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4;
1048 satd &= h->param.analyse.i_me_method == X264_ME_TESA;
1049 memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
1050--
10511.7.0.4
1052
1053
1054From c211bfffa59599e6a90df2e0fd00f4ae9e01ada0 Mon Sep 17 00:00:00 2001
1055From: Jason Garrett-Glaser <darkshikari@gmail.com>
1056Date: Thu, 27 May 2010 14:27:32 -0700
1057Subject: [PATCH 10/11] x86 assembly code for NAL escaping
1058 Up to ~10x faster than C depending on CPU.
1059 Helps the most at very high bitrates (e.g. lossless).
1060 Also make the C code faster and simpler.
1061
1062---
1063 Makefile | 4 +-
1064 common/bitstream.c | 92 ++++++++++++++
1065 common/bitstream.h | 299 ++++++++++++++++++++++++++++++++++++++++++++
1066 common/bs.h | 291 ------------------------------------------
1067 common/common.c | 54 --------
1068 common/common.h | 5 +-
1069 common/x86/bitstream-a.asm | 112 +++++++++++++++++
1070 common/x86/deblock-a.asm | 1 +
1071 encoder/encoder.c | 3 +-
1072 tools/checkasm.c | 52 ++++++++-
1073 10 files changed, 561 insertions(+), 352 deletions(-)
1074 create mode 100644 common/bitstream.c
1075 create mode 100644 common/bitstream.h
1076 delete mode 100644 common/bs.h
1077 create mode 100644 common/x86/bitstream-a.asm
1078
1079diff --git a/Makefile b/Makefile
1080index 0b43a3e..519e181 100644
1081--- a/Makefile
1082+++ b/Makefile
1083@@ -8,7 +8,7 @@ SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
1084 common/frame.c common/dct.c common/cpu.c common/cabac.c \
1085 common/common.c common/mdate.c common/rectangle.c \
1086 common/set.c common/quant.c common/deblock.c common/vlc.c \
1087- common/mvpred.c \
1088+ common/mvpred.c common/bitstream.c \
1089 encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
1090 encoder/set.c encoder/macroblock.c encoder/cabac.c \
1091 encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
1092@@ -52,7 +52,7 @@ endif
1093 ifneq ($(AS),)
1094 X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \
1095 mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
1096- cpu-a.asm dct-32.asm
1097+ cpu-a.asm dct-32.asm bitstream-a.asm
1098 X86SRC = $(X86SRC0:%=common/x86/%)
1099
1100 ifeq ($(ARCH),X86)
1101diff --git a/common/bitstream.c b/common/bitstream.c
1102new file mode 100644
1103index 0000000..0aaac21
1104--- /dev/null
1105+++ b/common/bitstream.c
1106@@ -0,0 +1,92 @@
1107+/*****************************************************************************
1108+ * bitstream.c: h264 encoder library
1109+ *****************************************************************************
1110+ * Copyright (C) 2010 x264 project
1111+ *
1112+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
1113+ * Jason Garrett-Glaser <darkshikari@gmail.com>
1114+ *
1115+ * This program is free software; you can redistribute it and/or modify
1116+ * it under the terms of the GNU General Public License as published by
1117+ * the Free Software Foundation; either version 2 of the License, or
1118+ * (at your option) any later version.
1119+ *
1120+ * This program is distributed in the hope that it will be useful,
1121+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1122+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1123+ * GNU General Public License for more details.
1124+ *
1125+ * You should have received a copy of the GNU General Public License
1126+ * along with this program; if not, write to the Free Software
1127+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
1128+ *****************************************************************************/
1129+
1130+#include "common.h"
1131+
1132+static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
1133+{
1134+ if( src < end ) *dst++ = *src++;
1135+ if( src < end ) *dst++ = *src++;
1136+ while( src < end )
1137+ {
1138+ if( src[0] <= 0x03 && !dst[-2] && !dst[-1] )
1139+ *dst++ = 0x03;
1140+ *dst++ = *src++;
1141+ }
1142+ return dst;
1143+}
1144+
1145+#ifdef HAVE_MMX
1146+uint8_t *x264_nal_escape_mmxext( uint8_t *dst, uint8_t *src, uint8_t *end );
1147+uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
1148+#endif
1149+
1150+/****************************************************************************
1151+ * x264_nal_encode:
1152+ ****************************************************************************/
1153+int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode )
1154+{
1155+ uint8_t *src = nal->p_payload;
1156+ uint8_t *end = nal->p_payload + nal->i_payload;
1157+ uint8_t *orig_dst = dst;
1158+
1159+ if( h->param.b_annexb )
1160+ {
1161+ if( b_long_startcode )
1162+ *dst++ = 0x00;
1163+ *dst++ = 0x00;
1164+ *dst++ = 0x00;
1165+ *dst++ = 0x01;
1166+ }
1167+ else /* save room for size later */
1168+ dst += 4;
1169+
1170+ /* nal header */
1171+ *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
1172+
1173+ dst = h->bsf.nal_escape( dst, src, end );
1174+ int size = (dst - orig_dst) - 4;
1175+
1176+ /* Write the size header for mp4/etc */
1177+ if( !h->param.b_annexb )
1178+ {
1179+ /* Size doesn't include the size of the header we're writing now. */
1180+ orig_dst[0] = size>>24;
1181+ orig_dst[1] = size>>16;
1182+ orig_dst[2] = size>> 8;
1183+ orig_dst[3] = size>> 0;
1184+ }
1185+
1186+ return size+4;
1187+}
1188+
1189+void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
1190+{
1191+ pf->nal_escape = x264_nal_escape_c;
1192+#ifdef HAVE_MMX
1193+ if( cpu&X264_CPU_MMXEXT )
1194+ pf->nal_escape = x264_nal_escape_mmxext;
1195+ if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
1196+ pf->nal_escape = x264_nal_escape_sse2;
1197+#endif
1198+}
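For context on x264_nal_escape_c above: Annex B forbids the byte sequences 00 00 00 through 00 00 03 inside a NAL payload, so an emulation-prevention byte 0x03 is inserted after any 00 00 that would otherwise be followed by a byte <= 0x03. Below is a standalone restatement of that C reference path with a tiny test vector; the MMX/SSE2 versions added in bitstream-a.asm are meant to produce identical output, just faster:

#include <stdio.h>
#include <stdint.h>

static int nal_escape( uint8_t *dst, const uint8_t *src, int len )
{
    uint8_t *dst0 = dst;
    const uint8_t *end = src + len;
    if( src < end ) *dst++ = *src++;
    if( src < end ) *dst++ = *src++;
    while( src < end )
    {
        if( src[0] <= 0x03 && !dst[-2] && !dst[-1] )
            *dst++ = 0x03;          /* emulation prevention byte */
        *dst++ = *src++;
    }
    return dst - dst0;
}

int main(void)
{
    const uint8_t in[] = { 0x67, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x02 };
    uint8_t out[2*sizeof(in)];
    int n = nal_escape( out, in, sizeof(in) );
    for( int i = 0; i < n; i++ )
        printf( "%02x ", out[i] );
    printf( "\n" );    /* expected: 67 00 00 03 00 01 00 00 03 02 */
    return 0;
}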
1199diff --git a/common/bitstream.h b/common/bitstream.h
1200new file mode 100644
1201index 0000000..d018c7d
1202--- /dev/null
1203+++ b/common/bitstream.h
1204@@ -0,0 +1,299 @@
1205+/*****************************************************************************
1206+ * bitstream.h: h264 encoder library
1207+ *****************************************************************************
1208+ * Copyright (C) 2003-2008 x264 project
1209+ *
1210+ * Authors: Loren Merritt <lorenm@u.washington.edu>
1211+ * Jason Garrett-Glaser <darkshikari@gmail.com>
1212+ * Laurent Aimar <fenrir@via.ecp.fr>
1213+ *
1214+ * This program is free software; you can redistribute it and/or modify
1215+ * it under the terms of the GNU General Public License as published by
1216+ * the Free Software Foundation; either version 2 of the License, or
1217+ * (at your option) any later version.
1218+ *
1219+ * This program is distributed in the hope that it will be useful,
1220+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1221+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1222+ * GNU General Public License for more details.
1223+ *
1224+ * You should have received a copy of the GNU General Public License
1225+ * along with this program; if not, write to the Free Software
1226+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
1227+ *****************************************************************************/
1228+
1229+#ifndef X264_BS_H
1230+#define X264_BS_H
1231+
1232+typedef struct
1233+{
1234+ uint8_t i_bits;
1235+ uint8_t i_size;
1236+} vlc_t;
1237+
1238+typedef struct
1239+{
1240+ uint16_t i_bits;
1241+ uint8_t i_size;
1242+ /* Next level table to use */
1243+ uint8_t i_next;
1244+} vlc_large_t;
1245+
1246+typedef struct bs_s
1247+{
1248+ uint8_t *p_start;
1249+ uint8_t *p;
1250+ uint8_t *p_end;
1251+
1252+ intptr_t cur_bits;
1253+ int i_left; /* i_count number of available bits */
1254+ int i_bits_encoded; /* RD only */
1255+} bs_t;
1256+
1257+typedef struct
1258+{
1259+ int last;
1260+ int16_t level[16];
1261+ uint8_t run[16];
1262+} x264_run_level_t;
1263+
1264+extern const vlc_t x264_coeff0_token[5];
1265+extern const vlc_t x264_coeff_token[5][16][4];
1266+extern const vlc_t x264_total_zeros[15][16];
1267+extern const vlc_t x264_total_zeros_dc[3][4];
1268+extern const vlc_t x264_run_before[7][16];
1269+
1270+typedef struct
1271+{
1272+ uint8_t *(*nal_escape) ( uint8_t *dst, uint8_t *src, uint8_t *end );
1273+} x264_bitstream_function_t;
1274+
1275+int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode );
1276+void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );
1277+
1278+/* A larger level table size theoretically could help a bit at extremely
1279+ * high bitrates, but the cost in cache is usually too high for it to be
1280+ * useful.
1281+ * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
1282+ * FIXME: Do further testing? */
1283+#define LEVEL_TABLE_SIZE 128
1284+extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
1285+
1286+static inline void bs_init( bs_t *s, void *p_data, int i_data )
1287+{
1288+ int offset = ((intptr_t)p_data & 3);
1289+ s->p = s->p_start = (uint8_t*)p_data - offset;
1290+ s->p_end = (uint8_t*)p_data + i_data;
1291+ s->i_left = (WORD_SIZE - offset)*8;
1292+ s->cur_bits = endian_fix32( M32(s->p) );
1293+ s->cur_bits >>= (4-offset)*8;
1294+}
1295+static inline int bs_pos( bs_t *s )
1296+{
1297+ return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
1298+}
1299+
1300+/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
1301+static inline void bs_flush( bs_t *s )
1302+{
1303+ M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
1304+ s->p += WORD_SIZE - s->i_left / 8;
1305+ s->i_left = WORD_SIZE*8;
1306+}
1307+/* The inverse of bs_flush: prepare the bitstream to be written to again. */
1308+static inline void bs_realign( bs_t *s )
1309+{
1310+ int offset = ((intptr_t)s->p & 3);
1311+ if( offset )
1312+ {
1313+ s->p = (uint8_t*)s->p - offset;
1314+ s->i_left = (WORD_SIZE - offset)*8;
1315+ s->cur_bits = endian_fix32( M32(s->p) );
1316+ s->cur_bits >>= (4-offset)*8;
1317+ }
1318+}
1319+
1320+static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
1321+{
1322+ if( WORD_SIZE == 8 )
1323+ {
1324+ s->cur_bits = (s->cur_bits << i_count) | i_bits;
1325+ s->i_left -= i_count;
1326+ if( s->i_left <= 32 )
1327+ {
1328+#ifdef WORDS_BIGENDIAN
1329+ M32( s->p ) = s->cur_bits >> (32 - s->i_left);
1330+#else
1331+ M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
1332+#endif
1333+ s->i_left += 32;
1334+ s->p += 4;
1335+ }
1336+ }
1337+ else
1338+ {
1339+ if( i_count < s->i_left )
1340+ {
1341+ s->cur_bits = (s->cur_bits << i_count) | i_bits;
1342+ s->i_left -= i_count;
1343+ }
1344+ else
1345+ {
1346+ i_count -= s->i_left;
1347+ s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
1348+ M32( s->p ) = endian_fix( s->cur_bits );
1349+ s->p += 4;
1350+ s->cur_bits = i_bits;
1351+ s->i_left = 32 - i_count;
1352+ }
1353+ }
1354+}
1355+
1356+/* Special case to eliminate branch in normal bs_write. */
1357+/* Golomb never writes an even-size code, so this is only used in slice headers. */
1358+static inline void bs_write32( bs_t *s, uint32_t i_bits )
1359+{
1360+ bs_write( s, 16, i_bits >> 16 );
1361+ bs_write( s, 16, i_bits );
1362+}
1363+
1364+static inline void bs_write1( bs_t *s, uint32_t i_bit )
1365+{
1366+ s->cur_bits <<= 1;
1367+ s->cur_bits |= i_bit;
1368+ s->i_left--;
1369+ if( s->i_left == WORD_SIZE*8-32 )
1370+ {
1371+ M32( s->p ) = endian_fix32( s->cur_bits );
1372+ s->p += 4;
1373+ s->i_left = WORD_SIZE*8;
1374+ }
1375+}
1376+
1377+static inline void bs_align_0( bs_t *s )
1378+{
1379+ bs_write( s, s->i_left&7, 0 );
1380+ bs_flush( s );
1381+}
1382+static inline void bs_align_1( bs_t *s )
1383+{
1384+ bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 );
1385+ bs_flush( s );
1386+}
1387+static inline void bs_align_10( bs_t *s )
1388+{
1389+ if( s->i_left&7 )
1390+ bs_write( s, s->i_left&7, 1 << ( (s->i_left&7) - 1 ) );
1391+}
1392+
1393+/* golomb functions */
1394+
1395+static const uint8_t x264_ue_size_tab[256] =
1396+{
1397+ 1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
1398+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
1399+ 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
1400+ 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
1401+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1402+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1403+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1404+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1405+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1406+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1407+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1408+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1409+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1410+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1411+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1412+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1413+};
1414+
1415+static inline void bs_write_ue_big( bs_t *s, unsigned int val )
1416+{
1417+ int size = 0;
1418+ int tmp = ++val;
1419+ if( tmp >= 0x10000 )
1420+ {
1421+ size = 32;
1422+ tmp >>= 16;
1423+ }
1424+ if( tmp >= 0x100 )
1425+ {
1426+ size += 16;
1427+ tmp >>= 8;
1428+ }
1429+ size += x264_ue_size_tab[tmp];
1430+ bs_write( s, size>>1, 0 );
1431+ bs_write( s, (size>>1)+1, val );
1432+}
1433+
1434+/* Only works on values under 255. */
1435+static inline void bs_write_ue( bs_t *s, int val )
1436+{
1437+ bs_write( s, x264_ue_size_tab[val+1], val+1 );
1438+}
1439+
1440+static inline void bs_write_se( bs_t *s, int val )
1441+{
1442+ int size = 0;
1443+ /* Faster than (val <= 0 ? -val*2+1 : val*2) */
1444+ /* 4 instructions on x86, 3 on ARM */
1445+ int tmp = 1 - val*2;
1446+ if( tmp < 0 ) tmp = val*2;
1447+ val = tmp;
1448+
1449+ if( tmp >= 0x100 )
1450+ {
1451+ size = 16;
1452+ tmp >>= 8;
1453+ }
1454+ size += x264_ue_size_tab[tmp];
1455+ bs_write( s, size, val );
1456+}
1457+
1458+static inline void bs_write_te( bs_t *s, int x, int val )
1459+{
1460+ if( x == 1 )
1461+ bs_write1( s, 1^val );
1462+ else //if( x > 1 )
1463+ bs_write_ue( s, val );
1464+}
1465+
1466+static inline void bs_rbsp_trailing( bs_t *s )
1467+{
1468+ bs_write1( s, 1 );
1469+ bs_write( s, s->i_left&7, 0 );
1470+}
1471+
1472+static ALWAYS_INLINE int bs_size_ue( unsigned int val )
1473+{
1474+ return x264_ue_size_tab[val+1];
1475+}
1476+
1477+static ALWAYS_INLINE int bs_size_ue_big( unsigned int val )
1478+{
1479+ if( val < 255 )
1480+ return x264_ue_size_tab[val+1];
1481+ else
1482+ return x264_ue_size_tab[(val+1)>>8] + 16;
1483+}
1484+
1485+static ALWAYS_INLINE int bs_size_se( int val )
1486+{
1487+ int tmp = 1 - val*2;
1488+ if( tmp < 0 ) tmp = val*2;
1489+ if( tmp < 256 )
1490+ return x264_ue_size_tab[tmp];
1491+ else
1492+ return x264_ue_size_tab[tmp>>8]+16;
1493+}
1494+
1495+static ALWAYS_INLINE int bs_size_te( int x, int val )
1496+{
1497+ if( x == 1 )
1498+ return 1;
1499+ else //if( x > 1 )
1500+ return x264_ue_size_tab[val+1];
1501+}
1502+
1503+#endif
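A small aside on the golomb helpers above: bs_write_ue_big() emits val as an unsigned Exp-Golomb code, i.e. for k = bit-length(val+1) it writes k-1 zero bits followed by the k bits of val+1, and x264_ue_size_tab tabulates the resulting 2*k-1 total bits for table indexes up to 255. A throwaway encoder that prints those codes for the first few values:

#include <stdio.h>

/* Print the unsigned Exp-Golomb (ue(v)) code for val, as bs_write_ue_big()
 * would emit it: leading zeros, then the binary representation of val+1. */
static void print_ue( unsigned int val )
{
    unsigned int v = val + 1;
    int k = 0;                       /* bit-length of val+1 */
    for( unsigned int t = v; t; t >>= 1 )
        k++;
    printf( "ue(%u) = ", val );
    for( int i = 0; i < k-1; i++ )
        putchar( '0' );
    for( int i = k-1; i >= 0; i-- )
        putchar( '0' + ((v >> i) & 1) );
    printf( "   (%d bits)\n", 2*k - 1 );
}

int main(void)
{
    for( unsigned int val = 0; val <= 8; val++ )
        print_ue( val );
    return 0;
}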
1504diff --git a/common/bs.h b/common/bs.h
1505deleted file mode 100644
1506index 343a3c9..0000000
1507--- a/common/bs.h
1508+++ /dev/null
1509@@ -1,291 +0,0 @@
1510-/*****************************************************************************
1511- * bs.h :
1512- *****************************************************************************
1513- * Copyright (C) 2003-2008 x264 project
1514- *
1515- * Authors: Loren Merritt <lorenm@u.washington.edu>
1516- * Jason Garrett-Glaser <darkshikari@gmail.com>
1517- * Laurent Aimar <fenrir@via.ecp.fr>
1518- *
1519- * This program is free software; you can redistribute it and/or modify
1520- * it under the terms of the GNU General Public License as published by
1521- * the Free Software Foundation; either version 2 of the License, or
1522- * (at your option) any later version.
1523- *
1524- * This program is distributed in the hope that it will be useful,
1525- * but WITHOUT ANY WARRANTY; without even the implied warranty of
1526- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1527- * GNU General Public License for more details.
1528- *
1529- * You should have received a copy of the GNU General Public License
1530- * along with this program; if not, write to the Free Software
1531- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
1532- *****************************************************************************/
1533-
1534-#ifndef X264_BS_H
1535-#define X264_BS_H
1536-
1537-typedef struct
1538-{
1539- uint8_t i_bits;
1540- uint8_t i_size;
1541-} vlc_t;
1542-
1543-typedef struct
1544-{
1545- uint16_t i_bits;
1546- uint8_t i_size;
1547- /* Next level table to use */
1548- uint8_t i_next;
1549-} vlc_large_t;
1550-
1551-typedef struct bs_s
1552-{
1553- uint8_t *p_start;
1554- uint8_t *p;
1555- uint8_t *p_end;
1556-
1557- intptr_t cur_bits;
1558- int i_left; /* i_count number of available bits */
1559- int i_bits_encoded; /* RD only */
1560-} bs_t;
1561-
1562-typedef struct
1563-{
1564- int last;
1565- int16_t level[16];
1566- uint8_t run[16];
1567-} x264_run_level_t;
1568-
1569-extern const vlc_t x264_coeff0_token[5];
1570-extern const vlc_t x264_coeff_token[5][16][4];
1571-extern const vlc_t x264_total_zeros[15][16];
1572-extern const vlc_t x264_total_zeros_dc[3][4];
1573-extern const vlc_t x264_run_before[7][16];
1574-
1575-/* A larger level table size theoretically could help a bit at extremely
1576- * high bitrates, but the cost in cache is usually too high for it to be
1577- * useful.
1578- * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
1579- * FIXME: Do further testing? */
1580-#define LEVEL_TABLE_SIZE 128
1581-extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
1582-
1583-static inline void bs_init( bs_t *s, void *p_data, int i_data )
1584-{
1585- int offset = ((intptr_t)p_data & 3);
1586- s->p = s->p_start = (uint8_t*)p_data - offset;
1587- s->p_end = (uint8_t*)p_data + i_data;
1588- s->i_left = (WORD_SIZE - offset)*8;
1589- s->cur_bits = endian_fix32( M32(s->p) );
1590- s->cur_bits >>= (4-offset)*8;
1591-}
1592-static inline int bs_pos( bs_t *s )
1593-{
1594- return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
1595-}
1596-
1597-/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
1598-static inline void bs_flush( bs_t *s )
1599-{
1600- M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
1601- s->p += WORD_SIZE - s->i_left / 8;
1602- s->i_left = WORD_SIZE*8;
1603-}
1604-/* The inverse of bs_flush: prepare the bitstream to be written to again. */
1605-static inline void bs_realign( bs_t *s )
1606-{
1607- int offset = ((intptr_t)s->p & 3);
1608- if( offset )
1609- {
1610- s->p = (uint8_t*)s->p - offset;
1611- s->i_left = (WORD_SIZE - offset)*8;
1612- s->cur_bits = endian_fix32( M32(s->p) );
1613- s->cur_bits >>= (4-offset)*8;
1614- }
1615-}
1616-
1617-static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
1618-{
1619- if( WORD_SIZE == 8 )
1620- {
1621- s->cur_bits = (s->cur_bits << i_count) | i_bits;
1622- s->i_left -= i_count;
1623- if( s->i_left <= 32 )
1624- {
1625-#ifdef WORDS_BIGENDIAN
1626- M32( s->p ) = s->cur_bits >> (32 - s->i_left);
1627-#else
1628- M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
1629-#endif
1630- s->i_left += 32;
1631- s->p += 4;
1632- }
1633- }
1634- else
1635- {
1636- if( i_count < s->i_left )
1637- {
1638- s->cur_bits = (s->cur_bits << i_count) | i_bits;
1639- s->i_left -= i_count;
1640- }
1641- else
1642- {
1643- i_count -= s->i_left;
1644- s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
1645- M32( s->p ) = endian_fix( s->cur_bits );
1646- s->p += 4;
1647- s->cur_bits = i_bits;
1648- s->i_left = 32 - i_count;
1649- }
1650- }
1651-}
1652-
1653-/* Special case to eliminate branch in normal bs_write. */
1654-/* Golomb never writes an even-size code, so this is only used in slice headers. */
1655-static inline void bs_write32( bs_t *s, uint32_t i_bits )
1656-{
1657- bs_write( s, 16, i_bits >> 16 );
1658- bs_write( s, 16, i_bits );
1659-}
1660-
1661-static inline void bs_write1( bs_t *s, uint32_t i_bit )
1662-{
1663- s->cur_bits <<= 1;
1664- s->cur_bits |= i_bit;
1665- s->i_left--;
1666- if( s->i_left == WORD_SIZE*8-32 )
1667- {
1668- M32( s->p ) = endian_fix32( s->cur_bits );
1669- s->p += 4;
1670- s->i_left = WORD_SIZE*8;
1671- }
1672-}
1673-
1674-static inline void bs_align_0( bs_t *s )
1675-{
1676- bs_write( s, s->i_left&7, 0 );
1677- bs_flush( s );
1678-}
1679-static inline void bs_align_1( bs_t *s )
1680-{
1681- bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 );
1682- bs_flush( s );
1683-}
1684-static inline void bs_align_10( bs_t *s )
1685-{
1686- if( s->i_left&7 )
1687- bs_write( s, s->i_left&7, 1 << ( (s->i_left&7) - 1 ) );
1688-}
1689-
1690-/* golomb functions */
1691-
1692-static const uint8_t x264_ue_size_tab[256] =
1693-{
1694- 1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
1695- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
1696- 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
1697- 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
1698- 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1699- 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1700- 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1701- 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1702- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1703- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1704- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1705- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1706- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1707- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1708- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1709- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1710-};
1711-
1712-static inline void bs_write_ue_big( bs_t *s, unsigned int val )
1713-{
1714- int size = 0;
1715- int tmp = ++val;
1716- if( tmp >= 0x10000 )
1717- {
1718- size = 32;
1719- tmp >>= 16;
1720- }
1721- if( tmp >= 0x100 )
1722- {
1723- size += 16;
1724- tmp >>= 8;
1725- }
1726- size += x264_ue_size_tab[tmp];
1727- bs_write( s, size>>1, 0 );
1728- bs_write( s, (size>>1)+1, val );
1729-}
1730-
1731-/* Only works on values under 255. */
1732-static inline void bs_write_ue( bs_t *s, int val )
1733-{
1734- bs_write( s, x264_ue_size_tab[val+1], val+1 );
1735-}
1736-
1737-static inline void bs_write_se( bs_t *s, int val )
1738-{
1739- int size = 0;
1740- /* Faster than (val <= 0 ? -val*2+1 : val*2) */
1741- /* 4 instructions on x86, 3 on ARM */
1742- int tmp = 1 - val*2;
1743- if( tmp < 0 ) tmp = val*2;
1744- val = tmp;
1745-
1746- if( tmp >= 0x100 )
1747- {
1748- size = 16;
1749- tmp >>= 8;
1750- }
1751- size += x264_ue_size_tab[tmp];
1752- bs_write( s, size, val );
1753-}
1754-
1755-static inline void bs_write_te( bs_t *s, int x, int val )
1756-{
1757- if( x == 1 )
1758- bs_write1( s, 1^val );
1759- else //if( x > 1 )
1760- bs_write_ue( s, val );
1761-}
1762-
1763-static inline void bs_rbsp_trailing( bs_t *s )
1764-{
1765- bs_write1( s, 1 );
1766- bs_write( s, s->i_left&7, 0 );
1767-}
1768-
1769-static ALWAYS_INLINE int bs_size_ue( unsigned int val )
1770-{
1771- return x264_ue_size_tab[val+1];
1772-}
1773-
1774-static ALWAYS_INLINE int bs_size_ue_big( unsigned int val )
1775-{
1776- if( val < 255 )
1777- return x264_ue_size_tab[val+1];
1778- else
1779- return x264_ue_size_tab[(val+1)>>8] + 16;
1780-}
1781-
1782-static ALWAYS_INLINE int bs_size_se( int val )
1783-{
1784- int tmp = 1 - val*2;
1785- if( tmp < 0 ) tmp = val*2;
1786- if( tmp < 256 )
1787- return x264_ue_size_tab[tmp];
1788- else
1789- return x264_ue_size_tab[tmp>>8]+16;
1790-}
1791-
1792-static ALWAYS_INLINE int bs_size_te( int x, int val )
1793-{
1794- if( x == 1 )
1795- return 1;
1796- else //if( x > 1 )
1797- return x264_ue_size_tab[val+1];
1798-}
1799-
1800-#endif
1801diff --git a/common/common.c b/common/common.c
1802index fccf2b0..2458f65 100644
1803--- a/common/common.c
1804+++ b/common/common.c
1805@@ -1027,60 +1027,6 @@ void x264_picture_clean( x264_picture_t *pic )
1806 }
1807
1808 /****************************************************************************
1809- * x264_nal_encode:
1810- ****************************************************************************/
1811-int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode )
1812-{
1813- uint8_t *src = nal->p_payload;
1814- uint8_t *end = nal->p_payload + nal->i_payload;
1815- uint8_t *orig_dst = dst;
1816- int i_count = 0, size;
1817-
1818- if( b_annexb )
1819- {
1820- if( b_long_startcode )
1821- *dst++ = 0x00;
1822- *dst++ = 0x00;
1823- *dst++ = 0x00;
1824- *dst++ = 0x01;
1825- }
1826- else /* save room for size later */
1827- dst += 4;
1828-
1829- /* nal header */
1830- *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
1831-
1832- while( src < end )
1833- {
1834- if( i_count == 2 && *src <= 0x03 )
1835- {
1836- *dst++ = 0x03;
1837- i_count = 0;
1838- }
1839- if( *src == 0 )
1840- i_count++;
1841- else
1842- i_count = 0;
1843- *dst++ = *src++;
1844- }
1845- size = (dst - orig_dst) - 4;
1846-
1847- /* Write the size header for mp4/etc */
1848- if( !b_annexb )
1849- {
1850- /* Size doesn't include the size of the header we're writing now. */
1851- orig_dst[0] = size>>24;
1852- orig_dst[1] = size>>16;
1853- orig_dst[2] = size>> 8;
1854- orig_dst[3] = size>> 0;
1855- }
1856-
1857- return size+4;
1858-}
1859-
1860-
1861-
1862-/****************************************************************************
1863 * x264_malloc:
1864 ****************************************************************************/
1865 void *x264_malloc( int i_size )
1866diff --git a/common/common.h b/common/common.h
1867index 539ea65..93712fe 100644
1868--- a/common/common.h
1869+++ b/common/common.h
1870@@ -137,7 +137,7 @@ static const int x264_scan8[16+2*4+3] =
1871 */
1872
1873 #include "x264.h"
1874-#include "bs.h"
1875+#include "bitstream.h"
1876 #include "set.h"
1877 #include "predict.h"
1878 #include "pixel.h"
1879@@ -166,8 +166,6 @@ int64_t x264_mdate( void );
1880 * the encoding options */
1881 char *x264_param2string( x264_param_t *p, int b_res );
1882
1883-int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode );
1884-
1885 /* log */
1886 void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
1887
1888@@ -796,6 +794,7 @@ struct x264_t
1889 x264_zigzag_function_t zigzagf;
1890 x264_quant_function_t quantf;
1891 x264_deblock_function_t loopf;
1892+ x264_bitstream_function_t bsf;
1893
1894 #ifdef HAVE_VISUALIZE
1895 struct visualize_t *visualize;
1896diff --git a/common/x86/bitstream-a.asm b/common/x86/bitstream-a.asm
1897new file mode 100644
1898index 0000000..1fb4cea
1899--- /dev/null
1900+++ b/common/x86/bitstream-a.asm
1901@@ -0,0 +1,112 @@
1902+;*****************************************************************************
1903+;* bitstream-a.asm: h264 encoder library
1904+;*****************************************************************************
1905+;* Copyright (C) 2010 x264 project
1906+;*
1907+;* Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
1908+;*
1909+;* This program is free software; you can redistribute it and/or modify
1910+;* it under the terms of the GNU General Public License as published by
1911+;* the Free Software Foundation; either version 2 of the License, or
1912+;* (at your option) any later version.
1913+;*
1914+;* This program is distributed in the hope that it will be useful,
1915+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
1916+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1917+;* GNU General Public License for more details.
1918+;*
1919+;* You should have received a copy of the GNU General Public License
1920+;* along with this program; if not, write to the Free Software
1921+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
1922+;*****************************************************************************
1923+
1924+%include "x86inc.asm"
1925+%include "x86util.asm"
1926+
1927+SECTION .text
1928+
1929+;-----------------------------------------------------------------------------
1930+; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
1931+;-----------------------------------------------------------------------------
1932+
1933+%macro NAL_LOOP 2
1934+ALIGN 16
1935+%1:
1936+ mova m0, [r1+r2]
1937+ mova m1, m0
1938+%if mmsize == 8
1939+ psrlq m0, 8
1940+%else
1941+ psrldq m0, 1
1942+%endif
1943+ %2 [r0+r1], m1
1944+ por m1, m0
1945+ pcmpeqb m1, m2
1946+ pmovmskb r3d, m1
1947+ test r3d, r3d
1948+ jnz .escape
1949+ add r1, mmsize
1950+ jl %1
1951+%endmacro
1952+
1953+%macro NAL_ESCAPE 1
1954+
1955+cglobal nal_escape_%1, 3,5
1956+ pxor m2, m2
1957+ sub r1, r2 ; r1 = offset of current src pointer from end of src
1958+ sub r0, r1 ; r0 = projected end of dst, assuming no more escapes
1959+
1960+ mov r3w, [r1+r2]
1961+ mov [r0+r1], r3w
1962+ add r1, 2
1963+ jge .ret
1964+
1965+ ; Start off by jumping into the escape loop in
1966+ ; case there's an escape at the start.
1967+ ; And do a few more in scalar until src is aligned again.
1968+ lea r4d, [r1+r2]
1969+ or r4d, -mmsize
1970+ neg r4d
1971+ jmp .escapeloop
1972+
1973+ NAL_LOOP .loop_aligned, mova
1974+%if mmsize==16
1975+ NAL_LOOP .loop_unaligned, movu
1976+%endif
1977+
1978+.ret:
1979+ movifnidn rax, r0
1980+ RET
1981+ALIGN 16
1982+.escape:
1983+ mov r4d, mmsize
1984+.escapeloop:
1985+ mov r3b, [r1+r2]
1986+ cmp r3b, 3
1987+ jna .escape_check
1988+.copy:
1989+ mov [r0+r1], r3b
1990+ inc r1
1991+ jge .ret
1992+ dec r4d
1993+ jg .escapeloop
1994+ cmp byte [r1+r2-1], 0 ; Don't go back to the main loop until we're out of a zero-run.
1995+ jz .escape
1996+%if mmsize==16
1997+ lea r4d, [r0+r1]
1998+ test r4d, mmsize-1
1999+ jnz .loop_unaligned
2000+%endif
2001+ jmp .loop_aligned
2002+.escape_check:
2003+ cmp word [r0+r1-2], 0
2004+ jnz .copy
2005+ mov byte [r0+r1], 3
2006+ inc r0
2007+ jmp .copy
2008+%endmacro
2009+
2010+INIT_MMX
2011+NAL_ESCAPE mmxext
2012+INIT_XMM
2013+NAL_ESCAPE sse2
2014diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
2015index aedd688..3a31e26 100644
2016--- a/common/x86/deblock-a.asm
2017+++ b/common/x86/deblock-a.asm
2018@@ -4,6 +4,7 @@
2019 ;* Copyright (C) 2005-2008 x264 project
2020 ;*
2021 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
2022+;* Jason Garrett-Glaser <darkshikari@gmail.com>
2023 ;*
2024 ;* This program is free software; you can redistribute it and/or modify
2025 ;* it under the terms of the GNU General Public License as published by
2026diff --git a/encoder/encoder.c b/encoder/encoder.c
2027index 7717ea8..2f9e7f6 100644
2028--- a/encoder/encoder.c
2029+++ b/encoder/encoder.c
2030@@ -987,6 +987,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
2031 x264_mc_init( h->param.cpu, &h->mc );
2032 x264_quant_init( h, h->param.cpu, &h->quantf );
2033 x264_deblock_init( h->param.cpu, &h->loopf );
2034+ x264_bitstream_init( h->param.cpu, &h->bsf );
2035 x264_dct_init_weights();
2036
2037 mbcmp_init( h );
2038@@ -1273,7 +1274,7 @@ static int x264_encoder_encapsulate_nals( x264_t *h, int start )
2039 for( int i = start; i < h->out.i_nal; i++ )
2040 {
2041 int long_startcode = !i || h->out.nal[i].i_type == NAL_SPS || h->out.nal[i].i_type == NAL_PPS;
2042- int size = x264_nal_encode( nal_buffer, &h->out.nal[i], h->param.b_annexb, long_startcode );
2043+ int size = x264_nal_encode( h, nal_buffer, &h->out.nal[i], long_startcode );
2044 h->out.nal[i].i_payload = size;
2045 h->out.nal[i].p_payload = nal_buffer;
2046 nal_buffer += size;
2047diff --git a/tools/checkasm.c b/tools/checkasm.c
2048index a0a9d54..ea6f209 100644
2049--- a/tools/checkasm.c
2050+++ b/tools/checkasm.c
2051@@ -1661,6 +1661,55 @@ static int check_cabac( int cpu_ref, int cpu_new )
2052 return ret;
2053 }
2054
2055+static int check_bitstream( int cpu_ref, int cpu_new )
2056+{
2057+ x264_bitstream_function_t bs_c;
2058+ x264_bitstream_function_t bs_ref;
2059+ x264_bitstream_function_t bs_a;
2060+
2061+ int ret = 0, ok = 1, used_asm = 0;
2062+
2063+ x264_bitstream_init( 0, &bs_c );
2064+ x264_bitstream_init( cpu_ref, &bs_ref );
2065+ x264_bitstream_init( cpu_new, &bs_a );
2066+ if( bs_a.nal_escape != bs_ref.nal_escape )
2067+ {
2068+ int size = 0x4000;
2069+ uint8_t *input = malloc(size+100);
2070+ uint8_t *output1 = malloc(size*2);
2071+ uint8_t *output2 = malloc(size*2);
2072+ used_asm = 1;
2073+ set_func_name( "nal_escape" );
2074+ for( int i = 0; i < 100; i++ )
2075+ {
2076+ /* Test corner-case sizes */
2077+ int test_size = i < 10 ? i+1 : rand() & 0x3fff;
2078+ for( int j = 0; j < test_size; j++ )
2079+ input[j] = (rand()&1) * rand();
2080+ uint8_t *end_c = (uint8_t*)call_c1( bs_c.nal_escape, output1, input, input+test_size );
2081+ uint8_t *end_a = (uint8_t*)call_a1( bs_a.nal_escape, output2, input, input+test_size );
2082+ int size_c = end_c-output1;
2083+ int size_a = end_a-output2;
2084+ if( size_c != size_a || memcmp( output1, output2, size_c ) )
2085+ {
2086+ fprintf( stderr, "nal_escape : [FAILED] %d %d\n", size_c, size_a );
2087+ ok = 0;
2088+ break;
2089+ }
2090+ }
2091+ for( int j = 0; j < size; j++ )
2092+ input[j] = rand();
2093+ call_c2( bs_c.nal_escape, output1, input, input+size );
2094+ call_a2( bs_a.nal_escape, output2, input, input+size );
2095+ free(input);
2096+ free(output1);
2097+ free(output2);
2098+ }
2099+ report( "nal escape:" );
2100+
2101+ return ret;
2102+}
2103+
2104 static int check_all_funcs( int cpu_ref, int cpu_new )
2105 {
2106 return check_pixel( cpu_ref, cpu_new )
2107@@ -1669,7 +1718,8 @@ static int check_all_funcs( int cpu_ref, int cpu_new )
2108 + check_intra( cpu_ref, cpu_new )
2109 + check_deblock( cpu_ref, cpu_new )
2110 + check_quant( cpu_ref, cpu_new )
2111- + check_cabac( cpu_ref, cpu_new );
2112+ + check_cabac( cpu_ref, cpu_new )
2113+ + check_bitstream( cpu_ref, cpu_new );
2114 }
2115
2116 static int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
2117--
21181.7.0.4
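
The Exp-Golomb writers kept in the new bitstream.h above (bs_write_ue, bs_write_ue_big, bs_write_se) all lean on x264_ue_size_tab. As a reading aid, here is a minimal stand-alone sketch of the same unsigned/signed mappings in plain C; ue_size and se_to_ue are illustrative names, not x264 functions.

    #include <stdio.h>

    /* Bits in the Exp-Golomb code for codeNum = val:
     * 2*floor(log2(val+1)) + 1, the quantity x264_ue_size_tab caches for val+1 < 256. */
    static int ue_size( unsigned val )
    {
        int bits = 1;
        for( unsigned tmp = val + 1; tmp > 1; tmp >>= 1 )
            bits += 2;
        return bits;
    }

    /* Signed-to-unsigned mapping done branchlessly by bs_write_se:
     * val > 0 maps to 2*val - 1, val <= 0 maps to -2*val. */
    static unsigned se_to_ue( int val )
    {
        return val > 0 ? 2*(unsigned)val - 1 : (unsigned)(-2*val);
    }

    int main( void )
    {
        for( int v = -3; v <= 3; v++ )
            printf( "se(%2d) -> codeNum %u, %d bits\n", v, se_to_ue(v), ue_size( se_to_ue(v) ) );
        return 0;
    }

Note that the "1 - val*2 / val*2" trick in bs_write_se computes codeNum + 1 directly, which is why it can index x264_ue_size_tab and call bs_write without a further increment.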
2119
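The two bs_write() calls in bs_write_ue_big spell out the structure of the code: size>>1 leading zero bits, then the (size>>1)+1 significant bits of val+1. That keeps each call at or below the 32 bits bs_write() accepts, even though the full code for a 32-bit value can be up to 63 bits long. A small illustrative printer (print_ue is hypothetical, not x264 code) showing the resulting bit layout:

    #include <stdio.h>

    /* Print the bit pattern bs_write_ue_big produces:
     * (size>>1) zeros, then val+1 in (size>>1)+1 bits, MSB first. */
    static void print_ue( unsigned val )
    {
        unsigned v = val + 1;
        int size = 1;
        for( unsigned tmp = v; tmp > 1; tmp >>= 1 )
            size += 2;                                   /* total code length */
        printf( "ue(%u) = ", val );
        for( int i = size - 1; i >= 0; i-- )
            putchar( i > (size >> 1) ? '0' : '0' + ((v >> i) & 1) );
        putchar( '\n' );
    }

    int main( void )
    {
        print_ue( 0 );    /* 1                 */
        print_ue( 5 );    /* 00110             */
        print_ue( 300 );  /* 00000000100101101 */
        return 0;
    }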
2120
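The patch also moves NAL escaping out of x264_nal_encode (deleted from common/common.c above) into a CPU-dispatched nal_escape, with the MMX/SSE2 versions in bitstream-a.asm validated against the C version by the new check_bitstream() test. The rule itself is unchanged: whenever two consecutive zero bytes would be followed by a byte <= 0x03, an emulation-prevention 0x03 byte is inserted. Below is a plain-C reference with the same return convention as the pointers compared in checkasm (nal_escape_ref is a hypothetical name; the destination is assumed large enough, worst case 3/2 of the input).

    #include <stdint.h>

    /* Scalar reference for NAL escaping, mirroring the loop deleted from
     * x264_nal_encode above.  Returns one past the last byte written. */
    static uint8_t *nal_escape_ref( uint8_t *dst, uint8_t *src, uint8_t *end )
    {
        int zero_run = 0;
        while( src < end )
        {
            if( zero_run == 2 && *src <= 0x03 )
            {
                *dst++ = 0x03;       /* emulation prevention byte */
                zero_run = 0;
            }
            zero_run = (*src == 0) ? zero_run + 1 : 0;
            *dst++ = *src++;
        }
        return dst;
    }

Feeding random buffers through a reference like this and through the asm version, then comparing lengths and contents, is exactly what check_bitstream() above does.
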
2121From 9efc381b344f784285e10cf6a836f9efdf1035b8 Mon Sep 17 00:00:00 2001
2122From: Jason Garrett-Glaser <darkshikari@gmail.com>
2123Date: Fri, 28 May 2010 14:27:22 -0700
2124Subject: [PATCH 11/11] Add API tool to apply arbitrary quantizer offsets
2125 The calling application can now pass a "map" of quantizer offsets to apply to each frame.
2126 An optional callback to free the map can also be included.
2127 This allows flexible region-of-interest coding and similar applications.
2128
2129---
2130 common/common.c | 2 +-
2131 encoder/encoder.c | 7 +++++--
2132 encoder/ratecontrol.c | 36 +++++++++++++++++++++++++-----------
2133 encoder/ratecontrol.h | 4 ++--
2134 x264.h | 20 +++++++++++++++++++-
2135 5 files changed, 52 insertions(+), 17 deletions(-)
2136
2137diff --git a/common/common.c b/common/common.c
2138index 2458f65..48e1bbc 100644
2139--- a/common/common.c
2140+++ b/common/common.c
2141@@ -998,6 +998,7 @@ static void x264_log_default( void *p_unused, int i_level, const char *psz_fmt,
2142 ****************************************************************************/
2143 int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
2144 {
2145+ memset( pic, 0, sizeof( x264_picture_t ) );
2146 pic->i_type = X264_TYPE_AUTO;
2147 pic->i_qpplus1 = 0;
2148 pic->img.i_csp = i_csp;
2149@@ -1010,7 +1011,6 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh
2150 pic->img.i_stride[0] = i_width;
2151 pic->img.i_stride[1] = i_width / 2;
2152 pic->img.i_stride[2] = i_width / 2;
2153- pic->param = NULL;
2154 pic->i_pic_struct = PIC_STRUCT_AUTO;
2155 return 0;
2156 }
2157diff --git a/encoder/encoder.c b/encoder/encoder.c
2158index 2f9e7f6..89107a3 100644
2159--- a/encoder/encoder.c
2160+++ b/encoder/encoder.c
2161@@ -2250,11 +2250,14 @@ int x264_encoder_encode( x264_t *h,
2162
2163 if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
2164 {
2165- if( x264_macroblock_tree_read( h, fenc ) )
2166+ if( x264_macroblock_tree_read( h, fenc, pic_in->prop.quant_offsets ) )
2167 return -1;
2168 }
2169 else
2170- x264_adaptive_quant_frame( h, fenc );
2171+ x264_adaptive_quant_frame( h, fenc, pic_in->prop.quant_offsets );
2172+
2173+ if( pic_in->prop.quant_offsets_free )
2174+ pic_in->prop.quant_offsets_free( pic_in->prop.quant_offsets );
2175
2176 if( h->frames.b_have_lowres )
2177 x264_frame_init_lowres( h, fenc );
2178diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
2179index bf0a400..d09de98 100644
2180--- a/encoder/ratecontrol.c
2181+++ b/encoder/ratecontrol.c
2182@@ -235,7 +235,7 @@ static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_
2183 return var;
2184 }
2185
2186-void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
2187+void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets )
2188 {
2189 /* constants chosen to result in approximately the same overall bitrate as without AQ.
2190 * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
2191@@ -256,11 +256,22 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
2192 /* Need to init it anyways for MB tree */
2193 if( h->param.rc.f_aq_strength == 0 )
2194 {
2195- memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
2196- memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
2197- if( h->frames.b_have_lowres )
2198+ if( quant_offsets )
2199+ {
2200 for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
2201- frame->i_inv_qscale_factor[mb_xy] = 256;
2202+ frame->f_qp_offset[mb_xy] = frame->f_qp_offset_aq[mb_xy] = quant_offsets[mb_xy];
2203+ if( h->frames.b_have_lowres )
2204+ for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
2205+ frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8( frame->f_qp_offset[mb_xy] );
2206+ }
2207+ else
2208+ {
2209+ memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
2210+ memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
2211+ if( h->frames.b_have_lowres )
2212+ for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
2213+ frame->i_inv_qscale_factor[mb_xy] = 256;
2214+ }
2215 }
2216 /* Need variance data for weighted prediction */
2217 if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
2218@@ -299,9 +310,10 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
2219 for( int mb_x = 0; mb_x < width; mb_x++ )
2220 {
2221 float qp_adj;
2222+ int mb_xy = mb_x + mb_y*h->mb.i_mb_stride;
2223 if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
2224 {
2225- qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
2226+ qp_adj = frame->f_qp_offset[mb_xy];
2227 qp_adj = strength * (qp_adj - avg_adj);
2228 }
2229 else
2230@@ -309,10 +321,12 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
2231 uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
2232 qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
2233 }
2234- frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
2235- frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
2236+ if( quant_offsets )
2237+ qp_adj += quant_offsets[mb_xy];
2238+ frame->f_qp_offset[mb_xy] =
2239+ frame->f_qp_offset_aq[mb_xy] = qp_adj;
2240 if( h->frames.b_have_lowres )
2241- frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
2242+ frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8(qp_adj);
2243 }
2244 }
2245
2246@@ -327,7 +341,7 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
2247 }
2248 }
2249
2250-int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
2251+int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets )
2252 {
2253 x264_ratecontrol_t *rc = h->rc;
2254 uint8_t i_type_actual = rc->entry[frame->i_frame].pict_type;
2255@@ -363,7 +377,7 @@ int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
2256 rc->qpbuf_pos--;
2257 }
2258 else
2259- x264_adaptive_quant_frame( h, frame );
2260+ x264_adaptive_quant_frame( h, frame, quant_offsets );
2261 return 0;
2262 fail:
2263 x264_log(h, X264_LOG_ERROR, "Incomplete MB-tree stats file.\n");
2264diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h
2265index e052b2a..dd139eb 100644
2266--- a/encoder/ratecontrol.h
2267+++ b/encoder/ratecontrol.h
2268@@ -29,8 +29,8 @@ void x264_ratecontrol_delete( x264_t * );
2269
2270 void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init );
2271
2272-void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame );
2273-int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame );
2274+void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets );
2275+int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets );
2276 int x264_reference_build_list_optimal( x264_t *h );
2277 void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
2278 void x264_ratecontrol_start( x264_t *, int i_force_qp, int overhead );
2279diff --git a/x264.h b/x264.h
2280index 95efd88..a4b3400 100644
2281--- a/x264.h
2282+++ b/x264.h
2283@@ -35,7 +35,7 @@
2284
2285 #include <stdarg.h>
2286
2287-#define X264_BUILD 96
2288+#define X264_BUILD 97
2289
2290 /* x264_t:
2291 * opaque handler for encoder */
2292@@ -508,6 +508,22 @@ typedef struct
2293
2294 typedef struct
2295 {
2296+ /* In: an array of quantizer offsets to be applied to this image during encoding.
2297+ * These are added on top of the decisions made by x264.
2298+ * Offsets can be fractional; they are added before QPs are rounded to integer.
2299+ * Adaptive quantization must be enabled to use this feature. Behavior if quant
2300+ * offsets differ between encoding passes is undefined.
2301+ *
2302+ * Array contains one offset per macroblock, in raster scan order. In interlaced
2303+ * mode, top-field MBs and bottom-field MBs are interleaved at the row level. */
2304+ float *quant_offsets;
2305+ /* In: optional callback to free quant_offsets when used.
2306+ * Useful if one wants to use a different quant_offset array for each frame. */
2307+ void (*quant_offsets_free)( void* );
2308+} x264_image_properties_t;
2309+
2310+typedef struct
2311+{
2312 /* In: force picture type (if not auto)
2313 * If x264 encoding parameters are violated in the forcing of picture types,
2314 * x264 will correct the input picture type and log a warning.
2315@@ -537,6 +553,8 @@ typedef struct
2316 x264_param_t *param;
2317 /* In: raw data */
2318 x264_image_t img;
2319+ /* In: optional information to modify encoder decisions for this frame */
2320+ x264_image_properties_t prop;
2321 /* Out: HRD timing information. Output only when i_nal_hrd is set. */
2322 x264_hrd_t hrd_timing;
2323 /* private user data. libx264 doesn't touch this,
2324--
23251.7.0.4
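
Finally, a minimal sketch of how a caller might use the quant_offsets hook added by the last patch: build one float offset per macroblock in raster order, attach it to the picture, and let the encoder release it through the callback. Only pic.prop.quant_offsets and pic.prop.quant_offsets_free come from the patch; make_roi_map, the frame dimensions and the -2.0 QP bonus for the centre region are illustrative choices.

    #include <stdlib.h>
    #include <x264.h>

    /* Build a per-macroblock QP offset map (raster order, one float per MB)
     * that gives the centre of the frame roughly 2 QP better quality.
     * Negative offsets lower the QP; hypothetical helper for illustration. */
    static float *make_roi_map( int width, int height )
    {
        int mb_w = (width + 15) / 16, mb_h = (height + 15) / 16;
        float *map = malloc( mb_w * mb_h * sizeof(float) );
        if( !map )
            return NULL;
        for( int y = 0; y < mb_h; y++ )
            for( int x = 0; x < mb_w; x++ )
            {
                int centre = x >= mb_w/4 && x < 3*mb_w/4 && y >= mb_h/4 && y < 3*mb_h/4;
                map[y*mb_w + x] = centre ? -2.0f : 0.0f;
            }
        return map;
    }

    /* Per frame, before x264_encoder_encode():
     *     pic.prop.quant_offsets      = make_roi_map( width, height );
     *     pic.prop.quant_offsets_free = free;   // encoder calls this once the offsets are consumed
     * As the header comment notes, adaptive quantization must be enabled for
     * the offsets to take effect. */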