1From 6ec28a907abee4ebb86c68e404cfe20483e1a128 Mon Sep 17 00:00:00 2001
2From: Jason Garrett-Glaser <darkshikari@gmail.com>
3Date: Wed, 26 May 2010 12:55:35 -0700
4Subject: [PATCH 1/8] Merge some of adaptive quant and weightp
5 Eliminate redundant work; both of them were calculating the variance of the frame.
6
7---
8 common/frame.h | 4 +-
9 encoder/analyse.h | 1 -
10 encoder/encoder.c | 12 ++---
11 encoder/ratecontrol.c | 124 +++++++++++++++++++++++++++++++-----------------
12 encoder/slicetype.c | 31 ++----------
13 5 files changed, 92 insertions(+), 80 deletions(-)
14
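Both halves of the merge rely on the same primitive: h->pixf.var[PIXEL_16x16] returns a packed value with the 16x16 block's pixel sum in the low 32 bits and its sum of squares in the high 32 bits. After this patch, x264_adaptive_quant_frame() accumulates those per plane as a side effect of the AC-energy pass and removes the mean at the end, so weightp no longer needs its own pass over the luma plane. A minimal plain-C sketch of that arithmetic follows; var_16x16() is a scalar stand-in for the SIMD primitive, and width/height are assumed to be padded to multiples of 16 as x264 frames are.

    #include <stdint.h>

    /* Scalar stand-in for h->pixf.var[PIXEL_16x16]: pixel sum in the low 32 bits,
     * sum of squares in the high 32 bits (the packing ac_energy_plane() unpacks). */
    uint64_t var_16x16( const uint8_t *pix, int stride )
    {
        uint32_t sum = 0, ssd = 0;
        for( int y = 0; y < 16; y++ )
            for( int x = 0; x < 16; x++ )
            {
                uint32_t v = pix[y*stride + x];
                sum += v;
                ssd += v*v;
            }
        return ((uint64_t)ssd << 32) | sum;
    }

    /* Accumulate sum/SSD for a plane while visiting each 16x16 block once (the
     * merged loop does this as a side effect of computing AQ energies), then
     * subtract the mean term as in the patch's "Remove mean from SSD calculation"
     * loop. Assumes width/height are multiples of 16. */
    uint64_t plane_variance_sum( const uint8_t *plane, int stride, int width, int height )
    {
        uint64_t sum = 0, ssd = 0;
        for( int y = 0; y < height; y += 16 )
            for( int x = 0; x < width; x += 16 )
            {
                uint64_t res = var_16x16( plane + y*stride + x, stride );
                sum += (uint32_t)res;   /* pixel sum of the block */
                ssd += res >> 32;       /* sum of squares of the block */
            }
        /* N*variance = sum(x^2) - (sum(x))^2 / N, with rounding */
        return ssd - (sum * sum + (uint64_t)width * height / 2) / ((uint64_t)width * height);
    }

The updated x264_weights_analyse() then only reads sqrt(i_pixel_ssd[0]) and i_pixel_sum[0] divided by the pixel count, as in the slicetype.c hunk below.
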
15diff --git a/common/frame.h b/common/frame.h
16index 91d27b5..ca5cb7a 100644
17--- a/common/frame.h
18+++ b/common/frame.h
19@@ -118,8 +118,8 @@ typedef struct x264_frame
20 uint16_t *i_inv_qscale_factor;
21 int b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */
22 float f_weighted_cost_delta[X264_BFRAME_MAX+2];
23- uint32_t i_pixel_sum;
24- uint64_t i_pixel_ssd;
25+ uint32_t i_pixel_sum[3];
26+ uint64_t i_pixel_ssd[3];
27
28 /* hrd */
29 x264_hrd_t hrd_timing;
30diff --git a/encoder/analyse.h b/encoder/analyse.h
31index 7c2c22c..53e4c2e 100644
32--- a/encoder/analyse.h
33+++ b/encoder/analyse.h
34@@ -33,7 +33,6 @@ void x264_slicetype_decide( x264_t *h );
35 void x264_slicetype_analyse( x264_t *h, int keyframe );
36
37 int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
38-void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame );
39
40 int x264_lookahead_init( x264_t *h, int i_slicetype_length );
41 int x264_lookahead_is_empty( x264_t *h );
42diff --git a/encoder/encoder.c b/encoder/encoder.c
43index 52017ff..6e0dc54 100644
44--- a/encoder/encoder.c
45+++ b/encoder/encoder.c
46@@ -2246,21 +2246,17 @@ int x264_encoder_encode( x264_t *h,
47 fenc->i_pic_struct = PIC_STRUCT_PROGRESSIVE;
48 }
49
50- if( h->frames.b_have_lowres )
51- {
52- if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
53- x264_weight_plane_analyse( h, fenc );
54- x264_frame_init_lowres( h, fenc );
55- }
56-
57 if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
58 {
59 if( x264_macroblock_tree_read( h, fenc ) )
60 return -1;
61 }
62- else if( h->param.rc.i_aq_mode )
63+ else
64 x264_adaptive_quant_frame( h, fenc );
65
66+ if( h->frames.b_have_lowres )
67+ x264_frame_init_lowres( h, fenc );
68+
69 /* 2: Place the frame into the queue for its slice type decision */
70 x264_lookahead_put_frame( h, fenc );
71
72diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
73index a725a24..bf0a400 100644
74--- a/encoder/ratecontrol.c
75+++ b/encoder/ratecontrol.c
76@@ -215,12 +215,14 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
77 stride <<= h->mb.b_interlaced;
78 uint64_t res = h->pixf.var[pix]( frame->plane[i] + offset, stride );
79 uint32_t sum = (uint32_t)res;
80- uint32_t sqr = res >> 32;
81- return sqr - (sum * sum >> shift);
82+ uint32_t ssd = res >> 32;
83+ frame->i_pixel_sum[i] += sum;
84+ frame->i_pixel_ssd[i] += ssd;
85+ return ssd - (sum * sum >> shift);
86 }
87
88 // Find the total AC energy of the block in all planes.
89-static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
90+static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
91 {
92 /* This function contains annoying hacks because GCC has a habit of reordering emms
93 * and putting it after floating point ops. As a result, we put the emms at the end of the
94@@ -239,56 +241,90 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
95 * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
96 float strength;
97 float avg_adj = 0.f;
98- /* Need to init it anyways for MB tree. */
99- if( h->param.rc.f_aq_strength == 0 )
100- {
101- memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
102- memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
103- if( h->frames.b_have_lowres )
104- for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
105- frame->i_inv_qscale_factor[mb_xy] = 256;
106- return;
107+ int width = h->sps->i_mb_width;
108+ int height = h->sps->i_mb_height;
109+ /* Initialize frame stats */
110+ for( int i = 0; i < 3; i++ )
111+ {
112+ frame->i_pixel_sum[i] = 0;
113+ frame->i_pixel_ssd[i] = 0;
114 }
115
116- if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
117+ /* Degenerate cases */
118+ if( h->param.rc.i_aq_mode == X264_AQ_NONE || h->param.rc.f_aq_strength == 0 )
119 {
120- float avg_adj_pow2 = 0.f;
121- for( int mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
122- for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
123- {
124- uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
125- float qp_adj = powf( energy + 1, 0.125f );
126- frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
127- avg_adj += qp_adj;
128- avg_adj_pow2 += qp_adj * qp_adj;
129- }
130- avg_adj /= h->mb.i_mb_count;
131- avg_adj_pow2 /= h->mb.i_mb_count;
132- strength = h->param.rc.f_aq_strength * avg_adj;
133- avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
134+ /* Need to init it anyways for MB tree */
135+ if( h->param.rc.f_aq_strength == 0 )
136+ {
137+ memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
138+ memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
139+ if( h->frames.b_have_lowres )
140+ for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
141+ frame->i_inv_qscale_factor[mb_xy] = 256;
142+ }
143+ /* Need variance data for weighted prediction */
144+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
145+ {
146+ for( int mb_y = 0; mb_y < height; mb_y++ )
147+ for( int mb_x = 0; mb_x < width; mb_x++ )
148+ x264_ac_energy_mb( h, mb_x, mb_y, frame );
149+ }
150+ else
151+ return;
152 }
153+ /* Actual adaptive quantization */
154 else
155- strength = h->param.rc.f_aq_strength * 1.0397f;
156-
157- for( int mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
158- for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
159+ {
160+ if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
161 {
162- float qp_adj;
163- if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
164- {
165- qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
166- qp_adj = strength * (qp_adj - avg_adj);
167- }
168- else
169+ float avg_adj_pow2 = 0.f;
170+ for( int mb_y = 0; mb_y < height; mb_y++ )
171+ for( int mb_x = 0; mb_x < width; mb_x++ )
172+ {
173+ uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
174+ float qp_adj = powf( energy + 1, 0.125f );
175+ frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
176+ avg_adj += qp_adj;
177+ avg_adj_pow2 += qp_adj * qp_adj;
178+ }
179+ avg_adj /= h->mb.i_mb_count;
180+ avg_adj_pow2 /= h->mb.i_mb_count;
181+ strength = h->param.rc.f_aq_strength * avg_adj;
182+ avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
183+ }
184+ else
185+ strength = h->param.rc.f_aq_strength * 1.0397f;
186+
187+ for( int mb_y = 0; mb_y < height; mb_y++ )
188+ for( int mb_x = 0; mb_x < width; mb_x++ )
189 {
190- uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
191- qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
192+ float qp_adj;
193+ if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
194+ {
195+ qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
196+ qp_adj = strength * (qp_adj - avg_adj);
197+ }
198+ else
199+ {
200+ uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
201+ qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
202+ }
203+ frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
204+ frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
205+ if( h->frames.b_have_lowres )
206+ frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
207 }
208- frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
209- frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
210- if( h->frames.b_have_lowres )
211- frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
212- }
213+ }
214+
215+ /* Remove mean from SSD calculation */
216+ for( int i = 0; i < 3; i++ )
217+ {
218+ uint64_t ssd = frame->i_pixel_ssd[i];
219+ uint64_t sum = frame->i_pixel_sum[i];
220+ int w = width*16>>!!i;
221+ int h = height*16>>!!i;
222+ frame->i_pixel_ssd[i] = ssd - (sum * sum + w * h / 2) / (w * h);
223+ }
224 }
225
226 int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
227diff --git a/encoder/slicetype.c b/encoder/slicetype.c
228index 9352367..e454e12 100644
229--- a/encoder/slicetype.c
230+++ b/encoder/slicetype.c
231@@ -67,25 +67,6 @@ static void x264_weight_get_h264( unsigned int weight_nonh264, int offset, x264_
232 w->i_scale = X264_MIN( w->i_scale, 127 );
233 }
234
235-void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame )
236-{
237- uint32_t sad = 0;
238- uint64_t ssd = 0;
239- uint8_t *p = frame->plane[0];
240- int stride = frame->i_stride[0];
241- int width = frame->i_width[0];
242- int height = frame->i_lines[0];
243- for( int y = 0; y < height>>4; y++, p += stride*16 )
244- for( int x = 0; x < width; x += 16 )
245- {
246- uint64_t res = h->pixf.var[PIXEL_16x16]( p + x, stride );
247- sad += (uint32_t)res;
248- ssd += res >> 32;
249- }
250- frame->i_pixel_sum = sad;
251- frame->i_pixel_ssd = ssd - ((uint64_t)sad * sad + width * height / 2) / (width * height);
252-}
253-
254 static NOINLINE uint8_t *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, uint8_t *dest )
255 {
256 int ref0_distance = fenc->i_frame - ref->i_frame - 1;
257@@ -167,10 +148,10 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
258 int found;
259 x264_weight_t *weights = fenc->weight[0];
260
261- fenc_var = round( sqrt( fenc->i_pixel_ssd ) );
262- ref_var = round( sqrt( ref->i_pixel_ssd ) );
263- fenc_mean = (float)fenc->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
264- ref_mean = (float) ref->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
265+ fenc_var = round( sqrt( fenc->i_pixel_ssd[0] ) );
266+ ref_var = round( sqrt( ref->i_pixel_ssd[0] ) );
267+ fenc_mean = (float)fenc->i_pixel_sum[0] / (fenc->i_lines[0] * fenc->i_width[0]);
268+ ref_mean = (float) ref->i_pixel_sum[0] / (fenc->i_lines[0] * fenc->i_width[0]);
269
270 //early termination
271 if( fabs( ref_mean - fenc_mean ) < 0.5 && fabs( 1 - fenc_var / ref_var ) < epsilon )
272@@ -534,8 +515,8 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
273 do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
274 if( do_search[0] )
275 {
276- if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART
277- || h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
278+ if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART ||
279+ h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
280 {
281 x264_emms();
282 x264_weights_analyse( h, frames[b], frames[p0], 1 );
283--
2841.7.0.4
285
286
287From 3b40a04ffdd7b6d5a69b3c5dc29f1e727f314496 Mon Sep 17 00:00:00 2001
288From: Jason Garrett-Glaser <darkshikari@gmail.com>
289Date: Thu, 27 May 2010 10:42:15 -0700
290Subject: [PATCH 2/8] Add fast skip in lookahead motion search
291 Improves speed very significantly on motionless blocks.
292
293---
294 encoder/slicetype.c | 16 +++++++++++++++-
295 1 files changed, 15 insertions(+), 1 deletions(-)
296
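The shortcut in the hunk below works like this: when the MV predictor is (0,0), the lookahead compares the co-located lowres 8x8 block directly; if that cost is already under a small threshold (64), the zero MV is accepted and x264_me_search() plus the MV-cost adjustments are skipped. A rough sketch of the control flow, with simplified types and function pointers invented for illustration (they are not x264's actual x264_me_t or pixel-function signatures):

    #include <stdint.h>

    #define FENC_STRIDE 16

    /* Illustrative stand-ins for x264_me_t and h->pixf.mbcmp[PIXEL_8x8]. */
    typedef int (*cmp_8x8_fn)( const uint8_t *fenc, int fenc_stride,
                               const uint8_t *fref, int ref_stride );
    typedef struct
    {
        int16_t mv[2];   /* resulting motion vector */
        int16_t mvp[2];  /* motion vector predictor */
        int cost;
    } me_sketch_t;

    void lowres_me_with_fast_skip( me_sketch_t *m, cmp_8x8_fn mbcmp,
                                   const uint8_t *fenc, const uint8_t *fref,
                                   int ref_stride,
                                   void (*full_search)( me_sketch_t *m ) )
    {
        /* Only bother in the mv0 case: anything else is likely to have enough
         * residual not to trigger the skip (same reasoning as the patch comment). */
        if( !m->mvp[0] && !m->mvp[1] )
        {
            m->cost = mbcmp( fenc, FENC_STRIDE, fref, ref_stride );
            if( m->cost < 64 )          /* threshold used by the patch */
            {
                m->mv[0] = m->mv[1] = 0;
                return;                 /* skip the search and the MV-cost adjustments */
            }
        }
        full_search( m );               /* stand-in for x264_me_search() */
        m->cost -= 2;                   /* remove mvcost from skip MBs */
        if( m->mv[0] || m->mv[1] )
            m->cost += 5;
    }
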
297diff --git a/encoder/slicetype.c b/encoder/slicetype.c
298index e454e12..d7cfe5c 100644
299--- a/encoder/slicetype.c
300+++ b/encoder/slicetype.c
301@@ -379,11 +379,25 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
302 CP32( m[l].mvp, mvc[0] );
303 else
304 x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] );
305- x264_me_search( h, &m[l], mvc, i_mvc );
306
307+ /* Fast skip for cases of near-zero residual. Shortcut: don't bother except in the mv0 case,
308+ * since anything else is likely to have enough residual to not trigger the skip. */
309+ if( !M32( m[l].mvp ) )
310+ {
311+ m[l].cost = h->pixf.mbcmp[PIXEL_8x8]( m[l].p_fenc[0], FENC_STRIDE, m[l].p_fref[0], m[l].i_stride[0] );
312+ if( m[l].cost < 64 )
313+ {
314+ M32( m[l].mv ) = 0;
315+ goto skip_motionest;
316+ }
317+ }
318+
319+ x264_me_search( h, &m[l], mvc, i_mvc );
320 m[l].cost -= 2; // remove mvcost from skip mbs
321 if( M32( m[l].mv ) )
322 m[l].cost += 5;
323+
324+skip_motionest:
325 CP32( fenc_mvs[l], m[l].mv );
326 *fenc_costs[l] = m[l].cost;
327 }
328--
3291.7.0.4
330
331
332From 77ec5d11f0b22035f836f8451d568ecb3e1236e6 Mon Sep 17 00:00:00 2001
333From: Jason Garrett-Glaser <darkshikari@gmail.com>
334Date: Thu, 27 May 2010 12:31:41 -0700
335Subject: [PATCH 3/8] Fix omission in libx264 tuning documentation
336
337---
338 x264.h | 2 +-
339 1 files changed, 1 insertions(+), 1 deletions(-)
340
341diff --git a/x264.h b/x264.h
342index 6d7b703..95efd88 100644
343--- a/x264.h
344+++ b/x264.h
345@@ -446,7 +446,7 @@ static const char * const x264_tune_names[] = { "film", "animation", "grain", "s
346
347 /* Multiple tunings can be used if separated by a delimiter in ",./-+",
348 * however multiple psy tunings cannot be used.
349- * film, animation, grain, psnr, and ssim are psy tunings.
350+ * film, animation, grain, stillimage, psnr, and ssim are psy tunings.
351 *
352 * returns 0 on success, negative on failure (e.g. invalid preset/tune name). */
353 int x264_param_default_preset( x264_param_t *, const char *preset, const char *tune );
354--
3551.7.0.4
356
357
358From bec048110f55c197aeaa6aa506952ef071a2558d Mon Sep 17 00:00:00 2001
359From: Jason Garrett-Glaser <darkshikari@gmail.com>
360Date: Thu, 27 May 2010 14:27:32 -0700
361Subject: [PATCH 4/8] x86 assembly code for NAL escaping
362 Up to ~10x faster than C depending on CPU.
363 Helps the most at very high bitrates (e.g. lossless).
364 Also make the C code faster and simpler.
365
366---
367 Makefile | 4 +-
368 common/bitstream.c | 92 ++++++++++++++
369 common/bitstream.h | 299 ++++++++++++++++++++++++++++++++++++++++++++
370 common/bs.h | 291 ------------------------------------------
371 common/common.c | 54 --------
372 common/common.h | 5 +-
373 common/x86/bitstream-a.asm | 112 +++++++++++++++++
374 common/x86/deblock-a.asm | 1 +
375 encoder/encoder.c | 3 +-
376 tools/checkasm.c | 52 ++++++++-
377 10 files changed, 561 insertions(+), 352 deletions(-)
378 create mode 100644 common/bitstream.c
379 create mode 100644 common/bitstream.h
380 delete mode 100644 common/bs.h
381 create mode 100644 common/x86/bitstream-a.asm
382
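For context, the rule being accelerated: inside a NAL payload, any byte in the range 0x00-0x03 that follows two zero bytes must be preceded by an inserted 0x03 emulation-prevention byte, so start-code patterns (00 00 00 / 00 00 01) cannot appear in the stream. The simplified C fallback this patch adds implements that by checking the last two bytes already written to dst, which restarts the zero count automatically after an escape. The standalone sketch below repeats that logic; the const qualifiers and the small main() are additions for the example.

    #include <stdint.h>
    #include <stdio.h>

    /* Same logic as the patch's x264_nal_escape_c(): insert a 0x03 escape byte
     * before any byte <= 0x03 that follows two zero output bytes. */
    uint8_t *nal_escape( uint8_t *dst, const uint8_t *src, const uint8_t *end )
    {
        if( src < end ) *dst++ = *src++;
        if( src < end ) *dst++ = *src++;
        while( src < end )
        {
            if( src[0] <= 0x03 && !dst[-2] && !dst[-1] )
                *dst++ = 0x03;
            *dst++ = *src++;
        }
        return dst;
    }

    int main( void )
    {
        /* 00 00 01 inside a payload would look like a start code, so it becomes 00 00 03 01. */
        uint8_t in[]  = { 0x25, 0x00, 0x00, 0x01, 0x7f, 0x00, 0x00, 0x00 };
        uint8_t out[2*sizeof(in)];
        uint8_t *dst_end = nal_escape( out, in, in + sizeof(in) );
        for( uint8_t *p = out; p < dst_end; p++ )
            printf( "%02x ", *p );
        printf( "\n" ); /* expected: 25 00 00 03 01 7f 00 00 03 00 */
        return 0;
    }
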
383diff --git a/Makefile b/Makefile
384index 0b43a3e..519e181 100644
385--- a/Makefile
386+++ b/Makefile
387@@ -8,7 +8,7 @@ SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
388 common/frame.c common/dct.c common/cpu.c common/cabac.c \
389 common/common.c common/mdate.c common/rectangle.c \
390 common/set.c common/quant.c common/deblock.c common/vlc.c \
391- common/mvpred.c \
392+ common/mvpred.c common/bitstream.c \
393 encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
394 encoder/set.c encoder/macroblock.c encoder/cabac.c \
395 encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
396@@ -52,7 +52,7 @@ endif
397 ifneq ($(AS),)
398 X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \
399 mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
400- cpu-a.asm dct-32.asm
401+ cpu-a.asm dct-32.asm bitstream-a.asm
402 X86SRC = $(X86SRC0:%=common/x86/%)
403
404 ifeq ($(ARCH),X86)
405diff --git a/common/bitstream.c b/common/bitstream.c
406new file mode 100644
407index 0000000..0aaac21
408--- /dev/null
409+++ b/common/bitstream.c
410@@ -0,0 +1,92 @@
411+/*****************************************************************************
412+ * bitstream.c: h264 encoder library
413+ *****************************************************************************
414+ * Copyright (C) 2010 x264 project
415+ *
416+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
417+ * Jason Garrett-Glaser <darkshikari@gmail.com>
418+ *
419+ * This program is free software; you can redistribute it and/or modify
420+ * it under the terms of the GNU General Public License as published by
421+ * the Free Software Foundation; either version 2 of the License, or
422+ * (at your option) any later version.
423+ *
424+ * This program is distributed in the hope that it will be useful,
425+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
426+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
427+ * GNU General Public License for more details.
428+ *
429+ * You should have received a copy of the GNU General Public License
430+ * along with this program; if not, write to the Free Software
431+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
432+ *****************************************************************************/
433+
434+#include "common.h"
435+
436+static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
437+{
438+ if( src < end ) *dst++ = *src++;
439+ if( src < end ) *dst++ = *src++;
440+ while( src < end )
441+ {
442+ if( src[0] <= 0x03 && !dst[-2] && !dst[-1] )
443+ *dst++ = 0x03;
444+ *dst++ = *src++;
445+ }
446+ return dst;
447+}
448+
449+#ifdef HAVE_MMX
450+uint8_t *x264_nal_escape_mmxext( uint8_t *dst, uint8_t *src, uint8_t *end );
451+uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
452+#endif
453+
454+/****************************************************************************
455+ * x264_nal_encode:
456+ ****************************************************************************/
457+int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode )
458+{
459+ uint8_t *src = nal->p_payload;
460+ uint8_t *end = nal->p_payload + nal->i_payload;
461+ uint8_t *orig_dst = dst;
462+
463+ if( h->param.b_annexb )
464+ {
465+ if( b_long_startcode )
466+ *dst++ = 0x00;
467+ *dst++ = 0x00;
468+ *dst++ = 0x00;
469+ *dst++ = 0x01;
470+ }
471+ else /* save room for size later */
472+ dst += 4;
473+
474+ /* nal header */
475+ *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
476+
477+ dst = h->bsf.nal_escape( dst, src, end );
478+ int size = (dst - orig_dst) - 4;
479+
480+ /* Write the size header for mp4/etc */
481+ if( !h->param.b_annexb )
482+ {
483+ /* Size doesn't include the size of the header we're writing now. */
484+ orig_dst[0] = size>>24;
485+ orig_dst[1] = size>>16;
486+ orig_dst[2] = size>> 8;
487+ orig_dst[3] = size>> 0;
488+ }
489+
490+ return size+4;
491+}
492+
493+void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
494+{
495+ pf->nal_escape = x264_nal_escape_c;
496+#ifdef HAVE_MMX
497+ if( cpu&X264_CPU_MMXEXT )
498+ pf->nal_escape = x264_nal_escape_mmxext;
499+ if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
500+ pf->nal_escape = x264_nal_escape_sse2;
501+#endif
502+}
503diff --git a/common/bitstream.h b/common/bitstream.h
504new file mode 100644
505index 0000000..d018c7d
506--- /dev/null
507+++ b/common/bitstream.h
508@@ -0,0 +1,299 @@
509+/*****************************************************************************
510+ * bitstream.h: h264 encoder library
511+ *****************************************************************************
512+ * Copyright (C) 2003-2008 x264 project
513+ *
514+ * Authors: Loren Merritt <lorenm@u.washington.edu>
515+ * Jason Garrett-Glaser <darkshikari@gmail.com>
516+ * Laurent Aimar <fenrir@via.ecp.fr>
517+ *
518+ * This program is free software; you can redistribute it and/or modify
519+ * it under the terms of the GNU General Public License as published by
520+ * the Free Software Foundation; either version 2 of the License, or
521+ * (at your option) any later version.
522+ *
523+ * This program is distributed in the hope that it will be useful,
524+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
525+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
526+ * GNU General Public License for more details.
527+ *
528+ * You should have received a copy of the GNU General Public License
529+ * along with this program; if not, write to the Free Software
530+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
531+ *****************************************************************************/
532+
533+#ifndef X264_BS_H
534+#define X264_BS_H
535+
536+typedef struct
537+{
538+ uint8_t i_bits;
539+ uint8_t i_size;
540+} vlc_t;
541+
542+typedef struct
543+{
544+ uint16_t i_bits;
545+ uint8_t i_size;
546+ /* Next level table to use */
547+ uint8_t i_next;
548+} vlc_large_t;
549+
550+typedef struct bs_s
551+{
552+ uint8_t *p_start;
553+ uint8_t *p;
554+ uint8_t *p_end;
555+
556+ intptr_t cur_bits;
557+ int i_left; /* i_count number of available bits */
558+ int i_bits_encoded; /* RD only */
559+} bs_t;
560+
561+typedef struct
562+{
563+ int last;
564+ int16_t level[16];
565+ uint8_t run[16];
566+} x264_run_level_t;
567+
568+extern const vlc_t x264_coeff0_token[5];
569+extern const vlc_t x264_coeff_token[5][16][4];
570+extern const vlc_t x264_total_zeros[15][16];
571+extern const vlc_t x264_total_zeros_dc[3][4];
572+extern const vlc_t x264_run_before[7][16];
573+
574+typedef struct
575+{
576+ uint8_t *(*nal_escape) ( uint8_t *dst, uint8_t *src, uint8_t *end );
577+} x264_bitstream_function_t;
578+
579+int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode );
580+void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );
581+
582+/* A larger level table size theoretically could help a bit at extremely
583+ * high bitrates, but the cost in cache is usually too high for it to be
584+ * useful.
585+ * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
586+ * FIXME: Do further testing? */
587+#define LEVEL_TABLE_SIZE 128
588+extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
589+
590+static inline void bs_init( bs_t *s, void *p_data, int i_data )
591+{
592+ int offset = ((intptr_t)p_data & 3);
593+ s->p = s->p_start = (uint8_t*)p_data - offset;
594+ s->p_end = (uint8_t*)p_data + i_data;
595+ s->i_left = (WORD_SIZE - offset)*8;
596+ s->cur_bits = endian_fix32( M32(s->p) );
597+ s->cur_bits >>= (4-offset)*8;
598+}
599+static inline int bs_pos( bs_t *s )
600+{
601+ return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
602+}
603+
604+/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
605+static inline void bs_flush( bs_t *s )
606+{
607+ M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
608+ s->p += WORD_SIZE - s->i_left / 8;
609+ s->i_left = WORD_SIZE*8;
610+}
611+/* The inverse of bs_flush: prepare the bitstream to be written to again. */
612+static inline void bs_realign( bs_t *s )
613+{
614+ int offset = ((intptr_t)s->p & 3);
615+ if( offset )
616+ {
617+ s->p = (uint8_t*)s->p - offset;
618+ s->i_left = (WORD_SIZE - offset)*8;
619+ s->cur_bits = endian_fix32( M32(s->p) );
620+ s->cur_bits >>= (4-offset)*8;
621+ }
622+}
623+
624+static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
625+{
626+ if( WORD_SIZE == 8 )
627+ {
628+ s->cur_bits = (s->cur_bits << i_count) | i_bits;
629+ s->i_left -= i_count;
630+ if( s->i_left <= 32 )
631+ {
632+#ifdef WORDS_BIGENDIAN
633+ M32( s->p ) = s->cur_bits >> (32 - s->i_left);
634+#else
635+ M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
636+#endif
637+ s->i_left += 32;
638+ s->p += 4;
639+ }
640+ }
641+ else
642+ {
643+ if( i_count < s->i_left )
644+ {
645+ s->cur_bits = (s->cur_bits << i_count) | i_bits;
646+ s->i_left -= i_count;
647+ }
648+ else
649+ {
650+ i_count -= s->i_left;
651+ s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
652+ M32( s->p ) = endian_fix( s->cur_bits );
653+ s->p += 4;
654+ s->cur_bits = i_bits;
655+ s->i_left = 32 - i_count;
656+ }
657+ }
658+}
659+
660+/* Special case to eliminate branch in normal bs_write. */
661+/* Golomb never writes an even-size code, so this is only used in slice headers. */
662+static inline void bs_write32( bs_t *s, uint32_t i_bits )
663+{
664+ bs_write( s, 16, i_bits >> 16 );
665+ bs_write( s, 16, i_bits );
666+}
667+
668+static inline void bs_write1( bs_t *s, uint32_t i_bit )
669+{
670+ s->cur_bits <<= 1;
671+ s->cur_bits |= i_bit;
672+ s->i_left--;
673+ if( s->i_left == WORD_SIZE*8-32 )
674+ {
675+ M32( s->p ) = endian_fix32( s->cur_bits );
676+ s->p += 4;
677+ s->i_left = WORD_SIZE*8;
678+ }
679+}
680+
681+static inline void bs_align_0( bs_t *s )
682+{
683+ bs_write( s, s->i_left&7, 0 );
684+ bs_flush( s );
685+}
686+static inline void bs_align_1( bs_t *s )
687+{
688+ bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 );
689+ bs_flush( s );
690+}
691+static inline void bs_align_10( bs_t *s )
692+{
693+ if( s->i_left&7 )
694+ bs_write( s, s->i_left&7, 1 << ( (s->i_left&7) - 1 ) );
695+}
696+
697+/* golomb functions */
698+
699+static const uint8_t x264_ue_size_tab[256] =
700+{
701+ 1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
702+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
703+ 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
704+ 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
705+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
706+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
707+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
708+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
709+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
710+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
711+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
712+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
713+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
714+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
715+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
716+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
717+};
718+
719+static inline void bs_write_ue_big( bs_t *s, unsigned int val )
720+{
721+ int size = 0;
722+ int tmp = ++val;
723+ if( tmp >= 0x10000 )
724+ {
725+ size = 32;
726+ tmp >>= 16;
727+ }
728+ if( tmp >= 0x100 )
729+ {
730+ size += 16;
731+ tmp >>= 8;
732+ }
733+ size += x264_ue_size_tab[tmp];
734+ bs_write( s, size>>1, 0 );
735+ bs_write( s, (size>>1)+1, val );
736+}
737+
738+/* Only works on values under 255. */
739+static inline void bs_write_ue( bs_t *s, int val )
740+{
741+ bs_write( s, x264_ue_size_tab[val+1], val+1 );
742+}
743+
744+static inline void bs_write_se( bs_t *s, int val )
745+{
746+ int size = 0;
747+ /* Faster than (val <= 0 ? -val*2+1 : val*2) */
748+ /* 4 instructions on x86, 3 on ARM */
749+ int tmp = 1 - val*2;
750+ if( tmp < 0 ) tmp = val*2;
751+ val = tmp;
752+
753+ if( tmp >= 0x100 )
754+ {
755+ size = 16;
756+ tmp >>= 8;
757+ }
758+ size += x264_ue_size_tab[tmp];
759+ bs_write( s, size, val );
760+}
761+
762+static inline void bs_write_te( bs_t *s, int x, int val )
763+{
764+ if( x == 1 )
765+ bs_write1( s, 1^val );
766+ else //if( x > 1 )
767+ bs_write_ue( s, val );
768+}
769+
770+static inline void bs_rbsp_trailing( bs_t *s )
771+{
772+ bs_write1( s, 1 );
773+ bs_write( s, s->i_left&7, 0 );
774+}
775+
776+static ALWAYS_INLINE int bs_size_ue( unsigned int val )
777+{
778+ return x264_ue_size_tab[val+1];
779+}
780+
781+static ALWAYS_INLINE int bs_size_ue_big( unsigned int val )
782+{
783+ if( val < 255 )
784+ return x264_ue_size_tab[val+1];
785+ else
786+ return x264_ue_size_tab[(val+1)>>8] + 16;
787+}
788+
789+static ALWAYS_INLINE int bs_size_se( int val )
790+{
791+ int tmp = 1 - val*2;
792+ if( tmp < 0 ) tmp = val*2;
793+ if( tmp < 256 )
794+ return x264_ue_size_tab[tmp];
795+ else
796+ return x264_ue_size_tab[tmp>>8]+16;
797+}
798+
799+static ALWAYS_INLINE int bs_size_te( int x, int val )
800+{
801+ if( x == 1 )
802+ return 1;
803+ else //if( x > 1 )
804+ return x264_ue_size_tab[val+1];
805+}
806+
807+#endif
808diff --git a/common/bs.h b/common/bs.h
809deleted file mode 100644
810index 343a3c9..0000000
811--- a/common/bs.h
812+++ /dev/null
813@@ -1,291 +0,0 @@
814-/*****************************************************************************
815- * bs.h :
816- *****************************************************************************
817- * Copyright (C) 2003-2008 x264 project
818- *
819- * Authors: Loren Merritt <lorenm@u.washington.edu>
820- * Jason Garrett-Glaser <darkshikari@gmail.com>
821- * Laurent Aimar <fenrir@via.ecp.fr>
822- *
823- * This program is free software; you can redistribute it and/or modify
824- * it under the terms of the GNU General Public License as published by
825- * the Free Software Foundation; either version 2 of the License, or
826- * (at your option) any later version.
827- *
828- * This program is distributed in the hope that it will be useful,
829- * but WITHOUT ANY WARRANTY; without even the implied warranty of
830- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
831- * GNU General Public License for more details.
832- *
833- * You should have received a copy of the GNU General Public License
834- * along with this program; if not, write to the Free Software
835- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
836- *****************************************************************************/
837-
838-#ifndef X264_BS_H
839-#define X264_BS_H
840-
841-typedef struct
842-{
843- uint8_t i_bits;
844- uint8_t i_size;
845-} vlc_t;
846-
847-typedef struct
848-{
849- uint16_t i_bits;
850- uint8_t i_size;
851- /* Next level table to use */
852- uint8_t i_next;
853-} vlc_large_t;
854-
855-typedef struct bs_s
856-{
857- uint8_t *p_start;
858- uint8_t *p;
859- uint8_t *p_end;
860-
861- intptr_t cur_bits;
862- int i_left; /* i_count number of available bits */
863- int i_bits_encoded; /* RD only */
864-} bs_t;
865-
866-typedef struct
867-{
868- int last;
869- int16_t level[16];
870- uint8_t run[16];
871-} x264_run_level_t;
872-
873-extern const vlc_t x264_coeff0_token[5];
874-extern const vlc_t x264_coeff_token[5][16][4];
875-extern const vlc_t x264_total_zeros[15][16];
876-extern const vlc_t x264_total_zeros_dc[3][4];
877-extern const vlc_t x264_run_before[7][16];
878-
879-/* A larger level table size theoretically could help a bit at extremely
880- * high bitrates, but the cost in cache is usually too high for it to be
881- * useful.
882- * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
883- * FIXME: Do further testing? */
884-#define LEVEL_TABLE_SIZE 128
885-extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
886-
887-static inline void bs_init( bs_t *s, void *p_data, int i_data )
888-{
889- int offset = ((intptr_t)p_data & 3);
890- s->p = s->p_start = (uint8_t*)p_data - offset;
891- s->p_end = (uint8_t*)p_data + i_data;
892- s->i_left = (WORD_SIZE - offset)*8;
893- s->cur_bits = endian_fix32( M32(s->p) );
894- s->cur_bits >>= (4-offset)*8;
895-}
896-static inline int bs_pos( bs_t *s )
897-{
898- return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
899-}
900-
901-/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
902-static inline void bs_flush( bs_t *s )
903-{
904- M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
905- s->p += WORD_SIZE - s->i_left / 8;
906- s->i_left = WORD_SIZE*8;
907-}
908-/* The inverse of bs_flush: prepare the bitstream to be written to again. */
909-static inline void bs_realign( bs_t *s )
910-{
911- int offset = ((intptr_t)s->p & 3);
912- if( offset )
913- {
914- s->p = (uint8_t*)s->p - offset;
915- s->i_left = (WORD_SIZE - offset)*8;
916- s->cur_bits = endian_fix32( M32(s->p) );
917- s->cur_bits >>= (4-offset)*8;
918- }
919-}
920-
921-static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
922-{
923- if( WORD_SIZE == 8 )
924- {
925- s->cur_bits = (s->cur_bits << i_count) | i_bits;
926- s->i_left -= i_count;
927- if( s->i_left <= 32 )
928- {
929-#ifdef WORDS_BIGENDIAN
930- M32( s->p ) = s->cur_bits >> (32 - s->i_left);
931-#else
932- M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
933-#endif
934- s->i_left += 32;
935- s->p += 4;
936- }
937- }
938- else
939- {
940- if( i_count < s->i_left )
941- {
942- s->cur_bits = (s->cur_bits << i_count) | i_bits;
943- s->i_left -= i_count;
944- }
945- else
946- {
947- i_count -= s->i_left;
948- s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
949- M32( s->p ) = endian_fix( s->cur_bits );
950- s->p += 4;
951- s->cur_bits = i_bits;
952- s->i_left = 32 - i_count;
953- }
954- }
955-}
956-
957-/* Special case to eliminate branch in normal bs_write. */
958-/* Golomb never writes an even-size code, so this is only used in slice headers. */
959-static inline void bs_write32( bs_t *s, uint32_t i_bits )
960-{
961- bs_write( s, 16, i_bits >> 16 );
962- bs_write( s, 16, i_bits );
963-}
964-
965-static inline void bs_write1( bs_t *s, uint32_t i_bit )
966-{
967- s->cur_bits <<= 1;
968- s->cur_bits |= i_bit;
969- s->i_left--;
970- if( s->i_left == WORD_SIZE*8-32 )
971- {
972- M32( s->p ) = endian_fix32( s->cur_bits );
973- s->p += 4;
974- s->i_left = WORD_SIZE*8;
975- }
976-}
977-
978-static inline void bs_align_0( bs_t *s )
979-{
980- bs_write( s, s->i_left&7, 0 );
981- bs_flush( s );
982-}
983-static inline void bs_align_1( bs_t *s )
984-{
985- bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 );
986- bs_flush( s );
987-}
988-static inline void bs_align_10( bs_t *s )
989-{
990- if( s->i_left&7 )
991- bs_write( s, s->i_left&7, 1 << ( (s->i_left&7) - 1 ) );
992-}
993-
994-/* golomb functions */
995-
996-static const uint8_t x264_ue_size_tab[256] =
997-{
998- 1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
999- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
1000- 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
1001- 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
1002- 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1003- 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1004- 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1005- 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1006- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1007- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1008- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1009- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1010- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1011- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1012- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1013- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1014-};
1015-
1016-static inline void bs_write_ue_big( bs_t *s, unsigned int val )
1017-{
1018- int size = 0;
1019- int tmp = ++val;
1020- if( tmp >= 0x10000 )
1021- {
1022- size = 32;
1023- tmp >>= 16;
1024- }
1025- if( tmp >= 0x100 )
1026- {
1027- size += 16;
1028- tmp >>= 8;
1029- }
1030- size += x264_ue_size_tab[tmp];
1031- bs_write( s, size>>1, 0 );
1032- bs_write( s, (size>>1)+1, val );
1033-}
1034-
1035-/* Only works on values under 255. */
1036-static inline void bs_write_ue( bs_t *s, int val )
1037-{
1038- bs_write( s, x264_ue_size_tab[val+1], val+1 );
1039-}
1040-
1041-static inline void bs_write_se( bs_t *s, int val )
1042-{
1043- int size = 0;
1044- /* Faster than (val <= 0 ? -val*2+1 : val*2) */
1045- /* 4 instructions on x86, 3 on ARM */
1046- int tmp = 1 - val*2;
1047- if( tmp < 0 ) tmp = val*2;
1048- val = tmp;
1049-
1050- if( tmp >= 0x100 )
1051- {
1052- size = 16;
1053- tmp >>= 8;
1054- }
1055- size += x264_ue_size_tab[tmp];
1056- bs_write( s, size, val );
1057-}
1058-
1059-static inline void bs_write_te( bs_t *s, int x, int val )
1060-{
1061- if( x == 1 )
1062- bs_write1( s, 1^val );
1063- else //if( x > 1 )
1064- bs_write_ue( s, val );
1065-}
1066-
1067-static inline void bs_rbsp_trailing( bs_t *s )
1068-{
1069- bs_write1( s, 1 );
1070- bs_write( s, s->i_left&7, 0 );
1071-}
1072-
1073-static ALWAYS_INLINE int bs_size_ue( unsigned int val )
1074-{
1075- return x264_ue_size_tab[val+1];
1076-}
1077-
1078-static ALWAYS_INLINE int bs_size_ue_big( unsigned int val )
1079-{
1080- if( val < 255 )
1081- return x264_ue_size_tab[val+1];
1082- else
1083- return x264_ue_size_tab[(val+1)>>8] + 16;
1084-}
1085-
1086-static ALWAYS_INLINE int bs_size_se( int val )
1087-{
1088- int tmp = 1 - val*2;
1089- if( tmp < 0 ) tmp = val*2;
1090- if( tmp < 256 )
1091- return x264_ue_size_tab[tmp];
1092- else
1093- return x264_ue_size_tab[tmp>>8]+16;
1094-}
1095-
1096-static ALWAYS_INLINE int bs_size_te( int x, int val )
1097-{
1098- if( x == 1 )
1099- return 1;
1100- else //if( x > 1 )
1101- return x264_ue_size_tab[val+1];
1102-}
1103-
1104-#endif
1105diff --git a/common/common.c b/common/common.c
1106index 62bef99..f1e8758 100644
1107--- a/common/common.c
1108+++ b/common/common.c
1109@@ -1026,60 +1026,6 @@ void x264_picture_clean( x264_picture_t *pic )
1110 }
1111
1112 /****************************************************************************
1113- * x264_nal_encode:
1114- ****************************************************************************/
1115-int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode )
1116-{
1117- uint8_t *src = nal->p_payload;
1118- uint8_t *end = nal->p_payload + nal->i_payload;
1119- uint8_t *orig_dst = dst;
1120- int i_count = 0, size;
1121-
1122- if( b_annexb )
1123- {
1124- if( b_long_startcode )
1125- *dst++ = 0x00;
1126- *dst++ = 0x00;
1127- *dst++ = 0x00;
1128- *dst++ = 0x01;
1129- }
1130- else /* save room for size later */
1131- dst += 4;
1132-
1133- /* nal header */
1134- *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
1135-
1136- while( src < end )
1137- {
1138- if( i_count == 2 && *src <= 0x03 )
1139- {
1140- *dst++ = 0x03;
1141- i_count = 0;
1142- }
1143- if( *src == 0 )
1144- i_count++;
1145- else
1146- i_count = 0;
1147- *dst++ = *src++;
1148- }
1149- size = (dst - orig_dst) - 4;
1150-
1151- /* Write the size header for mp4/etc */
1152- if( !b_annexb )
1153- {
1154- /* Size doesn't include the size of the header we're writing now. */
1155- orig_dst[0] = size>>24;
1156- orig_dst[1] = size>>16;
1157- orig_dst[2] = size>> 8;
1158- orig_dst[3] = size>> 0;
1159- }
1160-
1161- return size+4;
1162-}
1163-
1164-
1165-
1166-/****************************************************************************
1167 * x264_malloc:
1168 ****************************************************************************/
1169 void *x264_malloc( int i_size )
1170diff --git a/common/common.h b/common/common.h
1171index 539ea65..93712fe 100644
1172--- a/common/common.h
1173+++ b/common/common.h
1174@@ -137,7 +137,7 @@ static const int x264_scan8[16+2*4+3] =
1175 */
1176
1177 #include "x264.h"
1178-#include "bs.h"
1179+#include "bitstream.h"
1180 #include "set.h"
1181 #include "predict.h"
1182 #include "pixel.h"
1183@@ -166,8 +166,6 @@ int64_t x264_mdate( void );
1184 * the encoding options */
1185 char *x264_param2string( x264_param_t *p, int b_res );
1186
1187-int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode );
1188-
1189 /* log */
1190 void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
1191
1192@@ -796,6 +794,7 @@ struct x264_t
1193 x264_zigzag_function_t zigzagf;
1194 x264_quant_function_t quantf;
1195 x264_deblock_function_t loopf;
1196+ x264_bitstream_function_t bsf;
1197
1198 #ifdef HAVE_VISUALIZE
1199 struct visualize_t *visualize;
1200diff --git a/common/x86/bitstream-a.asm b/common/x86/bitstream-a.asm
1201new file mode 100644
1202index 0000000..1fb4cea
1203--- /dev/null
1204+++ b/common/x86/bitstream-a.asm
1205@@ -0,0 +1,112 @@
1206+;*****************************************************************************
1207+;* bitstream-a.asm: h264 encoder library
1208+;*****************************************************************************
1209+;* Copyright (C) 2010 x264 project
1210+;*
1211+;* Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
1212+;*
1213+;* This program is free software; you can redistribute it and/or modify
1214+;* it under the terms of the GNU General Public License as published by
1215+;* the Free Software Foundation; either version 2 of the License, or
1216+;* (at your option) any later version.
1217+;*
1218+;* This program is distributed in the hope that it will be useful,
1219+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
1220+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1221+;* GNU General Public License for more details.
1222+;*
1223+;* You should have received a copy of the GNU General Public License
1224+;* along with this program; if not, write to the Free Software
1225+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
1226+;*****************************************************************************
1227+
1228+%include "x86inc.asm"
1229+%include "x86util.asm"
1230+
1231+SECTION .text
1232+
1233+;-----------------------------------------------------------------------------
1234+; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
1235+;-----------------------------------------------------------------------------
1236+
1237+%macro NAL_LOOP 2
1238+ALIGN 16
1239+%1:
1240+ mova m0, [r1+r2]
1241+ mova m1, m0
1242+%if mmsize == 8
1243+ psrlq m0, 8
1244+%else
1245+ psrldq m0, 1
1246+%endif
1247+ %2 [r0+r1], m1
1248+ por m1, m0
1249+ pcmpeqb m1, m2
1250+ pmovmskb r3d, m1
1251+ test r3d, r3d
1252+ jnz .escape
1253+ add r1, mmsize
1254+ jl %1
1255+%endmacro
1256+
1257+%macro NAL_ESCAPE 1
1258+
1259+cglobal nal_escape_%1, 3,5
1260+ pxor m2, m2
1261+ sub r1, r2 ; r1 = offset of current src pointer from end of src
1262+ sub r0, r1 ; r0 = projected end of dst, assuming no more escapes
1263+
1264+ mov r3w, [r1+r2]
1265+ mov [r0+r1], r3w
1266+ add r1, 2
1267+ jge .ret
1268+
1269+ ; Start off by jumping into the escape loop in
1270+ ; case there's an escape at the start.
1271+ ; And do a few more in scalar until src is aligned again.
1272+ lea r4d, [r1+r2]
1273+ or r4d, -mmsize
1274+ neg r4d
1275+ jmp .escapeloop
1276+
1277+ NAL_LOOP .loop_aligned, mova
1278+%if mmsize==16
1279+ NAL_LOOP .loop_unaligned, movu
1280+%endif
1281+
1282+.ret:
1283+ movifnidn rax, r0
1284+ RET
1285+ALIGN 16
1286+.escape:
1287+ mov r4d, mmsize
1288+.escapeloop:
1289+ mov r3b, [r1+r2]
1290+ cmp r3b, 3
1291+ jna .escape_check
1292+.copy:
1293+ mov [r0+r1], r3b
1294+ inc r1
1295+ jge .ret
1296+ dec r4d
1297+ jg .escapeloop
1298+ cmp byte [r1+r2-1], 0 ; Don't go back to the main loop until we're out of a zero-run.
1299+ jz .escape
1300+%if mmsize==16
1301+ lea r4d, [r0+r1]
1302+ test r4d, mmsize-1
1303+ jnz .loop_unaligned
1304+%endif
1305+ jmp .loop_aligned
1306+.escape_check:
1307+ cmp word [r0+r1-2], 0
1308+ jnz .copy
1309+ mov byte [r0+r1], 3
1310+ inc r0
1311+ jmp .copy
1312+%endmacro
1313+
1314+INIT_MMX
1315+NAL_ESCAPE mmxext
1316+INIT_XMM
1317+NAL_ESCAPE sse2
1318diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
1319index aedd688..3a31e26 100644
1320--- a/common/x86/deblock-a.asm
1321+++ b/common/x86/deblock-a.asm
1322@@ -4,6 +4,7 @@
1323 ;* Copyright (C) 2005-2008 x264 project
1324 ;*
1325 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
1326+;* Jason Garrett-Glaser <darkshikari@gmail.com>
1327 ;*
1328 ;* This program is free software; you can redistribute it and/or modify
1329 ;* it under the terms of the GNU General Public License as published by
1330diff --git a/encoder/encoder.c b/encoder/encoder.c
1331index 6e0dc54..32db82a 100644
1332--- a/encoder/encoder.c
1333+++ b/encoder/encoder.c
1334@@ -986,6 +986,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
1335 x264_mc_init( h->param.cpu, &h->mc );
1336 x264_quant_init( h, h->param.cpu, &h->quantf );
1337 x264_deblock_init( h->param.cpu, &h->loopf );
1338+ x264_bitstream_init( h->param.cpu, &h->bsf );
1339 x264_dct_init_weights();
1340
1341 mbcmp_init( h );
1342@@ -1272,7 +1273,7 @@ static int x264_encoder_encapsulate_nals( x264_t *h, int start )
1343 for( int i = start; i < h->out.i_nal; i++ )
1344 {
1345 int long_startcode = !i || h->out.nal[i].i_type == NAL_SPS || h->out.nal[i].i_type == NAL_PPS;
1346- int size = x264_nal_encode( nal_buffer, &h->out.nal[i], h->param.b_annexb, long_startcode );
1347+ int size = x264_nal_encode( h, nal_buffer, &h->out.nal[i], long_startcode );
1348 h->out.nal[i].i_payload = size;
1349 h->out.nal[i].p_payload = nal_buffer;
1350 nal_buffer += size;
1351diff --git a/tools/checkasm.c b/tools/checkasm.c
1352index a0a9d54..ea6f209 100644
1353--- a/tools/checkasm.c
1354+++ b/tools/checkasm.c
1355@@ -1661,6 +1661,55 @@ static int check_cabac( int cpu_ref, int cpu_new )
1356 return ret;
1357 }
1358
1359+static int check_bitstream( int cpu_ref, int cpu_new )
1360+{
1361+ x264_bitstream_function_t bs_c;
1362+ x264_bitstream_function_t bs_ref;
1363+ x264_bitstream_function_t bs_a;
1364+
1365+ int ret = 0, ok = 1, used_asm = 0;
1366+
1367+ x264_bitstream_init( 0, &bs_c );
1368+ x264_bitstream_init( cpu_ref, &bs_ref );
1369+ x264_bitstream_init( cpu_new, &bs_a );
1370+ if( bs_a.nal_escape != bs_ref.nal_escape )
1371+ {
1372+ int size = 0x4000;
1373+ uint8_t *input = malloc(size+100);
1374+ uint8_t *output1 = malloc(size*2);
1375+ uint8_t *output2 = malloc(size*2);
1376+ used_asm = 1;
1377+ set_func_name( "nal_escape" );
1378+ for( int i = 0; i < 100; i++ )
1379+ {
1380+ /* Test corner-case sizes */
1381+ int test_size = i < 10 ? i+1 : rand() & 0x3fff;
1382+ for( int j = 0; j < test_size; j++ )
1383+ input[j] = (rand()&1) * rand();
1384+ uint8_t *end_c = (uint8_t*)call_c1( bs_c.nal_escape, output1, input, input+test_size );
1385+ uint8_t *end_a = (uint8_t*)call_a1( bs_a.nal_escape, output2, input, input+test_size );
1386+ int size_c = end_c-output1;
1387+ int size_a = end_a-output2;
1388+ if( size_c != size_a || memcmp( output1, output2, size_c ) )
1389+ {
1390+ fprintf( stderr, "nal_escape : [FAILED] %d %d\n", size_c, size_a );
1391+ ok = 0;
1392+ break;
1393+ }
1394+ }
1395+ for( int j = 0; j < size; j++ )
1396+ input[j] = rand();
1397+ call_c2( bs_c.nal_escape, output1, input, input+size );
1398+ call_a2( bs_a.nal_escape, output2, input, input+size );
1399+ free(input);
1400+ free(output1);
1401+ free(output2);
1402+ }
1403+ report( "nal escape:" );
1404+
1405+ return ret;
1406+}
1407+
1408 static int check_all_funcs( int cpu_ref, int cpu_new )
1409 {
1410 return check_pixel( cpu_ref, cpu_new )
1411@@ -1669,7 +1718,8 @@ static int check_all_funcs( int cpu_ref, int cpu_new )
1412 + check_intra( cpu_ref, cpu_new )
1413 + check_deblock( cpu_ref, cpu_new )
1414 + check_quant( cpu_ref, cpu_new )
1415- + check_cabac( cpu_ref, cpu_new );
1416+ + check_cabac( cpu_ref, cpu_new )
1417+ + check_bitstream( cpu_ref, cpu_new );
1418 }
1419
1420 static int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
1421--
14221.7.0.4
1423
1424
1425From 92e968cda1b4306ae0d99024114adcd17c617637 Mon Sep 17 00:00:00 2001
1426From: Jason Garrett-Glaser <darkshikari@gmail.com>
1427Date: Fri, 28 May 2010 14:30:07 -0700
1428Subject: [PATCH 5/8] Re-enable i8x8 merged SATD
1429 Accidentally got disabled when intra_sad_x3 was added.
1430
1431---
1432 encoder/encoder.c | 1 +
1433 1 files changed, 1 insertions(+), 0 deletions(-)
1434
1435diff --git a/encoder/encoder.c b/encoder/encoder.c
1436index 32db82a..2f9e7f6 100644
1437--- a/encoder/encoder.c
1438+++ b/encoder/encoder.c
1439@@ -810,6 +810,7 @@ static void mbcmp_init( x264_t *h )
1440 memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
1441 h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
1442 h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c;
1443+ h->pixf.intra_mbcmp_x3_8x8 = satd ? h->pixf.intra_sa8d_x3_8x8 : h->pixf.intra_sad_x3_8x8;
1444 h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4;
1445 satd &= h->param.analyse.i_me_method == X264_ME_TESA;
1446 memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
1447--
14481.7.0.4
1449
1450
1451From 02fa45a7a2e26a885bcf6e996bec2a7ee6c242bf Mon Sep 17 00:00:00 2001
1452From: Jason Garrett-Glaser <darkshikari@gmail.com>
1453Date: Fri, 28 May 2010 14:27:22 -0700
1454Subject: [PATCH 6/8] Add API tool to apply arbitrary quantizer offsets
1455 The calling application can now pass a "map" of quantizer offsets to apply to each frame.
1456 An optional callback to free the map can also be included.
1457 This allows all kinds of flexible region-of-interest coding and similar applications.
1458
1459---
1460 common/common.c | 2 +-
1461 encoder/encoder.c | 7 +++++--
1462 encoder/ratecontrol.c | 36 +++++++++++++++++++++++++-----------
1463 encoder/ratecontrol.h | 4 ++--
1464 x264.h | 20 +++++++++++++++++++-
1465 5 files changed, 52 insertions(+), 17 deletions(-)
1466
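A sketch of how a caller might use the new fields, based only on the x264.h additions in this patch: allocate one float per macroblock in raster order, fill in the offsets (negative lowers QP, i.e. raises quality), attach the array to pic.prop.quant_offsets before calling x264_encoder_encode(), and optionally let x264 free it through quant_offsets_free once the map has been consumed. The centre-quarter region and the -2.0 value are arbitrary illustration; adaptive quantization must be enabled for the offsets to take effect.

    #include <stdlib.h>
    #include <x264.h>

    /* Ask for roughly 2 QP more quality in the centre quarter of the frame,
     * leaving the borders to x264's own decisions. One float per macroblock,
     * raster order, as described by the comment added to x264.h. */
    int attach_roi_offsets( x264_picture_t *pic, int width, int height )
    {
        int mb_w = (width + 15) / 16, mb_h = (height + 15) / 16;
        float *offsets = calloc( mb_w * mb_h, sizeof(float) );
        if( !offsets )
            return -1;
        for( int y = mb_h/4; y < 3*mb_h/4; y++ )
            for( int x = mb_w/4; x < 3*mb_w/4; x++ )
                offsets[y*mb_w + x] = -2.0f;  /* negative offset -> lower QP -> higher quality */
        pic->prop.quant_offsets = offsets;
        pic->prop.quant_offsets_free = free;  /* x264 frees the map after consuming it */
        return 0;
    }
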
1467diff --git a/common/common.c b/common/common.c
1468index f1e8758..c092c01 100644
1469--- a/common/common.c
1470+++ b/common/common.c
1471@@ -997,6 +997,7 @@ static void x264_log_default( void *p_unused, int i_level, const char *psz_fmt,
1472 ****************************************************************************/
1473 int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
1474 {
1475+ memset( pic, 0, sizeof( x264_picture_t ) );
1476 pic->i_type = X264_TYPE_AUTO;
1477 pic->i_qpplus1 = 0;
1478 pic->img.i_csp = i_csp;
1479@@ -1009,7 +1010,6 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh
1480 pic->img.i_stride[0] = i_width;
1481 pic->img.i_stride[1] = i_width / 2;
1482 pic->img.i_stride[2] = i_width / 2;
1483- pic->param = NULL;
1484 pic->i_pic_struct = PIC_STRUCT_AUTO;
1485 return 0;
1486 }
1487diff --git a/encoder/encoder.c b/encoder/encoder.c
1488index 2f9e7f6..89107a3 100644
1489--- a/encoder/encoder.c
1490+++ b/encoder/encoder.c
1491@@ -2250,11 +2250,14 @@ int x264_encoder_encode( x264_t *h,
1492
1493 if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
1494 {
1495- if( x264_macroblock_tree_read( h, fenc ) )
1496+ if( x264_macroblock_tree_read( h, fenc, pic_in->prop.quant_offsets ) )
1497 return -1;
1498 }
1499 else
1500- x264_adaptive_quant_frame( h, fenc );
1501+ x264_adaptive_quant_frame( h, fenc, pic_in->prop.quant_offsets );
1502+
1503+ if( pic_in->prop.quant_offsets_free )
1504+ pic_in->prop.quant_offsets_free( pic_in->prop.quant_offsets );
1505
1506 if( h->frames.b_have_lowres )
1507 x264_frame_init_lowres( h, fenc );
1508diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
1509index bf0a400..d09de98 100644
1510--- a/encoder/ratecontrol.c
1511+++ b/encoder/ratecontrol.c
1512@@ -235,7 +235,7 @@ static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_
1513 return var;
1514 }
1515
1516-void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
1517+void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets )
1518 {
1519 /* constants chosen to result in approximately the same overall bitrate as without AQ.
1520 * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
1521@@ -256,11 +256,22 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
1522 /* Need to init it anyways for MB tree */
1523 if( h->param.rc.f_aq_strength == 0 )
1524 {
1525- memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
1526- memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
1527- if( h->frames.b_have_lowres )
1528+ if( quant_offsets )
1529+ {
1530 for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
1531- frame->i_inv_qscale_factor[mb_xy] = 256;
1532+ frame->f_qp_offset[mb_xy] = frame->f_qp_offset_aq[mb_xy] = quant_offsets[mb_xy];
1533+ if( h->frames.b_have_lowres )
1534+ for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
1535+ frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8( frame->f_qp_offset[mb_xy] );
1536+ }
1537+ else
1538+ {
1539+ memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
1540+ memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
1541+ if( h->frames.b_have_lowres )
1542+ for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
1543+ frame->i_inv_qscale_factor[mb_xy] = 256;
1544+ }
1545 }
1546 /* Need variance data for weighted prediction */
1547 if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
1548@@ -299,9 +310,10 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
1549 for( int mb_x = 0; mb_x < width; mb_x++ )
1550 {
1551 float qp_adj;
1552+ int mb_xy = mb_x + mb_y*h->mb.i_mb_stride;
1553 if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
1554 {
1555- qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
1556+ qp_adj = frame->f_qp_offset[mb_xy];
1557 qp_adj = strength * (qp_adj - avg_adj);
1558 }
1559 else
1560@@ -309,10 +321,12 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
1561 uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
1562 qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
1563 }
1564- frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
1565- frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
1566+ if( quant_offsets )
1567+ qp_adj += quant_offsets[mb_xy];
1568+ frame->f_qp_offset[mb_xy] =
1569+ frame->f_qp_offset_aq[mb_xy] = qp_adj;
1570 if( h->frames.b_have_lowres )
1571- frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
1572+ frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8(qp_adj);
1573 }
1574 }
1575
1576@@ -327,7 +341,7 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
1577 }
1578 }
1579
1580-int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
1581+int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets )
1582 {
1583 x264_ratecontrol_t *rc = h->rc;
1584 uint8_t i_type_actual = rc->entry[frame->i_frame].pict_type;
1585@@ -363,7 +377,7 @@ int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
1586 rc->qpbuf_pos--;
1587 }
1588 else
1589- x264_adaptive_quant_frame( h, frame );
1590+ x264_adaptive_quant_frame( h, frame, quant_offsets );
1591 return 0;
1592 fail:
1593 x264_log(h, X264_LOG_ERROR, "Incomplete MB-tree stats file.\n");
1594diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h
1595index e052b2a..dd139eb 100644
1596--- a/encoder/ratecontrol.h
1597+++ b/encoder/ratecontrol.h
1598@@ -29,8 +29,8 @@ void x264_ratecontrol_delete( x264_t * );
1599
1600 void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init );
1601
1602-void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame );
1603-int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame );
1604+void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets );
1605+int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets );
1606 int x264_reference_build_list_optimal( x264_t *h );
1607 void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
1608 void x264_ratecontrol_start( x264_t *, int i_force_qp, int overhead );
1609diff --git a/x264.h b/x264.h
1610index 95efd88..a4b3400 100644
1611--- a/x264.h
1612+++ b/x264.h
1613@@ -35,7 +35,7 @@
1614
1615 #include <stdarg.h>
1616
1617-#define X264_BUILD 96
1618+#define X264_BUILD 97
1619
1620 /* x264_t:
1621 * opaque handler for encoder */
1622@@ -508,6 +508,22 @@ typedef struct
1623
1624 typedef struct
1625 {
1626+ /* In: an array of quantizer offsets to be applied to this image during encoding.
1627+ * These are added on top of the decisions made by x264.
1628+ * Offsets can be fractional; they are added before QPs are rounded to integer.
1629+ * Adaptive quantization must be enabled to use this feature. Behavior if quant
1630+ * offsets differ between encoding passes is undefined.
1631+ *
1632+ * Array contains one offset per macroblock, in raster scan order. In interlaced
1633+ * mode, top-field MBs and bottom-field MBs are interleaved at the row level. */
1634+ float *quant_offsets;
1635+ /* In: optional callback to free quant_offsets when used.
1636+ * Useful if one wants to use a different quant_offset array for each frame. */
1637+ void (*quant_offsets_free)( void* );
1638+} x264_image_properties_t;
1639+
1640+typedef struct
1641+{
1642 /* In: force picture type (if not auto)
1643 * If x264 encoding parameters are violated in the forcing of picture types,
1644 * x264 will correct the input picture type and log a warning.
1645@@ -537,6 +553,8 @@ typedef struct
1646 x264_param_t *param;
1647 /* In: raw data */
1648 x264_image_t img;
1649+ /* In: optional information to modify encoder decisions for this frame */
1650+ x264_image_properties_t prop;
1651 /* Out: HRD timing information. Output only when i_nal_hrd is set. */
1652 x264_hrd_t hrd_timing;
1653 /* private user data. libx264 doesn't touch this,
1654--
16551.7.0.4
1656
1657
1658From 1edf08c06c9f07fc8bb56879033dbc59e86ef7ac Mon Sep 17 00:00:00 2001
1659From: Henrik Gramner <hengar-6@student.ltu.se>
1660Date: Thu, 27 May 2010 22:18:38 +0200
1661Subject: [PATCH 7/8] Optimize out some x264_scan8 reads
1662
1663---
1664 encoder/analyse.c | 15 ++++-----
1665 encoder/macroblock.c | 82 ++++++++++++++++++++++++++++++--------------------
1666 encoder/me.c | 25 ++++++++-------
1667 3 files changed, 70 insertions(+), 52 deletions(-)
1668
1669diff --git a/encoder/analyse.c b/encoder/analyse.c
1670index a128a70..9e85e89 100644
1671--- a/encoder/analyse.c
1672+++ b/encoder/analyse.c
1673@@ -907,8 +907,6 @@ static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
1674 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
1675 {
1676 uint8_t *p_dst = h->mb.pic.p_fdec[0];
1677-
1678- int x, y;
1679 uint64_t i_satd, i_best;
1680 h->mb.i_skip_intra = 0;
1681
1682@@ -1031,8 +1029,9 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
1683 int i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1684
1685 i_best = COST_MAX64;
1686- x = idx&1;
1687- y = idx>>1;
1688+ int x = idx&1;
1689+ int y = idx>>1;
1690+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
1691
1692 p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1693 predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
1694@@ -1061,8 +1060,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
1695 if( !(idx&1) )
1696 for( int j = 0; j < 7; j++ )
1697 pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1698- i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
1699- i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
1700+ i_nnz[0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8] );
1701+ i_nnz[1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8] );
1702 }
1703 }
1704 a->i_cbp_i8x8_luma = cbp_luma_new;
1705@@ -1070,8 +1069,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
1706 if( !(idx&1) )
1707 for( int j = 0; j < 7; j++ )
1708 p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1709- M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
1710- M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
1711+ M16( &h->mb.cache.non_zero_count[s8 + 0*8] ) = i_nnz[0];
1712+ M16( &h->mb.cache.non_zero_count[s8 + 1*8] ) = i_nnz[1];
1713
1714 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1715 }
1716diff --git a/encoder/macroblock.c b/encoder/macroblock.c
1717index 984f8a8..cdc4563 100644
1718--- a/encoder/macroblock.c
1719+++ b/encoder/macroblock.c
1720@@ -135,11 +135,12 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
1721 }
1722 }
1723
1724-#define STORE_8x8_NNZ(idx,nz)\
1725+#define STORE_8x8_NNZ( s8, nz )\
1726+do\
1727 {\
1728- M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] ) = nz * 0x0101;\
1729- M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] ) = nz * 0x0101;\
1730-}
1731+ M16( &h->mb.cache.non_zero_count[(s8) + 0*8] ) = (nz) * 0x0101;\
1732+ M16( &h->mb.cache.non_zero_count[(s8) + 1*8] ) = (nz) * 0x0101;\
1733+} while(0)
1734
1735 #define CLEAR_16x16_NNZ \
1736 {\
1737@@ -151,17 +152,18 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
1738
1739 void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
1740 {
1741- int x = 8 * (idx&1);
1742- int y = 8 * (idx>>1);
1743+ int x = idx&1;
1744+ int y = idx>>1;
1745+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
1746 int nz;
1747- uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
1748- uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
1749+ uint8_t *p_src = &h->mb.pic.p_fenc[0][8*x + 8*y*FENC_STRIDE];
1750+ uint8_t *p_dst = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
1751 ALIGNED_ARRAY_16( int16_t, dct8x8,[64] );
1752
1753 if( h->mb.b_lossless )
1754 {
1755 nz = h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
1756- STORE_8x8_NNZ(idx,nz);
1757+ STORE_8x8_NNZ( s8, nz );
1758 h->mb.i_cbp_luma |= nz<<idx;
1759 return;
1760 }
1761@@ -175,10 +177,10 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
1762 h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
1763 h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
1764 h->dctf.add8x8_idct8( p_dst, dct8x8 );
1765- STORE_8x8_NNZ(idx,1);
1766+ STORE_8x8_NNZ( s8, 1 );
1767 }
1768 else
1769- STORE_8x8_NNZ(idx,0);
1770+ STORE_8x8_NNZ( s8, 0 );
1771 }
1772
1773 static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
1774@@ -728,12 +730,13 @@ void x264_macroblock_encode( x264_t *h )
1775 if( h->mb.b_transform_8x8 )
1776 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
1777 {
1778- int x = 8*(i8x8&1);
1779- int y = 8*(i8x8>>1);
1780- nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8],
1781- h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
1782- h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
1783- STORE_8x8_NNZ(i8x8,nz);
1784+ int x = i8x8&1;
1785+ int y = i8x8>>1;
1786+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
1787+
1788+ nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8], h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE,
1789+ h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE );
1790+ STORE_8x8_NNZ( s8, nz );
1791 h->mb.i_cbp_luma |= nz << i8x8;
1792 }
1793 else
1794@@ -783,14 +786,18 @@ void x264_macroblock_encode( x264_t *h )
1795 {
1796 for( int idx = 0; idx < 4; idx++ )
1797 {
1798+ int x = idx&1;
1799+ int y = idx>>1;
1800+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
1801+
1802 if( h->mb.i_cbp_luma&(1<<idx) )
1803 {
1804 h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
1805- h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*FDEC_STRIDE], dct8x8[idx] );
1806- STORE_8x8_NNZ(idx,1);
1807+ h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], dct8x8[idx] );
1808+ STORE_8x8_NNZ( s8, 1 );
1809 }
1810 else
1811- STORE_8x8_NNZ(idx,0);
1812+ STORE_8x8_NNZ( s8, 0 );
1813 }
1814 }
1815 }
1816@@ -825,18 +832,24 @@ void x264_macroblock_encode( x264_t *h )
1817 }
1818 }
1819
1820+ int x = i8x8&1;
1821+ int y = i8x8>>1;
1822+
1823 /* decimate this 8x8 block */
1824 i_decimate_mb += i_decimate_8x8;
1825 if( b_decimate )
1826 {
1827 if( i_decimate_8x8 < 4 )
1828- STORE_8x8_NNZ(i8x8,0)
1829+ {
1830+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
1831+ STORE_8x8_NNZ( s8, 0 );
1832+ }
1833 else
1834 h->mb.i_cbp_luma |= 1<<i8x8;
1835 }
1836 else if( cbp )
1837 {
1838- h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
1839+ h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], &dct4x4[i8x8*4] );
1840 h->mb.i_cbp_luma |= 1<<i8x8;
1841 }
1842 }
1843@@ -1045,8 +1058,11 @@ void x264_noise_reduction_update( x264_t *h )
1844 void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
1845 {
1846 int i_qp = h->mb.i_qp;
1847- uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE;
1848- uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
1849+ int x = i8&1;
1850+ int y = i8>>1;
1851+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
1852+ uint8_t *p_fenc = h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE;
1853+ uint8_t *p_fdec = h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE;
1854 int b_decimate = h->mb.b_dct_decimate;
1855 int nnz8x8 = 0;
1856 int nz;
1857@@ -1059,7 +1075,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
1858 if( h->mb.b_transform_8x8 )
1859 {
1860 nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
1861- STORE_8x8_NNZ(i8,nnz8x8);
1862+ STORE_8x8_NNZ( s8, nnz8x8 );
1863 }
1864 else
1865 {
1866@@ -1075,8 +1091,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
1867 for( int ch = 0; ch < 2; ch++ )
1868 {
1869 int16_t dc;
1870- p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
1871- p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
1872+ p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
1873+ p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
1874 nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec, &dc );
1875 h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
1876 }
1877@@ -1099,13 +1115,13 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
1878 {
1879 h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
1880 h->dctf.add8x8_idct8( p_fdec, dct8x8 );
1881- STORE_8x8_NNZ(i8,1);
1882+ STORE_8x8_NNZ( s8, 1 );
1883 }
1884 else
1885- STORE_8x8_NNZ(i8,0);
1886+ STORE_8x8_NNZ( s8, 0 );
1887 }
1888 else
1889- STORE_8x8_NNZ(i8,0);
1890+ STORE_8x8_NNZ( s8, 0 );
1891 }
1892 else
1893 {
1894@@ -1132,7 +1148,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
1895 if( nnz8x8 )
1896 h->dctf.add8x8_idct( p_fdec, dct4x4 );
1897 else
1898- STORE_8x8_NNZ(i8,0);
1899+ STORE_8x8_NNZ( s8, 0 );
1900 }
1901
1902 i_qp = h->mb.i_chroma_qp;
1903@@ -1140,8 +1156,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
1904 for( int ch = 0; ch < 2; ch++ )
1905 {
1906 ALIGNED_ARRAY_16( int16_t, dct4x4,[16] );
1907- p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
1908- p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
1909+ p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
1910+ p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
1911
1912 h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
1913 dct4x4[0] = 0;
1914diff --git a/encoder/me.c b/encoder/me.c
1915index 77073cc..40d0650 100644
1916--- a/encoder/me.c
1917+++ b/encoder/me.c
1918@@ -937,8 +937,11 @@ int x264_iter_kludge = 0;
1919
1920 static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2, int rd )
1921 {
1922- int16_t *cache0_mv = h->mb.cache.mv[0][x264_scan8[i8*4]];
1923- int16_t *cache1_mv = h->mb.cache.mv[1][x264_scan8[i8*4]];
1924+ int x = i8&1;
1925+ int y = i8>>1;
1926+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
1927+ int16_t *cache0_mv = h->mb.cache.mv[0][s8];
1928+ int16_t *cache1_mv = h->mb.cache.mv[1][s8];
1929 const int i_pixel = m0->i_pixel;
1930 const int bw = x264_pixel_size[i_pixel].w;
1931 const int bh = x264_pixel_size[i_pixel].h;
1932@@ -946,11 +949,11 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
1933 ALIGNED_ARRAY_8( uint8_t, pixu_buf,[2],[9][8*8] );
1934 ALIGNED_ARRAY_8( uint8_t, pixv_buf,[2],[9][8*8] );
1935 uint8_t *src[2][9];
1936- uint8_t *pix = &h->mb.pic.p_fdec[0][(i8>>1)*8*FDEC_STRIDE+(i8&1)*8];
1937- uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
1938- uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
1939- const int ref0 = h->mb.cache.ref[0][x264_scan8[i8*4]];
1940- const int ref1 = h->mb.cache.ref[1][x264_scan8[i8*4]];
1941+ uint8_t *pix = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
1942+ uint8_t *pixu = &h->mb.pic.p_fdec[1][4*x + 4*y*FDEC_STRIDE];
1943+ uint8_t *pixv = &h->mb.pic.p_fdec[2][4*x + 4*y*FDEC_STRIDE];
1944+ int ref0 = h->mb.cache.ref[0][s8];
1945+ int ref1 = h->mb.cache.ref[1][s8];
1946 const int mv0y_offset = h->mb.b_interlaced & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1947 const int mv1y_offset = h->mb.b_interlaced & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1948 int stride[2][9];
1949@@ -1058,13 +1061,13 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
1950
1951 if( rd )
1952 {
1953- x264_macroblock_cache_mv ( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) );
1954+ x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) );
1955 amvd = pack8to16( X264_MIN(abs(bm0x - m0->mvp[0]),33), X264_MIN(abs(bm0y - m0->mvp[1]),33) );
1956- x264_macroblock_cache_mvd( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 0, amvd );
1957+ x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 0, amvd );
1958
1959- x264_macroblock_cache_mv ( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) );
1960+ x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) );
1961 amvd = pack8to16( X264_MIN(abs(bm1x - m1->mvp[0]),33), X264_MIN(abs(bm1y - m1->mvp[1]),33) );
1962- x264_macroblock_cache_mvd( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 1, amvd );
1963+ x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 1, amvd );
1964 }
1965
1966 m0->mv[0] = bm0x;
1967--
19681.7.0.4
1969
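The replacement of x264_scan8[4*idx] lookups with s8 = X264_SCAN8_0 + 2*x + 16*y in the hunks above relies on the cache arrays being laid out eight entries per row, so the scan8 position of each 8x8 luma block is a simple affine function of its block coordinates. A standalone sanity check of that identity, using the conventional H.264 scan8 luma layout (reproduced here as an assumption, independent of x264's headers):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Luma portion of the conventional scan8 table: 4x4 blocks mapped into an
 * 8-wide cache grid, offset by one row and four columns for the border. */
static const uint8_t scan8[16] = {
    4+1*8, 5+1*8, 4+2*8, 5+2*8,
    6+1*8, 7+1*8, 6+2*8, 7+2*8,
    4+3*8, 5+3*8, 4+4*8, 5+4*8,
    6+3*8, 7+3*8, 6+4*8, 7+4*8
};
#define SCAN8_0 (4+1*8) /* scan8[0]; stands in for X264_SCAN8_0 */

int main( void )
{
    for( int idx = 0; idx < 4; idx++ )
    {
        int x = idx&1;
        int y = idx>>1;
        /* First 4x4 block of each 8x8 block: two columns right per x step,
         * two rows of the 8-wide grid (16 entries) down per y step. */
        assert( scan8[4*idx] == SCAN8_0 + 2*x + 16*y );
    }
    printf( "scan8[4*idx] == SCAN8_0 + 2*x + 16*y holds for idx 0..3\n" );
    return 0;
}

Computing s8 directly avoids a table load per block and lets the compiler fold the constant offsets into the cache addressing.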
1970
1971From cb8b597efd407a6deecee00b81483d82c77abadc Mon Sep 17 00:00:00 2001
1972From: Jason Garrett-Glaser <darkshikari@gmail.com>
1973Date: Sun, 30 May 2010 09:42:53 -0700
1974Subject: [PATCH 8/8] Fix ultrafast to actually turn off weightb
1975
1976---
1977 common/common.c | 1 +
1978 1 files changed, 1 insertions(+), 0 deletions(-)
1979
1980diff --git a/common/common.c b/common/common.c
1981index c092c01..48e1bbc 100644
1982--- a/common/common.c
1983+++ b/common/common.c
1984@@ -183,6 +183,7 @@ static int x264_param_apply_preset( x264_param_t *param, const char *preset )
1985 param->i_bframe_adaptive = X264_B_ADAPT_NONE;
1986 param->rc.b_mb_tree = 0;
1987 param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
1988+ param->analyse.b_weighted_bipred = 0;
1989 }
1990 else if( !strcasecmp( preset, "superfast" ) )
1991 {
1992--
19931.7.0.4
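The effect of this one-line fix can be observed from the public API by inspecting the parameters the preset produces. A minimal sketch, assuming a libx264 build that contains this patch:

#include <stdio.h>
#include <x264.h>

int main( void )
{
    x264_param_t param;
    if( x264_param_default_preset( &param, "ultrafast", NULL ) < 0 )
        return 1;
    /* With this patch, ultrafast disables weighted B-prediction as well as
     * weighted P-prediction; before it, b_weighted_bipred stayed at its
     * default of 1 despite the preset's intent. */
    printf( "i_weighted_pred=%d b_weighted_bipred=%d\n",
            param.analyse.i_weighted_pred, param.analyse.b_weighted_bipred );
    return 0;
}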