1From 2bcbac357b714f468e0138f022e584ffdb42f6d2 Mon Sep 17 00:00:00 2001
2From: Jason Garrett-Glaser <darkshikari@gmail.com>
3Date: Mon, 31 May 2010 11:14:22 -0700
4Subject: [PATCH 01/11] Fix cavlc+deblock+8x8dct (regression in r1612)
5 Add cavlc+8x8dct munging to new deblock system.
6 May have caused minor visual artifacts.
7
8---
9 common/deblock.c | 47 -----------------------------------------------
10 common/macroblock.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
11 2 files changed, 44 insertions(+), 49 deletions(-)
12
13diff --git a/common/deblock.c b/common/deblock.c
14index fc039c5..27c73ae 100644
15--- a/common/deblock.c
16+++ b/common/deblock.c
17@@ -24,46 +24,6 @@
18
19 #include "common.h"
20
21-/* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
22- * entropy coding, but per 64 coeffs for the purpose of deblocking */
23-static void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
24-{
25- uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
26- int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
27- for( int x = 0; x<h->sps->i_mb_width; x++ )
28- {
29- memcpy( buf+x, src+x, 16 );
30- if( transform[x] )
31- {
32- int nnz = src[x][0] | src[x][1];
33- src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
34- nnz = src[x][2] | src[x][3];
35- src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
36- }
37- }
38-}
39-
40-static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
41-{
42- uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
43- for( int x = 0; x < h->sps->i_mb_width; x++ )
44- memcpy( dst+x, buf+x, 16 );
45-}
46-
47-static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
48-{
49- func( h, mb_y, buf );
50- if( mb_y > 0 )
51- func( h, mb_y-1, buf + h->sps->i_mb_width );
52- if( h->sh.b_mbaff )
53- {
54- func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
55- if( mb_y > 0 )
56- func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
57- }
58-}
59-
60-
61 /* Deblocking filter */
62 static const uint8_t i_alpha_table[52+12*2] =
63 {
64@@ -344,10 +304,6 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
65 int stride2y = stridey << b_interlaced;
66 int strideuv = h->fdec->i_stride[1];
67 int stride2uv = strideuv << b_interlaced;
68- uint8_t (*nnz_backup)[16] = h->scratch_buffer;
69-
70- if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
71- munge_cavlc_nnz( h, mb_y, nnz_backup, munge_cavlc_nnz_row );
72
73 for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
74 {
75@@ -427,9 +383,6 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
76 if( !transform_8x8 ) FILTER( , 1, 3, qp, qpc );
77 }
78 }
79-
80- if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
81- munge_cavlc_nnz( h, mb_y, nnz_backup, restore_cavlc_nnz_row );
82 }
83
84 #ifdef HAVE_MMX
85diff --git a/common/macroblock.c b/common/macroblock.c
86index ce510e9..01c90d2 100644
87--- a/common/macroblock.c
88+++ b/common/macroblock.c
89@@ -344,8 +344,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
90 int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
91 int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
92 ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
93- int buf_nnz = !h->param.b_cabac * h->pps->b_transform_8x8_mode * (h->sps->i_mb_width * 4 * 16 * sizeof(uint8_t));
94- scratch_size = X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_nnz );
95+ scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
96 }
97 int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
98 scratch_size = X264_MAX( scratch_size, buf_mbtree );
99@@ -1013,6 +1012,49 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
100 M32( &h->mb.cache.ref[0][x264_scan8[0]+8*2] ) = refbot;
101 M32( &h->mb.cache.ref[0][x264_scan8[0]+8*3] ) = refbot;
102 }
103+
104+ /* Munge NNZ for cavlc + 8x8dct */
105+ if( !h->param.b_cabac && h->pps->b_transform_8x8_mode )
106+ {
107+ uint8_t (*nnz)[24] = h->mb.non_zero_count;
108+ int top = h->mb.i_mb_top_xy;
109+ int left = h->mb.i_mb_left_xy;
110+
111+ if( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top] )
112+ {
113+ int i8 = x264_scan8[0] - 8;
114+ int nnz_top0 = M16( &nnz[top][8] ) | M16( &nnz[top][12] );
115+ int nnz_top1 = M16( &nnz[top][10] ) | M16( &nnz[top][14] );
116+ M16( &h->mb.cache.non_zero_count[i8+0] ) = nnz_top0 ? 0x0101 : 0;
117+ M16( &h->mb.cache.non_zero_count[i8+2] ) = nnz_top1 ? 0x0101 : 0;
118+ }
119+
120+ if( (h->mb.i_neighbour & MB_LEFT) && h->mb.mb_transform_size[left] )
121+ {
122+ int i8 = x264_scan8[0] - 1;
123+ int nnz_left0 = M16( &nnz[left][2] ) | M16( &nnz[left][6] );
124+ int nnz_left1 = M16( &nnz[left][10] ) | M16( &nnz[left][14] );
125+ h->mb.cache.non_zero_count[i8+8*0] = !!nnz_left0;
126+ h->mb.cache.non_zero_count[i8+8*1] = !!nnz_left0;
127+ h->mb.cache.non_zero_count[i8+8*2] = !!nnz_left1;
128+ h->mb.cache.non_zero_count[i8+8*3] = !!nnz_left1;
129+ }
130+
131+ if( h->mb.mb_transform_size[h->mb.i_mb_xy] )
132+ {
133+ int nnz0 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
134+ int nnz1 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 4]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 6]] );
135+ int nnz2 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[10]] );
136+ int nnz3 = M16( &h->mb.cache.non_zero_count[x264_scan8[12]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[14]] );
137+ uint32_t nnztop = pack16to32( !!nnz0, !!nnz1 ) * 0x0101;
138+ uint32_t nnzbot = pack16to32( !!nnz2, !!nnz3 ) * 0x0101;
139+
140+ M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*0] ) = nnztop;
141+ M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*1] ) = nnztop;
142+ M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*2] ) = nnzbot;
143+ M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*3] ) = nnzbot;
144+ }
145+ }
146 }
147
148 static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i )
149--
1501.7.0.4
151
152
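A note on the nnz bookkeeping the patch above touches: with CAVLC and the 8x8 transform, non-zero-coefficient counts are stored per 4x4 block for entropy coding, but the deblocker only cares whether the covering 8x8 block has any coefficients at all. The old munge_cavlc_nnz_row/restore_cavlc_nnz_row pair rewrote whole rows of the frame-wide array and copied them back; the new code derives the same values per macroblock while filling the deblock cache. The following is a minimal standalone sketch of that reduction, with an illustrative raster layout rather than x264's actual arrays:

#include <stdio.h>
#include <stdint.h>

/* nnz4x4: 16 per-4x4 non-zero coefficient counts of one macroblock, raster order.
 * out:    the "deblock view": every 4x4 position inside an 8x8 block becomes 1
 *         if that 8x8 block has any non-zero coefficient, else 0. */
static void nnz_deblock_view( const uint8_t nnz4x4[16], uint8_t out[16] )
{
    for( int b8 = 0; b8 < 4; b8++ )
    {
        int x = (b8&1)*2, y = (b8>>1)*2;
        int any = nnz4x4[y*4+x]     | nnz4x4[y*4+x+1]
                | nnz4x4[(y+1)*4+x] | nnz4x4[(y+1)*4+x+1];
        uint8_t v = any ? 1 : 0;
        out[y*4+x] = out[y*4+x+1] = out[(y+1)*4+x] = out[(y+1)*4+x+1] = v;
    }
}

int main(void)
{
    /* One coefficient in the top-right 8x8 and one in the bottom-left 8x8. */
    const uint8_t nnz[16] = { 0,0,0,3, 0,0,0,0, 1,0,0,0, 0,0,0,0 };
    uint8_t view[16];
    nnz_deblock_view( nnz, view );
    for( int i = 0; i < 16; i++ )
        printf( "%d%c", view[i], (i&3) == 3 ? '\n' : ' ' );
    return 0;
}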
153From d51fde592507649e22757a23f0ea0252ec35b5b6 Mon Sep 17 00:00:00 2001
154From: Anton Mitrofanov <BugMaster@narod.ru>
155Date: Mon, 31 May 2010 22:36:50 +0400
156Subject: [PATCH 02/11] Fix crash with MP4-muxing if zero frames were encoded
157
158---
159 output/mp4.c | 3 ++-
160 1 files changed, 2 insertions(+), 1 deletions(-)
161
162diff --git a/output/mp4.c b/output/mp4.c
163index f76541e..0aa5070 100644
164--- a/output/mp4.c
165+++ b/output/mp4.c
166@@ -112,6 +112,7 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
167 if( p_mp4->p_sample->data )
168 free( p_mp4->p_sample->data );
169
170+ p_mp4->p_sample->dataLength = 0;
171 gf_isom_sample_del( &p_mp4->p_sample );
172 }
173
174@@ -135,7 +136,7 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
175 * The reason is that an Edit Box maps the presentation time-line to the media time-line.
176 * Any demuxers should follow the Edit Box if it exists. */
177 GF_ISOSample *sample = gf_isom_get_sample_info( p_mp4->p_file, p_mp4->i_track, 1, NULL, NULL );
178- if( sample->CTS_Offset > 0 )
179+ if( sample && sample->CTS_Offset > 0 )
180 {
181 uint32_t mvhd_timescale = gf_isom_get_timescale( p_mp4->p_file );
182 uint64_t tkhd_duration = (uint64_t)( mdhd_duration * ( (double)mvhd_timescale / p_mp4->i_time_res ) );
183--
1841.7.0.4
185
186
187From 8098997dcba2602b22b43fdf26621d08d3f81333 Mon Sep 17 00:00:00 2001
188From: Jason Garrett-Glaser <darkshikari@gmail.com>
189Date: Sun, 30 May 2010 09:42:53 -0700
190Subject: [PATCH 03/11] Fix ultrafast to actually turn off weightb
191
192---
193 common/common.c | 1 +
194 1 files changed, 1 insertions(+), 0 deletions(-)
195
196diff --git a/common/common.c b/common/common.c
197index 62bef99..fccf2b0 100644
198--- a/common/common.c
199+++ b/common/common.c
200@@ -183,6 +183,7 @@ static int x264_param_apply_preset( x264_param_t *param, const char *preset )
201 param->i_bframe_adaptive = X264_B_ADAPT_NONE;
202 param->rc.b_mb_tree = 0;
203 param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
204+ param->analyse.b_weighted_bipred = 0;
205 }
206 else if( !strcasecmp( preset, "superfast" ) )
207 {
208--
2091.7.0.4
210
211
212From a7f870990af39a11f3bb883b9335baad91909ccb Mon Sep 17 00:00:00 2001
213From: Jason Garrett-Glaser <darkshikari@gmail.com>
214Date: Thu, 27 May 2010 12:31:41 -0700
215Subject: [PATCH 04/11] Fix omission in libx264 tuning documentation
216
217---
218 x264.h | 2 +-
219 1 files changed, 1 insertions(+), 1 deletions(-)
220
221diff --git a/x264.h b/x264.h
222index 6d7b703..95efd88 100644
223--- a/x264.h
224+++ b/x264.h
225@@ -446,7 +446,7 @@ static const char * const x264_tune_names[] = { "film", "animation", "grain", "s
226
227 /* Multiple tunings can be used if separated by a delimiter in ",./-+",
228 * however multiple psy tunings cannot be used.
229- * film, animation, grain, psnr, and ssim are psy tunings.
230+ * film, animation, grain, stillimage, psnr, and ssim are psy tunings.
231 *
232 * returns 0 on success, negative on failure (e.g. invalid preset/tune name). */
233 int x264_param_default_preset( x264_param_t *, const char *preset, const char *tune );
234--
2351.7.0.4
236
237
238From 5832bdfaed3bcce1b2823b6594386e0357d8ff31 Mon Sep 17 00:00:00 2001
239From: Jason Garrett-Glaser <darkshikari@gmail.com>
240Date: Wed, 26 May 2010 12:55:35 -0700
241Subject: [PATCH 05/11] Merge some of adaptive quant and weightp
242 Eliminate redundant work; both of them were calculating variance of the frame.
243
244---
245 common/frame.h | 4 +-
246 encoder/analyse.h | 1 -
247 encoder/encoder.c | 12 ++---
248 encoder/ratecontrol.c | 124 +++++++++++++++++++++++++++++++-----------------
249 encoder/slicetype.c | 31 ++----------
250 5 files changed, 92 insertions(+), 80 deletions(-)
251
252diff --git a/common/frame.h b/common/frame.h
253index 91d27b5..ca5cb7a 100644
254--- a/common/frame.h
255+++ b/common/frame.h
256@@ -118,8 +118,8 @@ typedef struct x264_frame
257 uint16_t *i_inv_qscale_factor;
258 int b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */
259 float f_weighted_cost_delta[X264_BFRAME_MAX+2];
260- uint32_t i_pixel_sum;
261- uint64_t i_pixel_ssd;
262+ uint32_t i_pixel_sum[3];
263+ uint64_t i_pixel_ssd[3];
264
265 /* hrd */
266 x264_hrd_t hrd_timing;
267diff --git a/encoder/analyse.h b/encoder/analyse.h
268index 7c2c22c..53e4c2e 100644
269--- a/encoder/analyse.h
270+++ b/encoder/analyse.h
271@@ -33,7 +33,6 @@ void x264_slicetype_decide( x264_t *h );
272 void x264_slicetype_analyse( x264_t *h, int keyframe );
273
274 int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
275-void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame );
276
277 int x264_lookahead_init( x264_t *h, int i_slicetype_length );
278 int x264_lookahead_is_empty( x264_t *h );
279diff --git a/encoder/encoder.c b/encoder/encoder.c
280index 52017ff..6e0dc54 100644
281--- a/encoder/encoder.c
282+++ b/encoder/encoder.c
283@@ -2246,21 +2246,17 @@ int x264_encoder_encode( x264_t *h,
284 fenc->i_pic_struct = PIC_STRUCT_PROGRESSIVE;
285 }
286
287- if( h->frames.b_have_lowres )
288- {
289- if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
290- x264_weight_plane_analyse( h, fenc );
291- x264_frame_init_lowres( h, fenc );
292- }
293-
294 if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
295 {
296 if( x264_macroblock_tree_read( h, fenc ) )
297 return -1;
298 }
299- else if( h->param.rc.i_aq_mode )
300+ else
301 x264_adaptive_quant_frame( h, fenc );
302
303+ if( h->frames.b_have_lowres )
304+ x264_frame_init_lowres( h, fenc );
305+
306 /* 2: Place the frame into the queue for its slice type decision */
307 x264_lookahead_put_frame( h, fenc );
308
309diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
310index a725a24..bf0a400 100644
311--- a/encoder/ratecontrol.c
312+++ b/encoder/ratecontrol.c
313@@ -215,12 +215,14 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
314 stride <<= h->mb.b_interlaced;
315 uint64_t res = h->pixf.var[pix]( frame->plane[i] + offset, stride );
316 uint32_t sum = (uint32_t)res;
317- uint32_t sqr = res >> 32;
318- return sqr - (sum * sum >> shift);
319+ uint32_t ssd = res >> 32;
320+ frame->i_pixel_sum[i] += sum;
321+ frame->i_pixel_ssd[i] += ssd;
322+ return ssd - (sum * sum >> shift);
323 }
324
325 // Find the total AC energy of the block in all planes.
326-static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
327+static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
328 {
329 /* This function contains annoying hacks because GCC has a habit of reordering emms
330 * and putting it after floating point ops. As a result, we put the emms at the end of the
331@@ -239,56 +241,90 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
332 * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
333 float strength;
334 float avg_adj = 0.f;
335- /* Need to init it anyways for MB tree. */
336- if( h->param.rc.f_aq_strength == 0 )
337- {
338- memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
339- memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
340- if( h->frames.b_have_lowres )
341- for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
342- frame->i_inv_qscale_factor[mb_xy] = 256;
343- return;
344+ int width = h->sps->i_mb_width;
345+ int height = h->sps->i_mb_height;
346+ /* Initialize frame stats */
347+ for( int i = 0; i < 3; i++ )
348+ {
349+ frame->i_pixel_sum[i] = 0;
350+ frame->i_pixel_ssd[i] = 0;
351 }
352
353- if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
354+ /* Degenerate cases */
355+ if( h->param.rc.i_aq_mode == X264_AQ_NONE || h->param.rc.f_aq_strength == 0 )
356 {
357- float avg_adj_pow2 = 0.f;
358- for( int mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
359- for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
360- {
361- uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
362- float qp_adj = powf( energy + 1, 0.125f );
363- frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
364- avg_adj += qp_adj;
365- avg_adj_pow2 += qp_adj * qp_adj;
366- }
367- avg_adj /= h->mb.i_mb_count;
368- avg_adj_pow2 /= h->mb.i_mb_count;
369- strength = h->param.rc.f_aq_strength * avg_adj;
370- avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
371+ /* Need to init it anyways for MB tree */
372+ if( h->param.rc.f_aq_strength == 0 )
373+ {
374+ memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
375+ memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
376+ if( h->frames.b_have_lowres )
377+ for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
378+ frame->i_inv_qscale_factor[mb_xy] = 256;
379+ }
380+ /* Need variance data for weighted prediction */
381+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
382+ {
383+ for( int mb_y = 0; mb_y < height; mb_y++ )
384+ for( int mb_x = 0; mb_x < width; mb_x++ )
385+ x264_ac_energy_mb( h, mb_x, mb_y, frame );
386+ }
387+ else
388+ return;
389 }
390+ /* Actual adaptive quantization */
391 else
392- strength = h->param.rc.f_aq_strength * 1.0397f;
393-
394- for( int mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
395- for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
396+ {
397+ if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
398 {
399- float qp_adj;
400- if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
401- {
402- qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
403- qp_adj = strength * (qp_adj - avg_adj);
404- }
405- else
406+ float avg_adj_pow2 = 0.f;
407+ for( int mb_y = 0; mb_y < height; mb_y++ )
408+ for( int mb_x = 0; mb_x < width; mb_x++ )
409+ {
410+ uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
411+ float qp_adj = powf( energy + 1, 0.125f );
412+ frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
413+ avg_adj += qp_adj;
414+ avg_adj_pow2 += qp_adj * qp_adj;
415+ }
416+ avg_adj /= h->mb.i_mb_count;
417+ avg_adj_pow2 /= h->mb.i_mb_count;
418+ strength = h->param.rc.f_aq_strength * avg_adj;
419+ avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
420+ }
421+ else
422+ strength = h->param.rc.f_aq_strength * 1.0397f;
423+
424+ for( int mb_y = 0; mb_y < height; mb_y++ )
425+ for( int mb_x = 0; mb_x < width; mb_x++ )
426 {
427- uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
428- qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
429+ float qp_adj;
430+ if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
431+ {
432+ qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
433+ qp_adj = strength * (qp_adj - avg_adj);
434+ }
435+ else
436+ {
437+ uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
438+ qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
439+ }
440+ frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
441+ frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
442+ if( h->frames.b_have_lowres )
443+ frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
444 }
445- frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
446- frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
447- if( h->frames.b_have_lowres )
448- frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
449- }
450+ }
451+
452+ /* Remove mean from SSD calculation */
453+ for( int i = 0; i < 3; i++ )
454+ {
455+ uint64_t ssd = frame->i_pixel_ssd[i];
456+ uint64_t sum = frame->i_pixel_sum[i];
457+ int w = width*16>>!!i;
458+ int h = height*16>>!!i;
459+ frame->i_pixel_ssd[i] = ssd - (sum * sum + w * h / 2) / (w * h);
460+ }
461 }
462
463 int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
464diff --git a/encoder/slicetype.c b/encoder/slicetype.c
465index 9352367..e454e12 100644
466--- a/encoder/slicetype.c
467+++ b/encoder/slicetype.c
468@@ -67,25 +67,6 @@ static void x264_weight_get_h264( unsigned int weight_nonh264, int offset, x264_
469 w->i_scale = X264_MIN( w->i_scale, 127 );
470 }
471
472-void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame )
473-{
474- uint32_t sad = 0;
475- uint64_t ssd = 0;
476- uint8_t *p = frame->plane[0];
477- int stride = frame->i_stride[0];
478- int width = frame->i_width[0];
479- int height = frame->i_lines[0];
480- for( int y = 0; y < height>>4; y++, p += stride*16 )
481- for( int x = 0; x < width; x += 16 )
482- {
483- uint64_t res = h->pixf.var[PIXEL_16x16]( p + x, stride );
484- sad += (uint32_t)res;
485- ssd += res >> 32;
486- }
487- frame->i_pixel_sum = sad;
488- frame->i_pixel_ssd = ssd - ((uint64_t)sad * sad + width * height / 2) / (width * height);
489-}
490-
491 static NOINLINE uint8_t *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, uint8_t *dest )
492 {
493 int ref0_distance = fenc->i_frame - ref->i_frame - 1;
494@@ -167,10 +148,10 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
495 int found;
496 x264_weight_t *weights = fenc->weight[0];
497
498- fenc_var = round( sqrt( fenc->i_pixel_ssd ) );
499- ref_var = round( sqrt( ref->i_pixel_ssd ) );
500- fenc_mean = (float)fenc->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
501- ref_mean = (float) ref->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
502+ fenc_var = round( sqrt( fenc->i_pixel_ssd[0] ) );
503+ ref_var = round( sqrt( ref->i_pixel_ssd[0] ) );
504+ fenc_mean = (float)fenc->i_pixel_sum[0] / (fenc->i_lines[0] * fenc->i_width[0]);
505+ ref_mean = (float) ref->i_pixel_sum[0] / (fenc->i_lines[0] * fenc->i_width[0]);
506
507 //early termination
508 if( fabs( ref_mean - fenc_mean ) < 0.5 && fabs( 1 - fenc_var / ref_var ) < epsilon )
509@@ -534,8 +515,8 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
510 do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
511 if( do_search[0] )
512 {
513- if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART
514- || h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
515+ if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART ||
516+ h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
517 {
518 x264_emms();
519 x264_weights_analyse( h, frames[b], frames[p0], 1 );
520--
5211.7.0.4
522
523
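The redundancy the patch above removes is easy to see in isolation: adaptive quantization and weighted prediction both need the frame's pixel sum and sum of squares, which the var[] primitive already produces per 16x16 block. A single-plane sketch of the shared statistic, simplified from what ac_energy_plane accumulates and assuming plain C arithmetic in place of the packed 64-bit sum/ssd return:

#include <stdio.h>
#include <stdint.h>

/* Accumulate sum and sum-of-squares over a w x h 8-bit plane. */
static void plane_stats( const uint8_t *pix, int stride, int w, int h,
                         uint64_t *sum, uint64_t *ssd )
{
    *sum = *ssd = 0;
    for( int y = 0; y < h; y++ )
        for( int x = 0; x < w; x++ )
        {
            uint64_t v = pix[y*stride + x];
            *sum += v;
            *ssd += v * v;
        }
}

int main(void)
{
    enum { W = 32, H = 32 };
    static uint8_t plane[W*H];
    for( int i = 0; i < W*H; i++ )
        plane[i] = (uint8_t)(i * 7);

    uint64_t sum, ssd;
    plane_stats( plane, W, W, H, &sum, &ssd );

    /* Same mean removal as the end of x264_adaptive_quant_frame in the patch:
     * i_pixel_ssd becomes ssd - round(sum^2 / N), i.e. N times the variance,
     * which is what x264_weights_analyse then takes sqrt() of. */
    uint64_t n = (uint64_t)W * H;
    uint64_t ssd_minus_mean = ssd - (sum * sum + n/2) / n;

    printf( "sum=%llu ssd=%llu ssd-mean=%llu\n",
            (unsigned long long)sum, (unsigned long long)ssd,
            (unsigned long long)ssd_minus_mean );
    return 0;
}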
524From 794713a35eadcd999d5aab4a50274ca43f29be93 Mon Sep 17 00:00:00 2001
525From: Jason Garrett-Glaser <darkshikari@gmail.com>
526Date: Thu, 27 May 2010 10:42:15 -0700
527Subject: [PATCH 06/11] Add fast skip in lookahead motion search
528 Helps speed very significantly on motionless blocks.
529
530---
531 encoder/slicetype.c | 16 +++++++++++++++-
532 1 files changed, 15 insertions(+), 1 deletions(-)
533
534diff --git a/encoder/slicetype.c b/encoder/slicetype.c
535index e454e12..d7cfe5c 100644
536--- a/encoder/slicetype.c
537+++ b/encoder/slicetype.c
538@@ -379,11 +379,25 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
539 CP32( m[l].mvp, mvc[0] );
540 else
541 x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] );
542- x264_me_search( h, &m[l], mvc, i_mvc );
543
544+ /* Fast skip for cases of near-zero residual. Shortcut: don't bother except in the mv0 case,
545+ * since anything else is likely to have enough residual to not trigger the skip. */
546+ if( !M32( m[l].mvp ) )
547+ {
548+ m[l].cost = h->pixf.mbcmp[PIXEL_8x8]( m[l].p_fenc[0], FENC_STRIDE, m[l].p_fref[0], m[l].i_stride[0] );
549+ if( m[l].cost < 64 )
550+ {
551+ M32( m[l].mv ) = 0;
552+ goto skip_motionest;
553+ }
554+ }
555+
556+ x264_me_search( h, &m[l], mvc, i_mvc );
557 m[l].cost -= 2; // remove mvcost from skip mbs
558 if( M32( m[l].mv ) )
559 m[l].cost += 5;
560+
561+skip_motionest:
562 CP32( fenc_mvs[l], m[l].mv );
563 *fenc_costs[l] = m[l].cost;
564 }
565--
5661.7.0.4
567
568
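The shortcut in the patch above is simply: when the MV predictor is zero, price the zero-MV candidate first and skip the full lowres search if the 8x8 cost is already below the threshold of 64. A self-contained sketch of that control flow, with hypothetical helper names and a plain C SAD standing in for h->pixf.mbcmp:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static int sad_8x8( const uint8_t *a, int ia, const uint8_t *b, int ib )
{
    int sad = 0;
    for( int y = 0; y < 8; y++ )
        for( int x = 0; x < 8; x++ )
            sad += abs( a[y*ia + x] - b[y*ib + x] );
    return sad;
}

/* Returns the block cost and writes the chosen MV; only falls through to the
 * (stubbed) full search when the zero-MV shortcut doesn't fire. */
static int lowres_me( const uint8_t *fenc, const uint8_t *fref, int stride,
                      const int16_t mvp[2], int16_t mv[2] )
{
    if( !mvp[0] && !mvp[1] )
    {
        int cost = sad_8x8( fenc, stride, fref, stride );
        if( cost < 64 )             /* same threshold as the patch */
        {
            mv[0] = mv[1] = 0;
            return cost;            /* the patch's skip_motionest path */
        }
    }
    /* ... the real code would run x264_me_search() here ... */
    mv[0] = mv[1] = 0;
    return sad_8x8( fenc, stride, fref, stride );
}

int main(void)
{
    uint8_t fenc[8*8], fref[8*8];
    memset( fenc, 128, sizeof(fenc) );
    memcpy( fref, fenc, sizeof(fref) );     /* motionless block */
    const int16_t mvp[2] = { 0, 0 };
    int16_t mv[2];
    int cost = lowres_me( fenc, fref, 8, mvp, mv );
    printf( "cost=%d mv=(%d,%d)\n", cost, mv[0], mv[1] );
    return 0;
}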
569From 77b568b22d42baa344dad050aef420de3b22e126 Mon Sep 17 00:00:00 2001
570From: Henrik Gramner <hengar-6@student.ltu.se>
571Date: Thu, 27 May 2010 22:18:38 +0200
572Subject: [PATCH 07/11] Optimize out some x264_scan8 reads
573
574---
575 encoder/analyse.c | 15 ++++-----
576 encoder/macroblock.c | 82 ++++++++++++++++++++++++++++++--------------------
577 encoder/me.c | 25 ++++++++-------
578 3 files changed, 70 insertions(+), 52 deletions(-)
579
580diff --git a/encoder/analyse.c b/encoder/analyse.c
581index a128a70..9e85e89 100644
582--- a/encoder/analyse.c
583+++ b/encoder/analyse.c
584@@ -907,8 +907,6 @@ static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
585 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
586 {
587 uint8_t *p_dst = h->mb.pic.p_fdec[0];
588-
589- int x, y;
590 uint64_t i_satd, i_best;
591 h->mb.i_skip_intra = 0;
592
593@@ -1031,8 +1029,9 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
594 int i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
595
596 i_best = COST_MAX64;
597- x = idx&1;
598- y = idx>>1;
599+ int x = idx&1;
600+ int y = idx>>1;
601+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
602
603 p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
604 predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
605@@ -1061,8 +1060,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
606 if( !(idx&1) )
607 for( int j = 0; j < 7; j++ )
608 pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
609- i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
610- i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
611+ i_nnz[0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8] );
612+ i_nnz[1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8] );
613 }
614 }
615 a->i_cbp_i8x8_luma = cbp_luma_new;
616@@ -1070,8 +1069,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
617 if( !(idx&1) )
618 for( int j = 0; j < 7; j++ )
619 p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
620- M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
621- M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
622+ M16( &h->mb.cache.non_zero_count[s8 + 0*8] ) = i_nnz[0];
623+ M16( &h->mb.cache.non_zero_count[s8 + 1*8] ) = i_nnz[1];
624
625 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
626 }
627diff --git a/encoder/macroblock.c b/encoder/macroblock.c
628index 984f8a8..cdc4563 100644
629--- a/encoder/macroblock.c
630+++ b/encoder/macroblock.c
631@@ -135,11 +135,12 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
632 }
633 }
634
635-#define STORE_8x8_NNZ(idx,nz)\
636+#define STORE_8x8_NNZ( s8, nz )\
637+do\
638 {\
639- M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] ) = nz * 0x0101;\
640- M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] ) = nz * 0x0101;\
641-}
642+ M16( &h->mb.cache.non_zero_count[(s8) + 0*8] ) = (nz) * 0x0101;\
643+ M16( &h->mb.cache.non_zero_count[(s8) + 1*8] ) = (nz) * 0x0101;\
644+} while(0)
645
646 #define CLEAR_16x16_NNZ \
647 {\
648@@ -151,17 +152,18 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
649
650 void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
651 {
652- int x = 8 * (idx&1);
653- int y = 8 * (idx>>1);
654+ int x = idx&1;
655+ int y = idx>>1;
656+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
657 int nz;
658- uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
659- uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
660+ uint8_t *p_src = &h->mb.pic.p_fenc[0][8*x + 8*y*FENC_STRIDE];
661+ uint8_t *p_dst = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
662 ALIGNED_ARRAY_16( int16_t, dct8x8,[64] );
663
664 if( h->mb.b_lossless )
665 {
666 nz = h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
667- STORE_8x8_NNZ(idx,nz);
668+ STORE_8x8_NNZ( s8, nz );
669 h->mb.i_cbp_luma |= nz<<idx;
670 return;
671 }
672@@ -175,10 +177,10 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
673 h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
674 h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
675 h->dctf.add8x8_idct8( p_dst, dct8x8 );
676- STORE_8x8_NNZ(idx,1);
677+ STORE_8x8_NNZ( s8, 1 );
678 }
679 else
680- STORE_8x8_NNZ(idx,0);
681+ STORE_8x8_NNZ( s8, 0 );
682 }
683
684 static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
685@@ -728,12 +730,13 @@ void x264_macroblock_encode( x264_t *h )
686 if( h->mb.b_transform_8x8 )
687 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
688 {
689- int x = 8*(i8x8&1);
690- int y = 8*(i8x8>>1);
691- nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8],
692- h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
693- h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
694- STORE_8x8_NNZ(i8x8,nz);
695+ int x = i8x8&1;
696+ int y = i8x8>>1;
697+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
698+
699+ nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8], h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE,
700+ h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE );
701+ STORE_8x8_NNZ( s8, nz );
702 h->mb.i_cbp_luma |= nz << i8x8;
703 }
704 else
705@@ -783,14 +786,18 @@ void x264_macroblock_encode( x264_t *h )
706 {
707 for( int idx = 0; idx < 4; idx++ )
708 {
709+ int x = idx&1;
710+ int y = idx>>1;
711+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
712+
713 if( h->mb.i_cbp_luma&(1<<idx) )
714 {
715 h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
716- h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*FDEC_STRIDE], dct8x8[idx] );
717- STORE_8x8_NNZ(idx,1);
718+ h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], dct8x8[idx] );
719+ STORE_8x8_NNZ( s8, 1 );
720 }
721 else
722- STORE_8x8_NNZ(idx,0);
723+ STORE_8x8_NNZ( s8, 0 );
724 }
725 }
726 }
727@@ -825,18 +832,24 @@ void x264_macroblock_encode( x264_t *h )
728 }
729 }
730
731+ int x = i8x8&1;
732+ int y = i8x8>>1;
733+
734 /* decimate this 8x8 block */
735 i_decimate_mb += i_decimate_8x8;
736 if( b_decimate )
737 {
738 if( i_decimate_8x8 < 4 )
739- STORE_8x8_NNZ(i8x8,0)
740+ {
741+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
742+ STORE_8x8_NNZ( s8, 0 );
743+ }
744 else
745 h->mb.i_cbp_luma |= 1<<i8x8;
746 }
747 else if( cbp )
748 {
749- h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
750+ h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], &dct4x4[i8x8*4] );
751 h->mb.i_cbp_luma |= 1<<i8x8;
752 }
753 }
754@@ -1045,8 +1058,11 @@ void x264_noise_reduction_update( x264_t *h )
755 void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
756 {
757 int i_qp = h->mb.i_qp;
758- uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE;
759- uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
760+ int x = i8&1;
761+ int y = i8>>1;
762+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
763+ uint8_t *p_fenc = h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE;
764+ uint8_t *p_fdec = h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE;
765 int b_decimate = h->mb.b_dct_decimate;
766 int nnz8x8 = 0;
767 int nz;
768@@ -1059,7 +1075,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
769 if( h->mb.b_transform_8x8 )
770 {
771 nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
772- STORE_8x8_NNZ(i8,nnz8x8);
773+ STORE_8x8_NNZ( s8, nnz8x8 );
774 }
775 else
776 {
777@@ -1075,8 +1091,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
778 for( int ch = 0; ch < 2; ch++ )
779 {
780 int16_t dc;
781- p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
782- p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
783+ p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
784+ p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
785 nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec, &dc );
786 h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
787 }
788@@ -1099,13 +1115,13 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
789 {
790 h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
791 h->dctf.add8x8_idct8( p_fdec, dct8x8 );
792- STORE_8x8_NNZ(i8,1);
793+ STORE_8x8_NNZ( s8, 1 );
794 }
795 else
796- STORE_8x8_NNZ(i8,0);
797+ STORE_8x8_NNZ( s8, 0 );
798 }
799 else
800- STORE_8x8_NNZ(i8,0);
801+ STORE_8x8_NNZ( s8, 0 );
802 }
803 else
804 {
805@@ -1132,7 +1148,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
806 if( nnz8x8 )
807 h->dctf.add8x8_idct( p_fdec, dct4x4 );
808 else
809- STORE_8x8_NNZ(i8,0);
810+ STORE_8x8_NNZ( s8, 0 );
811 }
812
813 i_qp = h->mb.i_chroma_qp;
814@@ -1140,8 +1156,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
815 for( int ch = 0; ch < 2; ch++ )
816 {
817 ALIGNED_ARRAY_16( int16_t, dct4x4,[16] );
818- p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
819- p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
820+ p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
821+ p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
822
823 h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
824 dct4x4[0] = 0;
825diff --git a/encoder/me.c b/encoder/me.c
826index 77073cc..40d0650 100644
827--- a/encoder/me.c
828+++ b/encoder/me.c
829@@ -937,8 +937,11 @@ int x264_iter_kludge = 0;
830
831 static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2, int rd )
832 {
833- int16_t *cache0_mv = h->mb.cache.mv[0][x264_scan8[i8*4]];
834- int16_t *cache1_mv = h->mb.cache.mv[1][x264_scan8[i8*4]];
835+ int x = i8&1;
836+ int y = i8>>1;
837+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
838+ int16_t *cache0_mv = h->mb.cache.mv[0][s8];
839+ int16_t *cache1_mv = h->mb.cache.mv[1][s8];
840 const int i_pixel = m0->i_pixel;
841 const int bw = x264_pixel_size[i_pixel].w;
842 const int bh = x264_pixel_size[i_pixel].h;
843@@ -946,11 +949,11 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
844 ALIGNED_ARRAY_8( uint8_t, pixu_buf,[2],[9][8*8] );
845 ALIGNED_ARRAY_8( uint8_t, pixv_buf,[2],[9][8*8] );
846 uint8_t *src[2][9];
847- uint8_t *pix = &h->mb.pic.p_fdec[0][(i8>>1)*8*FDEC_STRIDE+(i8&1)*8];
848- uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
849- uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
850- const int ref0 = h->mb.cache.ref[0][x264_scan8[i8*4]];
851- const int ref1 = h->mb.cache.ref[1][x264_scan8[i8*4]];
852+ uint8_t *pix = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
853+ uint8_t *pixu = &h->mb.pic.p_fdec[1][4*x + 4*y*FDEC_STRIDE];
854+ uint8_t *pixv = &h->mb.pic.p_fdec[2][4*x + 4*y*FDEC_STRIDE];
855+ int ref0 = h->mb.cache.ref[0][s8];
856+ int ref1 = h->mb.cache.ref[1][s8];
857 const int mv0y_offset = h->mb.b_interlaced & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
858 const int mv1y_offset = h->mb.b_interlaced & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
859 int stride[2][9];
860@@ -1058,13 +1061,13 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
861
862 if( rd )
863 {
864- x264_macroblock_cache_mv ( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) );
865+ x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) );
866 amvd = pack8to16( X264_MIN(abs(bm0x - m0->mvp[0]),33), X264_MIN(abs(bm0y - m0->mvp[1]),33) );
867- x264_macroblock_cache_mvd( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 0, amvd );
868+ x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 0, amvd );
869
870- x264_macroblock_cache_mv ( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) );
871+ x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) );
872 amvd = pack8to16( X264_MIN(abs(bm1x - m1->mvp[0]),33), X264_MIN(abs(bm1y - m1->mvp[1]),33) );
873- x264_macroblock_cache_mvd( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 1, amvd );
874+ x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 1, amvd );
875 }
876
877 m0->mv[0] = bm0x;
878--
8791.7.0.4
880
881
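The identity the patch above relies on: for 8x8 block index i8 with x = i8&1 and y = i8>>1, the cache offset X264_SCAN8_0 + 2*x + 16*y equals x264_scan8[4*i8], because the nnz/mv cache is laid out 8 entries per row with luma block 0 at offset 12. A quick standalone check against a copy of the contemporary luma scan8 layout (an assumption worth verifying against the tree you are reading):

#include <assert.h>
#include <stdio.h>

#define X264_SCAN8_0 (4+1*8)    /* == 12, offset of luma block 0 in the cache */

/* Luma portion of x264_scan8[]: 4x4 blocks in coding order, mapped into an
 * 8-entries-per-row cache. */
static const int scan8[16] =
{
    4+1*8, 5+1*8, 4+2*8, 5+2*8,
    6+1*8, 7+1*8, 6+2*8, 7+2*8,
    4+3*8, 5+3*8, 4+4*8, 5+4*8,
    6+3*8, 7+3*8, 6+4*8, 7+4*8,
};

int main(void)
{
    for( int i8 = 0; i8 < 4; i8++ )
    {
        int x = i8&1, y = i8>>1;
        int s8 = X264_SCAN8_0 + 2*x + 16*y;
        assert( s8     == scan8[4*i8]   );  /* top-left 4x4 of the 8x8 block   */
        assert( s8 + 1 == scan8[4*i8+1] );  /* its right neighbour             */
        assert( s8 + 8 == scan8[4*i8+2] );  /* the row below (cache is 8 wide) */
        assert( s8 + 9 == scan8[4*i8+3] );
        printf( "i8=%d -> s8=%d\n", i8, s8 );
    }
    return 0;
}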
882From 0c7cf0bfb1d30ee8e7f1b355fef5aa9e2db929d2 Mon Sep 17 00:00:00 2001
883From: Henrik Gramner <hengar-6@student.ltu.se>
884Date: Sun, 30 May 2010 22:45:14 +0200
885Subject: [PATCH 08/11] Some deblocking-related optimizations
886
887---
888 common/deblock.c | 8 ++++----
889 common/macroblock.c | 43 +++++++++++++++++++++++--------------------
890 2 files changed, 27 insertions(+), 24 deletions(-)
891
892diff --git a/common/deblock.c b/common/deblock.c
893index 27c73ae..3296dbf 100644
894--- a/common/deblock.c
895+++ b/common/deblock.c
896@@ -299,7 +299,7 @@ static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2,
897 void x264_frame_deblock_row( x264_t *h, int mb_y )
898 {
899 int b_interlaced = h->sh.b_mbaff;
900- int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
901+ int qp_thresh = 15 - X264_MIN( h->sh.i_alpha_c0_offset, h->sh.i_beta_offset ) - X264_MAX( 0, h->param.analyse.i_chroma_qp_offset );
902 int stridey = h->fdec->i_stride[0];
903 int stride2y = stridey << b_interlaced;
904 int strideuv = h->fdec->i_stride[1];
905@@ -318,7 +318,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
906 uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
907 uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
908 uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
909- if( b_interlaced && (mb_y&1) )
910+ if( mb_y & b_interlaced )
911 {
912 pixy -= 15*stridey;
913 pixu -= 7*strideuv;
914@@ -366,12 +366,12 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
915 int qp_top = (qp + qpt + 1) >> 1;
916 int qpc_top = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpt] + 1) >> 1;
917 int intra_top = IS_INTRA( h->mb.type[h->mb.i_mb_top_xy] );
918- if( !b_interlaced && (intra_cur || intra_top) )
919+ if( ~b_interlaced & (intra_cur | intra_top) )
920 FILTER( _intra, 1, 0, qp_top, qpc_top );
921 else
922 {
923 if( intra_top )
924- memset( bs[1][0], 3, sizeof(bs[1][0]) );
925+ M32( bs[1][0] ) = 0x03030303;
926 FILTER( , 1, 0, qp_top, qpc_top );
927 }
928 }
929diff --git a/common/macroblock.c b/common/macroblock.c
930index 01c90d2..26f63f5 100644
931--- a/common/macroblock.c
932+++ b/common/macroblock.c
933@@ -400,9 +400,27 @@ void x264_macroblock_slice_init( x264_t *h )
934 }
935 }
936 }
937- if( h->sh.i_type == SLICE_TYPE_P )
938+ else if( h->sh.i_type == SLICE_TYPE_P )
939+ {
940 memset( h->mb.cache.skip, 0, sizeof( h->mb.cache.skip ) );
941
942+ if( h->sh.i_disable_deblocking_filter_idc != 1 && h->param.analyse.i_weighted_pred )
943+ {
944+ deblock_ref_table(-2) = -2;
945+ deblock_ref_table(-1) = -1;
946+ for( int i = 0; i < h->i_ref0 << h->sh.b_mbaff; i++ )
947+ {
948+ /* Mask off high bits to avoid frame num collisions with -1/-2.
949+ * In current x264 frame num values don't cover a range of more
950+ * than 32, so 6 bits is enough for uniqueness. */
951+ if( !h->mb.b_interlaced )
952+ deblock_ref_table(i) = h->fref0[i]->i_frame_num&63;
953+ else
954+ deblock_ref_table(i) = ((h->fref0[i>>1]->i_frame_num&63)<<1) + (i&1);
955+ }
956+ }
957+ }
958+
959 /* init with not available (for top right idx=7,15) */
960 memset( h->mb.cache.ref, -2, sizeof( h->mb.cache.ref ) );
961
962@@ -418,19 +436,6 @@ void x264_macroblock_slice_init( x264_t *h )
963 h->fdec->inv_ref_poc[field] = (256 + delta/2) / delta;
964 }
965
966- deblock_ref_table(-2) = -2;
967- deblock_ref_table(-1) = -1;
968- for( int i = 0; i < h->i_ref0 << h->sh.b_mbaff; i++ )
969- {
970- /* Mask off high bits to avoid frame num collisions with -1/-2.
971- * In current x264 frame num values don't cover a range of more
972- * than 32, so 6 bits is enough for uniqueness. */
973- if( !h->mb.b_interlaced )
974- deblock_ref_table(i) = h->fref0[i]->i_frame_num&63;
975- else
976- deblock_ref_table(i) = ((h->fref0[i>>1]->i_frame_num&63)<<1) + (i&1);
977- }
978-
979 h->mb.i_neighbour4[6] =
980 h->mb.i_neighbour4[9] =
981 h->mb.i_neighbour4[12] =
982@@ -894,7 +899,6 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
983 void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
984 {
985 int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
986- int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
987
988 h->mb.i_neighbour = 0;
989 h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
990@@ -906,9 +910,9 @@ void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_
991 h->mb.i_neighbour |= MB_LEFT;
992 }
993
994- if( top >= 0 )
995+ if( mb_y > h->mb.b_interlaced )
996 {
997- h->mb.i_mb_top_xy = top;
998+ h->mb.i_mb_top_xy = h->mb.i_mb_xy - (h->mb.i_mb_stride << h->mb.b_interlaced);
999 if( deblock_on_slice_edges || h->mb.slice_table[h->mb.i_mb_top_xy] == h->mb.slice_table[h->mb.i_mb_xy] )
1000 h->mb.i_neighbour |= MB_TOP;
1001 }
1002@@ -930,8 +934,6 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
1003 h->mb.i_neighbour &= ~old_neighbour;
1004 if( h->mb.i_neighbour )
1005 {
1006- int left = h->mb.i_mb_left_xy;
1007- int top = h->mb.i_mb_top_xy;
1008 int top_y = mb_y - (1 << h->mb.b_interlaced);
1009 int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
1010 int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
1011@@ -941,10 +943,11 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
1012 uint8_t (*nnz)[24] = h->mb.non_zero_count;
1013
1014 if( h->mb.i_neighbour & MB_TOP )
1015- CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[top][12] );
1016+ CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[h->mb.i_mb_top_xy][12] );
1017
1018 if( h->mb.i_neighbour & MB_LEFT )
1019 {
1020+ int left = h->mb.i_mb_left_xy;
1021 h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
1022 h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
1023 h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
1024--
10251.7.0.4
1026
1027
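Two of the rewrites in the patch above trade short-circuit logic for bitwise ops, which is only valid because the operands are 0/1 flags (b_interlaced, mb_y&1, and the IS_INTRA() results); the memset-to-M32 change similarly writes the same four bytes of 0x03 in a single store. A minimal exhaustive check of the two logical identities:

#include <assert.h>
#include <stdio.h>

int main(void)
{
    /* !b_interlaced && (intra_cur || intra_top)
     *   <=>  ~b_interlaced & (intra_cur | intra_top), for 0/1 flags */
    for( int bi = 0; bi <= 1; bi++ )
        for( int cur = 0; cur <= 1; cur++ )
            for( int top = 0; top <= 1; top++ )
                assert( (!bi && (cur || top)) == ((~bi & (cur | top)) != 0) );

    /* b_interlaced && (mb_y & 1)  <=>  mb_y & b_interlaced, for b_interlaced in {0,1} */
    for( int bi = 0; bi <= 1; bi++ )
        for( int mb_y = 0; mb_y < 8; mb_y++ )
            assert( (bi && (mb_y & 1)) == ((mb_y & bi) != 0) );

    puts( "both rewrites are equivalent for 0/1 flags" );
    return 0;
}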
1028From bdc68d651db64045aecb28f27e0e05e027ab48eb Mon Sep 17 00:00:00 2001
1029From: Jason Garrett-Glaser <darkshikari@gmail.com>
1030Date: Fri, 28 May 2010 14:30:07 -0700
1031Subject: [PATCH 09/11] Re-enable i8x8 merged SATD
1032 Accidentally got disabled when intra_sad_x3 was added.
1033
1034---
1035 encoder/encoder.c | 1 +
1036 1 files changed, 1 insertions(+), 0 deletions(-)
1037
1038diff --git a/encoder/encoder.c b/encoder/encoder.c
1039index 6e0dc54..7717ea8 100644
1040--- a/encoder/encoder.c
1041+++ b/encoder/encoder.c
1042@@ -810,6 +810,7 @@ static void mbcmp_init( x264_t *h )
1043 memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
1044 h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
1045 h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c;
1046+ h->pixf.intra_mbcmp_x3_8x8 = satd ? h->pixf.intra_sa8d_x3_8x8 : h->pixf.intra_sad_x3_8x8;
1047 h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4;
1048 satd &= h->param.analyse.i_me_method == X264_ME_TESA;
1049 memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
1050--
10511.7.0.4
1052
1053
1054From c211bfffa59599e6a90df2e0fd00f4ae9e01ada0 Mon Sep 17 00:00:00 2001
1055From: Jason Garrett-Glaser <darkshikari@gmail.com>
1056Date: Thu, 27 May 2010 14:27:32 -0700
1057Subject: [PATCH 10/11] x86 assembly code for NAL escaping
1058 Up to ~10x faster than C depending on CPU.
1059 Helps the most at very high bitrates (e.g. lossless).
1060 Also make the C code faster and simpler.
1061
1062---
1063 Makefile | 4 +-
1064 common/bitstream.c | 92 ++++++++++++++
1065 common/bitstream.h | 299 ++++++++++++++++++++++++++++++++++++++++++++
1066 common/bs.h | 291 ------------------------------------------
1067 common/common.c | 54 --------
1068 common/common.h | 5 +-
1069 common/x86/bitstream-a.asm | 112 +++++++++++++++++
1070 common/x86/deblock-a.asm | 1 +
1071 encoder/encoder.c | 3 +-
1072 tools/checkasm.c | 52 ++++++++-
1073 10 files changed, 561 insertions(+), 352 deletions(-)
1074 create mode 100644 common/bitstream.c
1075 create mode 100644 common/bitstream.h
1076 delete mode 100644 common/bs.h
1077 create mode 100644 common/x86/bitstream-a.asm
1078
1079diff --git a/Makefile b/Makefile
1080index 0b43a3e..519e181 100644
1081--- a/Makefile
1082+++ b/Makefile
1083@@ -8,7 +8,7 @@ SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
1084 common/frame.c common/dct.c common/cpu.c common/cabac.c \
1085 common/common.c common/mdate.c common/rectangle.c \
1086 common/set.c common/quant.c common/deblock.c common/vlc.c \
1087- common/mvpred.c \
1088+ common/mvpred.c common/bitstream.c \
1089 encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
1090 encoder/set.c encoder/macroblock.c encoder/cabac.c \
1091 encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
1092@@ -52,7 +52,7 @@ endif
1093 ifneq ($(AS),)
1094 X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \
1095 mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
1096- cpu-a.asm dct-32.asm
1097+ cpu-a.asm dct-32.asm bitstream-a.asm
1098 X86SRC = $(X86SRC0:%=common/x86/%)
1099
1100 ifeq ($(ARCH),X86)
1101diff --git a/common/bitstream.c b/common/bitstream.c
1102new file mode 100644
1103index 0000000..0aaac21
1104--- /dev/null
1105+++ b/common/bitstream.c
1106@@ -0,0 +1,92 @@
1107+/*****************************************************************************
1108+ * bitstream.c: h264 encoder library
1109+ *****************************************************************************
1110+ * Copyright (C) 2010 x264 project
1111+ *
1112+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
1113+ * Jason Garrett-Glaser <darkshikari@gmail.com>
1114+ *
1115+ * This program is free software; you can redistribute it and/or modify
1116+ * it under the terms of the GNU General Public License as published by
1117+ * the Free Software Foundation; either version 2 of the License, or
1118+ * (at your option) any later version.
1119+ *
1120+ * This program is distributed in the hope that it will be useful,
1121+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1122+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1123+ * GNU General Public License for more details.
1124+ *
1125+ * You should have received a copy of the GNU General Public License
1126+ * along with this program; if not, write to the Free Software
1127+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
1128+ *****************************************************************************/
1129+
1130+#include "common.h"
1131+
1132+static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
1133+{
1134+ if( src < end ) *dst++ = *src++;
1135+ if( src < end ) *dst++ = *src++;
1136+ while( src < end )
1137+ {
1138+ if( src[0] <= 0x03 && !dst[-2] && !dst[-1] )
1139+ *dst++ = 0x03;
1140+ *dst++ = *src++;
1141+ }
1142+ return dst;
1143+}
1144+
1145+#ifdef HAVE_MMX
1146+uint8_t *x264_nal_escape_mmxext( uint8_t *dst, uint8_t *src, uint8_t *end );
1147+uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
1148+#endif
1149+
1150+/****************************************************************************
1151+ * x264_nal_encode:
1152+ ****************************************************************************/
1153+int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode )
1154+{
1155+ uint8_t *src = nal->p_payload;
1156+ uint8_t *end = nal->p_payload + nal->i_payload;
1157+ uint8_t *orig_dst = dst;
1158+
1159+ if( h->param.b_annexb )
1160+ {
1161+ if( b_long_startcode )
1162+ *dst++ = 0x00;
1163+ *dst++ = 0x00;
1164+ *dst++ = 0x00;
1165+ *dst++ = 0x01;
1166+ }
1167+ else /* save room for size later */
1168+ dst += 4;
1169+
1170+ /* nal header */
1171+ *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
1172+
1173+ dst = h->bsf.nal_escape( dst, src, end );
1174+ int size = (dst - orig_dst) - 4;
1175+
1176+ /* Write the size header for mp4/etc */
1177+ if( !h->param.b_annexb )
1178+ {
1179+ /* Size doesn't include the size of the header we're writing now. */
1180+ orig_dst[0] = size>>24;
1181+ orig_dst[1] = size>>16;
1182+ orig_dst[2] = size>> 8;
1183+ orig_dst[3] = size>> 0;
1184+ }
1185+
1186+ return size+4;
1187+}
1188+
1189+void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
1190+{
1191+ pf->nal_escape = x264_nal_escape_c;
1192+#ifdef HAVE_MMX
1193+ if( cpu&X264_CPU_MMXEXT )
1194+ pf->nal_escape = x264_nal_escape_mmxext;
1195+ if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
1196+ pf->nal_escape = x264_nal_escape_sse2;
1197+#endif
1198+}
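For context on x264_nal_escape_c above: Annex B forbids the byte sequences 00 00 00 through 00 00 03 inside a NAL payload, so an emulation-prevention byte 0x03 is inserted after any 00 00 that would otherwise be followed by a byte <= 0x03. Below is a standalone restatement of that C reference path with a tiny test vector; the MMX/SSE2 versions added in bitstream-a.asm are meant to produce identical output, just faster:

#include <stdio.h>
#include <stdint.h>

static int nal_escape( uint8_t *dst, const uint8_t *src, int len )
{
    uint8_t *dst0 = dst;
    const uint8_t *end = src + len;
    if( src < end ) *dst++ = *src++;
    if( src < end ) *dst++ = *src++;
    while( src < end )
    {
        if( src[0] <= 0x03 && !dst[-2] && !dst[-1] )
            *dst++ = 0x03;          /* emulation prevention byte */
        *dst++ = *src++;
    }
    return dst - dst0;
}

int main(void)
{
    const uint8_t in[] = { 0x67, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x02 };
    uint8_t out[2*sizeof(in)];
    int n = nal_escape( out, in, sizeof(in) );
    for( int i = 0; i < n; i++ )
        printf( "%02x ", out[i] );
    printf( "\n" );    /* expected: 67 00 00 03 00 01 00 00 03 02 */
    return 0;
}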
1199diff --git a/common/bitstream.h b/common/bitstream.h
1200new file mode 100644
1201index 0000000..d018c7d
1202--- /dev/null
1203+++ b/common/bitstream.h
1204@@ -0,0 +1,299 @@
1205+/*****************************************************************************
1206+ * bitstream.h: h264 encoder library
1207+ *****************************************************************************
1208+ * Copyright (C) 2003-2008 x264 project
1209+ *
1210+ * Authors: Loren Merritt <lorenm@u.washington.edu>
1211+ * Jason Garrett-Glaser <darkshikari@gmail.com>
1212+ * Laurent Aimar <fenrir@via.ecp.fr>
1213+ *
1214+ * This program is free software; you can redistribute it and/or modify
1215+ * it under the terms of the GNU General Public License as published by
1216+ * the Free Software Foundation; either version 2 of the License, or
1217+ * (at your option) any later version.
1218+ *
1219+ * This program is distributed in the hope that it will be useful,
1220+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1221+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1222+ * GNU General Public License for more details.
1223+ *
1224+ * You should have received a copy of the GNU General Public License
1225+ * along with this program; if not, write to the Free Software
1226+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
1227+ *****************************************************************************/
1228+
1229+#ifndef X264_BS_H
1230+#define X264_BS_H
1231+
1232+typedef struct
1233+{
1234+ uint8_t i_bits;
1235+ uint8_t i_size;
1236+} vlc_t;
1237+
1238+typedef struct
1239+{
1240+ uint16_t i_bits;
1241+ uint8_t i_size;
1242+ /* Next level table to use */
1243+ uint8_t i_next;
1244+} vlc_large_t;
1245+
1246+typedef struct bs_s
1247+{
1248+ uint8_t *p_start;
1249+ uint8_t *p;
1250+ uint8_t *p_end;
1251+
1252+ intptr_t cur_bits;
1253+ int i_left; /* i_count number of available bits */
1254+ int i_bits_encoded; /* RD only */
1255+} bs_t;
1256+
1257+typedef struct
1258+{
1259+ int last;
1260+ int16_t level[16];
1261+ uint8_t run[16];
1262+} x264_run_level_t;
1263+
1264+extern const vlc_t x264_coeff0_token[5];
1265+extern const vlc_t x264_coeff_token[5][16][4];
1266+extern const vlc_t x264_total_zeros[15][16];
1267+extern const vlc_t x264_total_zeros_dc[3][4];
1268+extern const vlc_t x264_run_before[7][16];
1269+
1270+typedef struct
1271+{
1272+ uint8_t *(*nal_escape) ( uint8_t *dst, uint8_t *src, uint8_t *end );
1273+} x264_bitstream_function_t;
1274+
1275+int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode );
1276+void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );
1277+
1278+/* A larger level table size theoretically could help a bit at extremely
1279+ * high bitrates, but the cost in cache is usually too high for it to be
1280+ * useful.
1281+ * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
1282+ * FIXME: Do further testing? */
1283+#define LEVEL_TABLE_SIZE 128
1284+extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
1285+
1286+static inline void bs_init( bs_t *s, void *p_data, int i_data )
1287+{
1288+ int offset = ((intptr_t)p_data & 3);
1289+ s->p = s->p_start = (uint8_t*)p_data - offset;
1290+ s->p_end = (uint8_t*)p_data + i_data;
1291+ s->i_left = (WORD_SIZE - offset)*8;
1292+ s->cur_bits = endian_fix32( M32(s->p) );
1293+ s->cur_bits >>= (4-offset)*8;
1294+}
1295+static inline int bs_pos( bs_t *s )
1296+{
1297+ return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
1298+}
1299+
1300+/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
1301+static inline void bs_flush( bs_t *s )
1302+{
1303+ M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
1304+ s->p += WORD_SIZE - s->i_left / 8;
1305+ s->i_left = WORD_SIZE*8;
1306+}
1307+/* The inverse of bs_flush: prepare the bitstream to be written to again. */
1308+static inline void bs_realign( bs_t *s )
1309+{
1310+ int offset = ((intptr_t)s->p & 3);
1311+ if( offset )
1312+ {
1313+ s->p = (uint8_t*)s->p - offset;
1314+ s->i_left = (WORD_SIZE - offset)*8;
1315+ s->cur_bits = endian_fix32( M32(s->p) );
1316+ s->cur_bits >>= (4-offset)*8;
1317+ }
1318+}
1319+
1320+static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
1321+{
1322+ if( WORD_SIZE == 8 )
1323+ {
1324+ s->cur_bits = (s->cur_bits << i_count) | i_bits;
1325+ s->i_left -= i_count;
1326+ if( s->i_left <= 32 )
1327+ {
1328+#ifdef WORDS_BIGENDIAN
1329+ M32( s->p ) = s->cur_bits >> (32 - s->i_left);
1330+#else
1331+ M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
1332+#endif
1333+ s->i_left += 32;
1334+ s->p += 4;
1335+ }
1336+ }
1337+ else
1338+ {
1339+ if( i_count < s->i_left )
1340+ {
1341+ s->cur_bits = (s->cur_bits << i_count) | i_bits;
1342+ s->i_left -= i_count;
1343+ }
1344+ else
1345+ {
1346+ i_count -= s->i_left;
1347+ s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
1348+ M32( s->p ) = endian_fix( s->cur_bits );
1349+ s->p += 4;
1350+ s->cur_bits = i_bits;
1351+ s->i_left = 32 - i_count;
1352+ }
1353+ }
1354+}
1355+
1356+/* Special case to eliminate branch in normal bs_write. */
1357+/* Golomb never writes an even-size code, so this is only used in slice headers. */
1358+static inline void bs_write32( bs_t *s, uint32_t i_bits )
1359+{
1360+ bs_write( s, 16, i_bits >> 16 );
1361+ bs_write( s, 16, i_bits );
1362+}
1363+
1364+static inline void bs_write1( bs_t *s, uint32_t i_bit )
1365+{
1366+ s->cur_bits <<= 1;
1367+ s->cur_bits |= i_bit;
1368+ s->i_left--;
1369+ if( s->i_left == WORD_SIZE*8-32 )
1370+ {
1371+ M32( s->p ) = endian_fix32( s->cur_bits );
1372+ s->p += 4;
1373+ s->i_left = WORD_SIZE*8;
1374+ }
1375+}
1376+
1377+static inline void bs_align_0( bs_t *s )
1378+{
1379+ bs_write( s, s->i_left&7, 0 );
1380+ bs_flush( s );
1381+}
1382+static inline void bs_align_1( bs_t *s )
1383+{
1384+ bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 );
1385+ bs_flush( s );
1386+}
1387+static inline void bs_align_10( bs_t *s )
1388+{
1389+ if( s->i_left&7 )
1390+ bs_write( s, s->i_left&7, 1 << ( (s->i_left&7) - 1 ) );
1391+}
1392+
1393+/* golomb functions */
1394+
1395+static const uint8_t x264_ue_size_tab[256] =
1396+{
1397+ 1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
1398+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
1399+ 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
1400+ 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
1401+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1402+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1403+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1404+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1405+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1406+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1407+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1408+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1409+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1410+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1411+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1412+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1413+};
1414+
1415+static inline void bs_write_ue_big( bs_t *s, unsigned int val )
1416+{
1417+ int size = 0;
1418+ int tmp = ++val;
1419+ if( tmp >= 0x10000 )
1420+ {
1421+ size = 32;
1422+ tmp >>= 16;
1423+ }
1424+ if( tmp >= 0x100 )
1425+ {
1426+ size += 16;
1427+ tmp >>= 8;
1428+ }
1429+ size += x264_ue_size_tab[tmp];
1430+ bs_write( s, size>>1, 0 );
1431+ bs_write( s, (size>>1)+1, val );
1432+}
1433+
1434+/* Only works on values under 255. */
1435+static inline void bs_write_ue( bs_t *s, int val )
1436+{
1437+ bs_write( s, x264_ue_size_tab[val+1], val+1 );
1438+}
1439+
1440+static inline void bs_write_se( bs_t *s, int val )
1441+{
1442+ int size = 0;
1443+ /* Faster than (val <= 0 ? -val*2+1 : val*2) */
1444+ /* 4 instructions on x86, 3 on ARM */
1445+ int tmp = 1 - val*2;
1446+ if( tmp < 0 ) tmp = val*2;
1447+ val = tmp;
1448+
1449+ if( tmp >= 0x100 )
1450+ {
1451+ size = 16;
1452+ tmp >>= 8;
1453+ }
1454+ size += x264_ue_size_tab[tmp];
1455+ bs_write( s, size, val );
1456+}
1457+
1458+static inline void bs_write_te( bs_t *s, int x, int val )
1459+{
1460+ if( x == 1 )
1461+ bs_write1( s, 1^val );
1462+ else //if( x > 1 )
1463+ bs_write_ue( s, val );
1464+}
1465+
1466+static inline void bs_rbsp_trailing( bs_t *s )
1467+{
1468+ bs_write1( s, 1 );
1469+ bs_write( s, s->i_left&7, 0 );
1470+}
1471+
1472+static ALWAYS_INLINE int bs_size_ue( unsigned int val )
1473+{
1474+ return x264_ue_size_tab[val+1];
1475+}
1476+
1477+static ALWAYS_INLINE int bs_size_ue_big( unsigned int val )
1478+{
1479+ if( val < 255 )
1480+ return x264_ue_size_tab[val+1];
1481+ else
1482+ return x264_ue_size_tab[(val+1)>>8] + 16;
1483+}
1484+
1485+static ALWAYS_INLINE int bs_size_se( int val )
1486+{
1487+ int tmp = 1 - val*2;
1488+ if( tmp < 0 ) tmp = val*2;
1489+ if( tmp < 256 )
1490+ return x264_ue_size_tab[tmp];
1491+ else
1492+ return x264_ue_size_tab[tmp>>8]+16;
1493+}
1494+
1495+static ALWAYS_INLINE int bs_size_te( int x, int val )
1496+{
1497+ if( x == 1 )
1498+ return 1;
1499+ else //if( x > 1 )
1500+ return x264_ue_size_tab[val+1];
1501+}
1502+
1503+#endif
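A small aside on the golomb helpers above: bs_write_ue_big() emits val as an unsigned Exp-Golomb code, i.e. for k = bit-length(val+1) it writes k-1 zero bits followed by the k bits of val+1, and x264_ue_size_tab tabulates the resulting 2*k-1 total bits for table indexes up to 255. A throwaway encoder that prints those codes for the first few values:

#include <stdio.h>

/* Print the unsigned Exp-Golomb (ue(v)) code for val, as bs_write_ue_big()
 * would emit it: leading zeros, then the binary representation of val+1. */
static void print_ue( unsigned int val )
{
    unsigned int v = val + 1;
    int k = 0;                       /* bit-length of val+1 */
    for( unsigned int t = v; t; t >>= 1 )
        k++;
    printf( "ue(%u) = ", val );
    for( int i = 0; i < k-1; i++ )
        putchar( '0' );
    for( int i = k-1; i >= 0; i-- )
        putchar( '0' + ((v >> i) & 1) );
    printf( "   (%d bits)\n", 2*k - 1 );
}

int main(void)
{
    for( unsigned int val = 0; val <= 8; val++ )
        print_ue( val );
    return 0;
}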
1504diff --git a/common/bs.h b/common/bs.h
1505deleted file mode 100644
1506index 343a3c9..0000000
1507--- a/common/bs.h
1508+++ /dev/null
1509@@ -1,291 +0,0 @@
1510-/*****************************************************************************
1511- * bs.h :
1512- *****************************************************************************
1513- * Copyright (C) 2003-2008 x264 project
1514- *
1515- * Authors: Loren Merritt <lorenm@u.washington.edu>
1516- * Jason Garrett-Glaser <darkshikari@gmail.com>
1517- * Laurent Aimar <fenrir@via.ecp.fr>
1518- *
1519- * This program is free software; you can redistribute it and/or modify
1520- * it under the terms of the GNU General Public License as published by
1521- * the Free Software Foundation; either version 2 of the License, or
1522- * (at your option) any later version.
1523- *
1524- * This program is distributed in the hope that it will be useful,
1525- * but WITHOUT ANY WARRANTY; without even the implied warranty of
1526- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1527- * GNU General Public License for more details.
1528- *
1529- * You should have received a copy of the GNU General Public License
1530- * along with this program; if not, write to the Free Software
1531- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
1532- *****************************************************************************/
1533-
1534-#ifndef X264_BS_H
1535-#define X264_BS_H
1536-
1537-typedef struct
1538-{
1539- uint8_t i_bits;
1540- uint8_t i_size;
1541-} vlc_t;
1542-
1543-typedef struct
1544-{
1545- uint16_t i_bits;
1546- uint8_t i_size;
1547- /* Next level table to use */
1548- uint8_t i_next;
1549-} vlc_large_t;
1550-
1551-typedef struct bs_s
1552-{
1553- uint8_t *p_start;
1554- uint8_t *p;
1555- uint8_t *p_end;
1556-
1557- intptr_t cur_bits;
1558- int i_left; /* i_count number of available bits */
1559- int i_bits_encoded; /* RD only */
1560-} bs_t;
1561-
1562-typedef struct
1563-{
1564- int last;
1565- int16_t level[16];
1566- uint8_t run[16];
1567-} x264_run_level_t;
1568-
1569-extern const vlc_t x264_coeff0_token[5];
1570-extern const vlc_t x264_coeff_token[5][16][4];
1571-extern const vlc_t x264_total_zeros[15][16];
1572-extern const vlc_t x264_total_zeros_dc[3][4];
1573-extern const vlc_t x264_run_before[7][16];
1574-
1575-/* A larger level table size theoretically could help a bit at extremely
1576- * high bitrates, but the cost in cache is usually too high for it to be
1577- * useful.
1578- * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
1579- * FIXME: Do further testing? */
1580-#define LEVEL_TABLE_SIZE 128
1581-extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
1582-
1583-static inline void bs_init( bs_t *s, void *p_data, int i_data )
1584-{
1585- int offset = ((intptr_t)p_data & 3);
1586- s->p = s->p_start = (uint8_t*)p_data - offset;
1587- s->p_end = (uint8_t*)p_data + i_data;
1588- s->i_left = (WORD_SIZE - offset)*8;
1589- s->cur_bits = endian_fix32( M32(s->p) );
1590- s->cur_bits >>= (4-offset)*8;
1591-}
1592-static inline int bs_pos( bs_t *s )
1593-{
1594- return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
1595-}
1596-
1597-/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
1598-static inline void bs_flush( bs_t *s )
1599-{
1600- M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
1601- s->p += WORD_SIZE - s->i_left / 8;
1602- s->i_left = WORD_SIZE*8;
1603-}
1604-/* The inverse of bs_flush: prepare the bitstream to be written to again. */
1605-static inline void bs_realign( bs_t *s )
1606-{
1607- int offset = ((intptr_t)s->p & 3);
1608- if( offset )
1609- {
1610- s->p = (uint8_t*)s->p - offset;
1611- s->i_left = (WORD_SIZE - offset)*8;
1612- s->cur_bits = endian_fix32( M32(s->p) );
1613- s->cur_bits >>= (4-offset)*8;
1614- }
1615-}
1616-
1617-static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
1618-{
1619- if( WORD_SIZE == 8 )
1620- {
1621- s->cur_bits = (s->cur_bits << i_count) | i_bits;
1622- s->i_left -= i_count;
1623- if( s->i_left <= 32 )
1624- {
1625-#ifdef WORDS_BIGENDIAN
1626- M32( s->p ) = s->cur_bits >> (32 - s->i_left);
1627-#else
1628- M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
1629-#endif
1630- s->i_left += 32;
1631- s->p += 4;
1632- }
1633- }
1634- else
1635- {
1636- if( i_count < s->i_left )
1637- {
1638- s->cur_bits = (s->cur_bits << i_count) | i_bits;
1639- s->i_left -= i_count;
1640- }
1641- else
1642- {
1643- i_count -= s->i_left;
1644- s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
1645- M32( s->p ) = endian_fix( s->cur_bits );
1646- s->p += 4;
1647- s->cur_bits = i_bits;
1648- s->i_left = 32 - i_count;
1649- }
1650- }
1651-}
1652-
1653-/* Special case to eliminate branch in normal bs_write. */
1654-/* Golomb never writes an even-size code, so this is only used in slice headers. */
1655-static inline void bs_write32( bs_t *s, uint32_t i_bits )
1656-{
1657- bs_write( s, 16, i_bits >> 16 );
1658- bs_write( s, 16, i_bits );
1659-}
1660-
1661-static inline void bs_write1( bs_t *s, uint32_t i_bit )
1662-{
1663- s->cur_bits <<= 1;
1664- s->cur_bits |= i_bit;
1665- s->i_left--;
1666- if( s->i_left == WORD_SIZE*8-32 )
1667- {
1668- M32( s->p ) = endian_fix32( s->cur_bits );
1669- s->p += 4;
1670- s->i_left = WORD_SIZE*8;
1671- }
1672-}
1673-
1674-static inline void bs_align_0( bs_t *s )
1675-{
1676- bs_write( s, s->i_left&7, 0 );
1677- bs_flush( s );
1678-}
1679-static inline void bs_align_1( bs_t *s )
1680-{
1681- bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 );
1682- bs_flush( s );
1683-}
1684-static inline void bs_align_10( bs_t *s )
1685-{
1686- if( s->i_left&7 )
1687- bs_write( s, s->i_left&7, 1 << ( (s->i_left&7) - 1 ) );
1688-}
1689-
1690-/* golomb functions */
1691-
1692-static const uint8_t x264_ue_size_tab[256] =
1693-{
1694- 1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
1695- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
1696- 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
1697- 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
1698- 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1699- 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1700- 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1701- 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1702- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1703- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1704- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1705- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1706- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1707- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1708- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1709- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1710-};
1711-
1712-static inline void bs_write_ue_big( bs_t *s, unsigned int val )
1713-{
1714- int size = 0;
1715- int tmp = ++val;
1716- if( tmp >= 0x10000 )
1717- {
1718- size = 32;
1719- tmp >>= 16;
1720- }
1721- if( tmp >= 0x100 )
1722- {
1723- size += 16;
1724- tmp >>= 8;
1725- }
1726- size += x264_ue_size_tab[tmp];
1727- bs_write( s, size>>1, 0 );
1728- bs_write( s, (size>>1)+1, val );
1729-}
1730-
1731-/* Only works on values under 255. */
1732-static inline void bs_write_ue( bs_t *s, int val )
1733-{
1734- bs_write( s, x264_ue_size_tab[val+1], val+1 );
1735-}
1736-
1737-static inline void bs_write_se( bs_t *s, int val )
1738-{
1739- int size = 0;
1740- /* Faster than (val <= 0 ? -val*2+1 : val*2) */
1741- /* 4 instructions on x86, 3 on ARM */
1742- int tmp = 1 - val*2;
1743- if( tmp < 0 ) tmp = val*2;
1744- val = tmp;
1745-
1746- if( tmp >= 0x100 )
1747- {
1748- size = 16;
1749- tmp >>= 8;
1750- }
1751- size += x264_ue_size_tab[tmp];
1752- bs_write( s, size, val );
1753-}
1754-
1755-static inline void bs_write_te( bs_t *s, int x, int val )
1756-{
1757- if( x == 1 )
1758- bs_write1( s, 1^val );
1759- else //if( x > 1 )
1760- bs_write_ue( s, val );
1761-}
1762-
1763-static inline void bs_rbsp_trailing( bs_t *s )
1764-{
1765- bs_write1( s, 1 );
1766- bs_write( s, s->i_left&7, 0 );
1767-}
1768-
1769-static ALWAYS_INLINE int bs_size_ue( unsigned int val )
1770-{
1771- return x264_ue_size_tab[val+1];
1772-}
1773-
1774-static ALWAYS_INLINE int bs_size_ue_big( unsigned int val )
1775-{
1776- if( val < 255 )
1777- return x264_ue_size_tab[val+1];
1778- else
1779- return x264_ue_size_tab[(val+1)>>8] + 16;
1780-}
1781-
1782-static ALWAYS_INLINE int bs_size_se( int val )
1783-{
1784- int tmp = 1 - val*2;
1785- if( tmp < 0 ) tmp = val*2;
1786- if( tmp < 256 )
1787- return x264_ue_size_tab[tmp];
1788- else
1789- return x264_ue_size_tab[tmp>>8]+16;
1790-}
1791-
1792-static ALWAYS_INLINE int bs_size_te( int x, int val )
1793-{
1794- if( x == 1 )
1795- return 1;
1796- else //if( x > 1 )
1797- return x264_ue_size_tab[val+1];
1798-}
1799-
1800-#endif
1801diff --git a/common/common.c b/common/common.c
1802index fccf2b0..2458f65 100644
1803--- a/common/common.c
1804+++ b/common/common.c
1805@@ -1027,60 +1027,6 @@ void x264_picture_clean( x264_picture_t *pic )
1806 }
1807
1808 /****************************************************************************
1809- * x264_nal_encode:
1810- ****************************************************************************/
1811-int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode )
1812-{
1813- uint8_t *src = nal->p_payload;
1814- uint8_t *end = nal->p_payload + nal->i_payload;
1815- uint8_t *orig_dst = dst;
1816- int i_count = 0, size;
1817-
1818- if( b_annexb )
1819- {
1820- if( b_long_startcode )
1821- *dst++ = 0x00;
1822- *dst++ = 0x00;
1823- *dst++ = 0x00;
1824- *dst++ = 0x01;
1825- }
1826- else /* save room for size later */
1827- dst += 4;
1828-
1829- /* nal header */
1830- *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
1831-
1832- while( src < end )
1833- {
1834- if( i_count == 2 && *src <= 0x03 )
1835- {
1836- *dst++ = 0x03;
1837- i_count = 0;
1838- }
1839- if( *src == 0 )
1840- i_count++;
1841- else
1842- i_count = 0;
1843- *dst++ = *src++;
1844- }
1845- size = (dst - orig_dst) - 4;
1846-
1847- /* Write the size header for mp4/etc */
1848- if( !b_annexb )
1849- {
1850- /* Size doesn't include the size of the header we're writing now. */
1851- orig_dst[0] = size>>24;
1852- orig_dst[1] = size>>16;
1853- orig_dst[2] = size>> 8;
1854- orig_dst[3] = size>> 0;
1855- }
1856-
1857- return size+4;
1858-}
1859-
1860-
1861-
1862-/****************************************************************************
1863 * x264_malloc:
1864 ****************************************************************************/
1865 void *x264_malloc( int i_size )
1866diff --git a/common/common.h b/common/common.h
1867index 539ea65..93712fe 100644
1868--- a/common/common.h
1869+++ b/common/common.h
1870@@ -137,7 +137,7 @@ static const int x264_scan8[16+2*4+3] =
1871 */
1872
1873 #include "x264.h"
1874-#include "bs.h"
1875+#include "bitstream.h"
1876 #include "set.h"
1877 #include "predict.h"
1878 #include "pixel.h"
1879@@ -166,8 +166,6 @@ int64_t x264_mdate( void );
1880 * the encoding options */
1881 char *x264_param2string( x264_param_t *p, int b_res );
1882
1883-int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode );
1884-
1885 /* log */
1886 void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
1887
1888@@ -796,6 +794,7 @@ struct x264_t
1889 x264_zigzag_function_t zigzagf;
1890 x264_quant_function_t quantf;
1891 x264_deblock_function_t loopf;
1892+ x264_bitstream_function_t bsf;
1893
1894 #ifdef HAVE_VISUALIZE
1895 struct visualize_t *visualize;
1896diff --git a/common/x86/bitstream-a.asm b/common/x86/bitstream-a.asm
1897new file mode 100644
1898index 0000000..1fb4cea
1899--- /dev/null
1900+++ b/common/x86/bitstream-a.asm
1901@@ -0,0 +1,112 @@
1902+;*****************************************************************************
1903+;* bitstream-a.asm: h264 encoder library
1904+;*****************************************************************************
1905+;* Copyright (C) 2010 x264 project
1906+;*
1907+;* Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
1908+;*
1909+;* This program is free software; you can redistribute it and/or modify
1910+;* it under the terms of the GNU General Public License as published by
1911+;* the Free Software Foundation; either version 2 of the License, or
1912+;* (at your option) any later version.
1913+;*
1914+;* This program is distributed in the hope that it will be useful,
1915+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
1916+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1917+;* GNU General Public License for more details.
1918+;*
1919+;* You should have received a copy of the GNU General Public License
1920+;* along with this program; if not, write to the Free Software
1921+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
1922+;*****************************************************************************
1923+
1924+%include "x86inc.asm"
1925+%include "x86util.asm"
1926+
1927+SECTION .text
1928+
1929+;-----------------------------------------------------------------------------
1930+; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
1931+;-----------------------------------------------------------------------------
1932+
1933+%macro NAL_LOOP 2
1934+ALIGN 16
1935+%1:
1936+ mova m0, [r1+r2]
1937+ mova m1, m0
1938+%if mmsize == 8
1939+ psrlq m0, 8
1940+%else
1941+ psrldq m0, 1
1942+%endif
1943+ %2 [r0+r1], m1
1944+ por m1, m0
1945+ pcmpeqb m1, m2
1946+ pmovmskb r3d, m1
1947+ test r3d, r3d
1948+ jnz .escape
1949+ add r1, mmsize
1950+ jl %1
1951+%endmacro
1952+
1953+%macro NAL_ESCAPE 1
1954+
1955+cglobal nal_escape_%1, 3,5
1956+ pxor m2, m2
1957+ sub r1, r2 ; r1 = offset of current src pointer from end of src
1958+ sub r0, r1 ; r0 = projected end of dst, assuming no more escapes
1959+
1960+ mov r3w, [r1+r2]
1961+ mov [r0+r1], r3w
1962+ add r1, 2
1963+ jge .ret
1964+
1965+ ; Start off by jumping into the escape loop in
1966+ ; case there's an escape at the start.
1967+ ; And do a few more in scalar until src is aligned again.
1968+ lea r4d, [r1+r2]
1969+ or r4d, -mmsize
1970+ neg r4d
1971+ jmp .escapeloop
1972+
1973+ NAL_LOOP .loop_aligned, mova
1974+%if mmsize==16
1975+ NAL_LOOP .loop_unaligned, movu
1976+%endif
1977+
1978+.ret:
1979+ movifnidn rax, r0
1980+ RET
1981+ALIGN 16
1982+.escape:
1983+ mov r4d, mmsize
1984+.escapeloop:
1985+ mov r3b, [r1+r2]
1986+ cmp r3b, 3
1987+ jna .escape_check
1988+.copy:
1989+ mov [r0+r1], r3b
1990+ inc r1
1991+ jge .ret
1992+ dec r4d
1993+ jg .escapeloop
1994+ cmp byte [r1+r2-1], 0 ; Don't go back to the main loop until we're out of a zero-run.
1995+ jz .escape
1996+%if mmsize==16
1997+ lea r4d, [r0+r1]
1998+ test r4d, mmsize-1
1999+ jnz .loop_unaligned
2000+%endif
2001+ jmp .loop_aligned
2002+.escape_check:
2003+ cmp word [r0+r1-2], 0
2004+ jnz .copy
2005+ mov byte [r0+r1], 3
2006+ inc r0
2007+ jmp .copy
2008+%endmacro
2009+
2010+INIT_MMX
2011+NAL_ESCAPE mmxext
2012+INIT_XMM
2013+NAL_ESCAPE sse2
2014diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
2015index aedd688..3a31e26 100644
2016--- a/common/x86/deblock-a.asm
2017+++ b/common/x86/deblock-a.asm
2018@@ -4,6 +4,7 @@
2019 ;* Copyright (C) 2005-2008 x264 project
2020 ;*
2021 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
2022+;* Jason Garrett-Glaser <darkshikari@gmail.com>
2023 ;*
2024 ;* This program is free software; you can redistribute it and/or modify
2025 ;* it under the terms of the GNU General Public License as published by
2026diff --git a/encoder/encoder.c b/encoder/encoder.c
2027index 7717ea8..2f9e7f6 100644
2028--- a/encoder/encoder.c
2029+++ b/encoder/encoder.c
2030@@ -987,6 +987,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
2031 x264_mc_init( h->param.cpu, &h->mc );
2032 x264_quant_init( h, h->param.cpu, &h->quantf );
2033 x264_deblock_init( h->param.cpu, &h->loopf );
2034+ x264_bitstream_init( h->param.cpu, &h->bsf );
2035 x264_dct_init_weights();
2036
2037 mbcmp_init( h );
2038@@ -1273,7 +1274,7 @@ static int x264_encoder_encapsulate_nals( x264_t *h, int start )
2039 for( int i = start; i < h->out.i_nal; i++ )
2040 {
2041 int long_startcode = !i || h->out.nal[i].i_type == NAL_SPS || h->out.nal[i].i_type == NAL_PPS;
2042- int size = x264_nal_encode( nal_buffer, &h->out.nal[i], h->param.b_annexb, long_startcode );
2043+ int size = x264_nal_encode( h, nal_buffer, &h->out.nal[i], long_startcode );
2044 h->out.nal[i].i_payload = size;
2045 h->out.nal[i].p_payload = nal_buffer;
2046 nal_buffer += size;
2047diff --git a/tools/checkasm.c b/tools/checkasm.c
2048index a0a9d54..ea6f209 100644
2049--- a/tools/checkasm.c
2050+++ b/tools/checkasm.c
2051@@ -1661,6 +1661,55 @@ static int check_cabac( int cpu_ref, int cpu_new )
2052 return ret;
2053 }
2054
2055+static int check_bitstream( int cpu_ref, int cpu_new )
2056+{
2057+ x264_bitstream_function_t bs_c;
2058+ x264_bitstream_function_t bs_ref;
2059+ x264_bitstream_function_t bs_a;
2060+
2061+ int ret = 0, ok = 1, used_asm = 0;
2062+
2063+ x264_bitstream_init( 0, &bs_c );
2064+ x264_bitstream_init( cpu_ref, &bs_ref );
2065+ x264_bitstream_init( cpu_new, &bs_a );
2066+ if( bs_a.nal_escape != bs_ref.nal_escape )
2067+ {
2068+ int size = 0x4000;
2069+ uint8_t *input = malloc(size+100);
2070+ uint8_t *output1 = malloc(size*2);
2071+ uint8_t *output2 = malloc(size*2);
2072+ used_asm = 1;
2073+ set_func_name( "nal_escape" );
2074+ for( int i = 0; i < 100; i++ )
2075+ {
2076+ /* Test corner-case sizes */
2077+ int test_size = i < 10 ? i+1 : rand() & 0x3fff;
2078+ for( int j = 0; j < test_size; j++ )
2079+ input[j] = (rand()&1) * rand();
2080+ uint8_t *end_c = (uint8_t*)call_c1( bs_c.nal_escape, output1, input, input+test_size );
2081+ uint8_t *end_a = (uint8_t*)call_a1( bs_a.nal_escape, output2, input, input+test_size );
2082+ int size_c = end_c-output1;
2083+ int size_a = end_a-output2;
2084+ if( size_c != size_a || memcmp( output1, output2, size_c ) )
2085+ {
2086+ fprintf( stderr, "nal_escape : [FAILED] %d %d\n", size_c, size_a );
2087+ ok = 0;
2088+ break;
2089+ }
2090+ }
2091+ for( int j = 0; j < size; j++ )
2092+ input[j] = rand();
2093+ call_c2( bs_c.nal_escape, output1, input, input+size );
2094+ call_a2( bs_a.nal_escape, output2, input, input+size );
2095+ free(input);
2096+ free(output1);
2097+ free(output2);
2098+ }
2099+ report( "nal escape:" );
2100+
2101+ return ret;
2102+}
2103+
2104 static int check_all_funcs( int cpu_ref, int cpu_new )
2105 {
2106 return check_pixel( cpu_ref, cpu_new )
2107@@ -1669,7 +1718,8 @@ static int check_all_funcs( int cpu_ref, int cpu_new )
2108 + check_intra( cpu_ref, cpu_new )
2109 + check_deblock( cpu_ref, cpu_new )
2110 + check_quant( cpu_ref, cpu_new )
2111- + check_cabac( cpu_ref, cpu_new );
2112+ + check_cabac( cpu_ref, cpu_new )
2113+ + check_bitstream( cpu_ref, cpu_new );
2114 }
2115
2116 static int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
2117--
21181.7.0.4
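
The Exp-Golomb writers kept in the new bitstream.h above (bs_write_ue, bs_write_ue_big, bs_write_se) all lean on x264_ue_size_tab. As a reading aid, here is a minimal stand-alone sketch of the same unsigned/signed mappings in plain C; ue_size and se_to_ue are illustrative names, not x264 functions.

    #include <stdio.h>

    /* Bits in the Exp-Golomb code for codeNum = val:
     * 2*floor(log2(val+1)) + 1, the quantity x264_ue_size_tab caches for val+1 < 256. */
    static int ue_size( unsigned val )
    {
        int bits = 1;
        for( unsigned tmp = val + 1; tmp > 1; tmp >>= 1 )
            bits += 2;
        return bits;
    }

    /* Signed-to-unsigned mapping done branchlessly by bs_write_se:
     * val > 0 maps to 2*val - 1, val <= 0 maps to -2*val. */
    static unsigned se_to_ue( int val )
    {
        return val > 0 ? 2*(unsigned)val - 1 : (unsigned)(-2*val);
    }

    int main( void )
    {
        for( int v = -3; v <= 3; v++ )
            printf( "se(%2d) -> codeNum %u, %d bits\n", v, se_to_ue(v), ue_size( se_to_ue(v) ) );
        return 0;
    }

Note that the "1 - val*2 / val*2" trick in bs_write_se computes codeNum + 1 directly, which is why it can index x264_ue_size_tab and call bs_write without a further increment.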
2119
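The two bs_write() calls in bs_write_ue_big spell out the structure of the code: size>>1 leading zero bits, then the (size>>1)+1 significant bits of val+1. That keeps each call at or below the 32 bits bs_write() accepts, even though the full code for a 32-bit value can be up to 63 bits long. A small illustrative printer (print_ue is hypothetical, not x264 code) showing the resulting bit layout:

    #include <stdio.h>

    /* Print the bit pattern bs_write_ue_big produces:
     * (size>>1) zeros, then val+1 in (size>>1)+1 bits, MSB first. */
    static void print_ue( unsigned val )
    {
        unsigned v = val + 1;
        int size = 1;
        for( unsigned tmp = v; tmp > 1; tmp >>= 1 )
            size += 2;                                   /* total code length */
        printf( "ue(%u) = ", val );
        for( int i = size - 1; i >= 0; i-- )
            putchar( i > (size >> 1) ? '0' : '0' + ((v >> i) & 1) );
        putchar( '\n' );
    }

    int main( void )
    {
        print_ue( 0 );    /* 1                 */
        print_ue( 5 );    /* 00110             */
        print_ue( 300 );  /* 00000000100101101 */
        return 0;
    }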
2120
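The patch also moves NAL escaping out of x264_nal_encode (deleted from common/common.c above) into a CPU-dispatched nal_escape, with the MMX/SSE2 versions in bitstream-a.asm validated against the C version by the new check_bitstream() test. The rule itself is unchanged: whenever two consecutive zero bytes would be followed by a byte <= 0x03, an emulation-prevention 0x03 byte is inserted. Below is a plain-C reference with the same return convention as the pointers compared in checkasm (nal_escape_ref is a hypothetical name; the destination is assumed large enough, worst case 3/2 of the input).

    #include <stdint.h>

    /* Scalar reference for NAL escaping, mirroring the loop deleted from
     * x264_nal_encode above.  Returns one past the last byte written. */
    static uint8_t *nal_escape_ref( uint8_t *dst, uint8_t *src, uint8_t *end )
    {
        int zero_run = 0;
        while( src < end )
        {
            if( zero_run == 2 && *src <= 0x03 )
            {
                *dst++ = 0x03;       /* emulation prevention byte */
                zero_run = 0;
            }
            zero_run = (*src == 0) ? zero_run + 1 : 0;
            *dst++ = *src++;
        }
        return dst;
    }

Feeding random buffers through a reference like this and through the asm version, then comparing lengths and contents, is exactly what check_bitstream() above does.
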
2121From 9efc381b344f784285e10cf6a836f9efdf1035b8 Mon Sep 17 00:00:00 2001
2122From: Jason Garrett-Glaser <darkshikari@gmail.com>
2123Date: Fri, 28 May 2010 14:27:22 -0700
2124Subject: [PATCH 11/11] Add API tool to apply arbitrary quantizer offsets
2125 The calling application can now pass a "map" of quantizer offsets to apply to each frame.
2126 An optional callback to free the map can also be included.
2127 This allows flexible region-of-interest coding and similar applications.
2128
2129---
2130 common/common.c | 2 +-
2131 encoder/encoder.c | 7 +++++--
2132 encoder/ratecontrol.c | 36 +++++++++++++++++++++++++-----------
2133 encoder/ratecontrol.h | 4 ++--
2134 x264.h | 20 +++++++++++++++++++-
2135 5 files changed, 52 insertions(+), 17 deletions(-)
2136
2137diff --git a/common/common.c b/common/common.c
2138index 2458f65..48e1bbc 100644
2139--- a/common/common.c
2140+++ b/common/common.c
2141@@ -998,6 +998,7 @@ static void x264_log_default( void *p_unused, int i_level, const char *psz_fmt,
2142 ****************************************************************************/
2143 int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
2144 {
2145+ memset( pic, 0, sizeof( x264_picture_t ) );
2146 pic->i_type = X264_TYPE_AUTO;
2147 pic->i_qpplus1 = 0;
2148 pic->img.i_csp = i_csp;
2149@@ -1010,7 +1011,6 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh
2150 pic->img.i_stride[0] = i_width;
2151 pic->img.i_stride[1] = i_width / 2;
2152 pic->img.i_stride[2] = i_width / 2;
2153- pic->param = NULL;
2154 pic->i_pic_struct = PIC_STRUCT_AUTO;
2155 return 0;
2156 }
2157diff --git a/encoder/encoder.c b/encoder/encoder.c
2158index 2f9e7f6..89107a3 100644
2159--- a/encoder/encoder.c
2160+++ b/encoder/encoder.c
2161@@ -2250,11 +2250,14 @@ int x264_encoder_encode( x264_t *h,
2162
2163 if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
2164 {
2165- if( x264_macroblock_tree_read( h, fenc ) )
2166+ if( x264_macroblock_tree_read( h, fenc, pic_in->prop.quant_offsets ) )
2167 return -1;
2168 }
2169 else
2170- x264_adaptive_quant_frame( h, fenc );
2171+ x264_adaptive_quant_frame( h, fenc, pic_in->prop.quant_offsets );
2172+
2173+ if( pic_in->prop.quant_offsets_free )
2174+ pic_in->prop.quant_offsets_free( pic_in->prop.quant_offsets );
2175
2176 if( h->frames.b_have_lowres )
2177 x264_frame_init_lowres( h, fenc );
2178diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
2179index bf0a400..d09de98 100644
2180--- a/encoder/ratecontrol.c
2181+++ b/encoder/ratecontrol.c
2182@@ -235,7 +235,7 @@ static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_
2183 return var;
2184 }
2185
2186-void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
2187+void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets )
2188 {
2189 /* constants chosen to result in approximately the same overall bitrate as without AQ.
2190 * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
2191@@ -256,11 +256,22 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
2192 /* Need to init it anyways for MB tree */
2193 if( h->param.rc.f_aq_strength == 0 )
2194 {
2195- memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
2196- memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
2197- if( h->frames.b_have_lowres )
2198+ if( quant_offsets )
2199+ {
2200 for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
2201- frame->i_inv_qscale_factor[mb_xy] = 256;
2202+ frame->f_qp_offset[mb_xy] = frame->f_qp_offset_aq[mb_xy] = quant_offsets[mb_xy];
2203+ if( h->frames.b_have_lowres )
2204+ for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
2205+ frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8( frame->f_qp_offset[mb_xy] );
2206+ }
2207+ else
2208+ {
2209+ memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
2210+ memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
2211+ if( h->frames.b_have_lowres )
2212+ for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
2213+ frame->i_inv_qscale_factor[mb_xy] = 256;
2214+ }
2215 }
2216 /* Need variance data for weighted prediction */
2217 if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
2218@@ -299,9 +310,10 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
2219 for( int mb_x = 0; mb_x < width; mb_x++ )
2220 {
2221 float qp_adj;
2222+ int mb_xy = mb_x + mb_y*h->mb.i_mb_stride;
2223 if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
2224 {
2225- qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
2226+ qp_adj = frame->f_qp_offset[mb_xy];
2227 qp_adj = strength * (qp_adj - avg_adj);
2228 }
2229 else
2230@@ -309,10 +321,12 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
2231 uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
2232 qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
2233 }
2234- frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
2235- frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
2236+ if( quant_offsets )
2237+ qp_adj += quant_offsets[mb_xy];
2238+ frame->f_qp_offset[mb_xy] =
2239+ frame->f_qp_offset_aq[mb_xy] = qp_adj;
2240 if( h->frames.b_have_lowres )
2241- frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
2242+ frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8(qp_adj);
2243 }
2244 }
2245
2246@@ -327,7 +341,7 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
2247 }
2248 }
2249
2250-int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
2251+int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets )
2252 {
2253 x264_ratecontrol_t *rc = h->rc;
2254 uint8_t i_type_actual = rc->entry[frame->i_frame].pict_type;
2255@@ -363,7 +377,7 @@ int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
2256 rc->qpbuf_pos--;
2257 }
2258 else
2259- x264_adaptive_quant_frame( h, frame );
2260+ x264_adaptive_quant_frame( h, frame, quant_offsets );
2261 return 0;
2262 fail:
2263 x264_log(h, X264_LOG_ERROR, "Incomplete MB-tree stats file.\n");
2264diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h
2265index e052b2a..dd139eb 100644
2266--- a/encoder/ratecontrol.h
2267+++ b/encoder/ratecontrol.h
2268@@ -29,8 +29,8 @@ void x264_ratecontrol_delete( x264_t * );
2269
2270 void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init );
2271
2272-void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame );
2273-int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame );
2274+void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets );
2275+int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets );
2276 int x264_reference_build_list_optimal( x264_t *h );
2277 void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
2278 void x264_ratecontrol_start( x264_t *, int i_force_qp, int overhead );
2279diff --git a/x264.h b/x264.h
2280index 95efd88..a4b3400 100644
2281--- a/x264.h
2282+++ b/x264.h
2283@@ -35,7 +35,7 @@
2284
2285 #include <stdarg.h>
2286
2287-#define X264_BUILD 96
2288+#define X264_BUILD 97
2289
2290 /* x264_t:
2291 * opaque handler for encoder */
2292@@ -508,6 +508,22 @@ typedef struct
2293
2294 typedef struct
2295 {
2296+ /* In: an array of quantizer offsets to be applied to this image during encoding.
2297+ * These are added on top of the decisions made by x264.
2298+ * Offsets can be fractional; they are added before QPs are rounded to integer.
2299+ * Adaptive quantization must be enabled to use this feature. Behavior if quant
2300+ * offsets differ between encoding passes is undefined.
2301+ *
2302+ * Array contains one offset per macroblock, in raster scan order. In interlaced
2303+ * mode, top-field MBs and bottom-field MBs are interleaved at the row level. */
2304+ float *quant_offsets;
2305+ /* In: optional callback to free quant_offsets when used.
2306+ * Useful if one wants to use a different quant_offset array for each frame. */
2307+ void (*quant_offsets_free)( void* );
2308+} x264_image_properties_t;
2309+
2310+typedef struct
2311+{
2312 /* In: force picture type (if not auto)
2313 * If x264 encoding parameters are violated in the forcing of picture types,
2314 * x264 will correct the input picture type and log a warning.
2315@@ -537,6 +553,8 @@ typedef struct
2316 x264_param_t *param;
2317 /* In: raw data */
2318 x264_image_t img;
2319+ /* In: optional information to modify encoder decisions for this frame */
2320+ x264_image_properties_t prop;
2321 /* Out: HRD timing information. Output only when i_nal_hrd is set. */
2322 x264_hrd_t hrd_timing;
2323 /* private user data. libx264 doesn't touch this,
2324--
23251.7.0.4
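
Finally, a minimal sketch of how a caller might use the quant_offsets hook added by the last patch: build one float offset per macroblock in raster order, attach it to the picture, and let the encoder release it through the callback. Only pic.prop.quant_offsets and pic.prop.quant_offsets_free come from the patch; make_roi_map, the frame dimensions and the -2.0 QP bonus for the centre region are illustrative choices.

    #include <stdlib.h>
    #include <x264.h>

    /* Build a per-macroblock QP offset map (raster order, one float per MB)
     * that gives the centre of the frame roughly 2 QP better quality.
     * Negative offsets lower the QP; hypothetical helper for illustration. */
    static float *make_roi_map( int width, int height )
    {
        int mb_w = (width + 15) / 16, mb_h = (height + 15) / 16;
        float *map = malloc( mb_w * mb_h * sizeof(float) );
        if( !map )
            return NULL;
        for( int y = 0; y < mb_h; y++ )
            for( int x = 0; x < mb_w; x++ )
            {
                int centre = x >= mb_w/4 && x < 3*mb_w/4 && y >= mb_h/4 && y < 3*mb_h/4;
                map[y*mb_w + x] = centre ? -2.0f : 0.0f;
            }
        return map;
    }

    /* Per frame, before x264_encoder_encode():
     *     pic.prop.quant_offsets      = make_roi_map( width, height );
     *     pic.prop.quant_offsets_free = free;   // encoder calls this once the offsets are consumed
     * As the header comment notes, adaptive quantization must be enabled for
     * the offsets to take effect. */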