x264 patch series (PATCH 01/10 – 06/10), Jason Garrett-Glaser, May 2010 — pastebin export captured May 28, 2017. Note: each line below carries a fused pastebin line number that is not part of the original patches; the patches begin at the first "From <sha>" line.
1From f3677c61bc31dbe79d69dee092cba504c3f6f523 Mon Sep 17 00:00:00 2001
2From: Jason Garrett-Glaser <darkshikari@gmail.com>
3Date: Mon, 31 May 2010 11:14:22 -0700
4Subject: [PATCH 01/10] Fix cavlc+deblock+8x8dct (regression in r1612)
5 Add cavlc+8x8dct munging to new deblock system.
6 May have caused minor visual artifacts.
7
8---
9 common/deblock.c | 47 -----------------------------------------------
10 common/macroblock.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
11 2 files changed, 44 insertions(+), 49 deletions(-)
12
13diff --git a/common/deblock.c b/common/deblock.c
14index fc039c5..27c73ae 100644
15--- a/common/deblock.c
16+++ b/common/deblock.c
17@@ -24,46 +24,6 @@
18
19 #include "common.h"
20
21-/* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
22- * entropy coding, but per 64 coeffs for the purpose of deblocking */
23-static void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
24-{
25- uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
26- int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
27- for( int x = 0; x<h->sps->i_mb_width; x++ )
28- {
29- memcpy( buf+x, src+x, 16 );
30- if( transform[x] )
31- {
32- int nnz = src[x][0] | src[x][1];
33- src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
34- nnz = src[x][2] | src[x][3];
35- src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
36- }
37- }
38-}
39-
40-static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
41-{
42- uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
43- for( int x = 0; x < h->sps->i_mb_width; x++ )
44- memcpy( dst+x, buf+x, 16 );
45-}
46-
47-static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
48-{
49- func( h, mb_y, buf );
50- if( mb_y > 0 )
51- func( h, mb_y-1, buf + h->sps->i_mb_width );
52- if( h->sh.b_mbaff )
53- {
54- func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
55- if( mb_y > 0 )
56- func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
57- }
58-}
59-
60-
61 /* Deblocking filter */
62 static const uint8_t i_alpha_table[52+12*2] =
63 {
64@@ -344,10 +304,6 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
65 int stride2y = stridey << b_interlaced;
66 int strideuv = h->fdec->i_stride[1];
67 int stride2uv = strideuv << b_interlaced;
68- uint8_t (*nnz_backup)[16] = h->scratch_buffer;
69-
70- if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
71- munge_cavlc_nnz( h, mb_y, nnz_backup, munge_cavlc_nnz_row );
72
73 for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
74 {
75@@ -427,9 +383,6 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
76 if( !transform_8x8 ) FILTER( , 1, 3, qp, qpc );
77 }
78 }
79-
80- if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
81- munge_cavlc_nnz( h, mb_y, nnz_backup, restore_cavlc_nnz_row );
82 }
83
84 #ifdef HAVE_MMX
85diff --git a/common/macroblock.c b/common/macroblock.c
86index ce510e9..1b2d37b 100644
87--- a/common/macroblock.c
88+++ b/common/macroblock.c
89@@ -344,8 +344,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
90 int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
91 int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
92 ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
93- int buf_nnz = !h->param.b_cabac * h->pps->b_transform_8x8_mode * (h->sps->i_mb_width * 4 * 16 * sizeof(uint8_t));
94- scratch_size = X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_nnz );
95+ scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
96 }
97 int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
98 scratch_size = X264_MAX( scratch_size, buf_mbtree );
99@@ -1013,6 +1012,49 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
100 M32( &h->mb.cache.ref[0][x264_scan8[0]+8*2] ) = refbot;
101 M32( &h->mb.cache.ref[0][x264_scan8[0]+8*3] ) = refbot;
102 }
103+
104+ /* Munge NNZ for cavlc + 8x8dct */
105+ if( !h->param.b_cabac && h->pps->b_transform_8x8_mode )
106+ {
107+ uint8_t (*nnz)[24] = h->mb.non_zero_count;
108+ int top = h->mb.i_mb_top_xy;
109+ int left = h->mb.i_mb_left_xy;
110+
111+ if( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top] )
112+ {
113+ int i8 = x264_scan8[0] - 8;
114+ int nnz_top0 = M16( &nnz[top][8] ) | M16( &nnz[top][12] );
115+ int nnz_top1 = M16( &nnz[top][10] ) | M16( &nnz[top][14] );
116+ M16( &h->mb.cache.non_zero_count[i8+0] ) = nnz_top0 ? 0x0101 : 0;
117+ M16( &h->mb.cache.non_zero_count[i8+2] ) = nnz_top1 ? 0x0101 : 0;
118+ }
119+
120+ if( h->mb.i_neighbour & MB_LEFT && h->mb.mb_transform_size[left] )
121+ {
122+ int i8 = x264_scan8[0] - 1;
123+ int nnz_left0 = M16( &nnz[left][2] ) | M16( &nnz[left][6] );
124+ int nnz_left1 = M16( &nnz[left][10] ) | M16( &nnz[left][14] );
125+ h->mb.cache.non_zero_count[i8+8*0] = !!nnz_left0;
126+ h->mb.cache.non_zero_count[i8+8*1] = !!nnz_left0;
127+ h->mb.cache.non_zero_count[i8+8*2] = !!nnz_left1;
128+ h->mb.cache.non_zero_count[i8+8*3] = !!nnz_left1;
129+ }
130+
131+ if( h->mb.mb_transform_size[h->mb.i_mb_xy] )
132+ {
133+ int nnz0 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
134+ int nnz1 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 4]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 6]] );
135+ int nnz2 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[10]] );
136+ int nnz3 = M16( &h->mb.cache.non_zero_count[x264_scan8[12]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[14]] );
137+ uint32_t nnztop = pack16to32( !!nnz0, !!nnz1 ) * 0x0101;
138+ uint32_t nnzbot = pack16to32( !!nnz2, !!nnz3 ) * 0x0101;
139+
140+ M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*0] ) = nnztop;
141+ M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*1] ) = nnztop;
142+ M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*2] ) = nnzbot;
143+ M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*3] ) = nnzbot;
144+ }
145+ }
146 }
147
148 static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i )
149--
1501.7.0.4
151
152
153From 925b5fd15ac24ccbce54f5e2ff6119f8f4f4710c Mon Sep 17 00:00:00 2001
154From: Jason Garrett-Glaser <darkshikari@gmail.com>
155Date: Sun, 30 May 2010 09:42:53 -0700
156Subject: [PATCH 02/10] Fix ultrafast to actually turn off weightb
157
158---
159 common/common.c | 1 +
160 1 files changed, 1 insertions(+), 0 deletions(-)
161
162diff --git a/common/common.c b/common/common.c
163index 62bef99..fccf2b0 100644
164--- a/common/common.c
165+++ b/common/common.c
166@@ -183,6 +183,7 @@ static int x264_param_apply_preset( x264_param_t *param, const char *preset )
167 param->i_bframe_adaptive = X264_B_ADAPT_NONE;
168 param->rc.b_mb_tree = 0;
169 param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
170+ param->analyse.b_weighted_bipred = 0;
171 }
172 else if( !strcasecmp( preset, "superfast" ) )
173 {
174--
1751.7.0.4
176
177
178From 49a832188629fdea4269977a48102029a6300b8b Mon Sep 17 00:00:00 2001
179From: Jason Garrett-Glaser <darkshikari@gmail.com>
180Date: Thu, 27 May 2010 12:31:41 -0700
181Subject: [PATCH 03/10] Fix omission in libx264 tuning documentation
182
183---
184 x264.h | 2 +-
185 1 files changed, 1 insertions(+), 1 deletions(-)
186
187diff --git a/x264.h b/x264.h
188index 6d7b703..95efd88 100644
189--- a/x264.h
190+++ b/x264.h
191@@ -446,7 +446,7 @@ static const char * const x264_tune_names[] = { "film", "animation", "grain", "s
192
193 /* Multiple tunings can be used if separated by a delimiter in ",./-+",
194 * however multiple psy tunings cannot be used.
195- * film, animation, grain, psnr, and ssim are psy tunings.
196+ * film, animation, grain, stillimage, psnr, and ssim are psy tunings.
197 *
198 * returns 0 on success, negative on failure (e.g. invalid preset/tune name). */
199 int x264_param_default_preset( x264_param_t *, const char *preset, const char *tune );
200--
2011.7.0.4
202
203
204From 69cda7770f3851d2c5785af74b82ba583794c7a6 Mon Sep 17 00:00:00 2001
205From: Jason Garrett-Glaser <darkshikari@gmail.com>
206Date: Wed, 26 May 2010 12:55:35 -0700
207Subject: [PATCH 04/10] Merge some of adaptive quant and weightp
208 Eliminate redundant work; both of them were calculating variance of the frame.
209
210---
211 common/frame.h | 4 +-
212 encoder/analyse.h | 1 -
213 encoder/encoder.c | 12 ++---
214 encoder/ratecontrol.c | 124 +++++++++++++++++++++++++++++++-----------------
215 encoder/slicetype.c | 31 ++----------
216 5 files changed, 92 insertions(+), 80 deletions(-)
217
218diff --git a/common/frame.h b/common/frame.h
219index 91d27b5..ca5cb7a 100644
220--- a/common/frame.h
221+++ b/common/frame.h
222@@ -118,8 +118,8 @@ typedef struct x264_frame
223 uint16_t *i_inv_qscale_factor;
224 int b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */
225 float f_weighted_cost_delta[X264_BFRAME_MAX+2];
226- uint32_t i_pixel_sum;
227- uint64_t i_pixel_ssd;
228+ uint32_t i_pixel_sum[3];
229+ uint64_t i_pixel_ssd[3];
230
231 /* hrd */
232 x264_hrd_t hrd_timing;
233diff --git a/encoder/analyse.h b/encoder/analyse.h
234index 7c2c22c..53e4c2e 100644
235--- a/encoder/analyse.h
236+++ b/encoder/analyse.h
237@@ -33,7 +33,6 @@ void x264_slicetype_decide( x264_t *h );
238 void x264_slicetype_analyse( x264_t *h, int keyframe );
239
240 int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
241-void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame );
242
243 int x264_lookahead_init( x264_t *h, int i_slicetype_length );
244 int x264_lookahead_is_empty( x264_t *h );
245diff --git a/encoder/encoder.c b/encoder/encoder.c
246index 52017ff..6e0dc54 100644
247--- a/encoder/encoder.c
248+++ b/encoder/encoder.c
249@@ -2246,21 +2246,17 @@ int x264_encoder_encode( x264_t *h,
250 fenc->i_pic_struct = PIC_STRUCT_PROGRESSIVE;
251 }
252
253- if( h->frames.b_have_lowres )
254- {
255- if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
256- x264_weight_plane_analyse( h, fenc );
257- x264_frame_init_lowres( h, fenc );
258- }
259-
260 if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
261 {
262 if( x264_macroblock_tree_read( h, fenc ) )
263 return -1;
264 }
265- else if( h->param.rc.i_aq_mode )
266+ else
267 x264_adaptive_quant_frame( h, fenc );
268
269+ if( h->frames.b_have_lowres )
270+ x264_frame_init_lowres( h, fenc );
271+
272 /* 2: Place the frame into the queue for its slice type decision */
273 x264_lookahead_put_frame( h, fenc );
274
275diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
276index a725a24..bf0a400 100644
277--- a/encoder/ratecontrol.c
278+++ b/encoder/ratecontrol.c
279@@ -215,12 +215,14 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
280 stride <<= h->mb.b_interlaced;
281 uint64_t res = h->pixf.var[pix]( frame->plane[i] + offset, stride );
282 uint32_t sum = (uint32_t)res;
283- uint32_t sqr = res >> 32;
284- return sqr - (sum * sum >> shift);
285+ uint32_t ssd = res >> 32;
286+ frame->i_pixel_sum[i] += sum;
287+ frame->i_pixel_ssd[i] += ssd;
288+ return ssd - (sum * sum >> shift);
289 }
290
291 // Find the total AC energy of the block in all planes.
292-static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
293+static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
294 {
295 /* This function contains annoying hacks because GCC has a habit of reordering emms
296 * and putting it after floating point ops. As a result, we put the emms at the end of the
297@@ -239,56 +241,90 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
298 * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
299 float strength;
300 float avg_adj = 0.f;
301- /* Need to init it anyways for MB tree. */
302- if( h->param.rc.f_aq_strength == 0 )
303- {
304- memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
305- memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
306- if( h->frames.b_have_lowres )
307- for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
308- frame->i_inv_qscale_factor[mb_xy] = 256;
309- return;
310+ int width = h->sps->i_mb_width;
311+ int height = h->sps->i_mb_height;
312+ /* Initialize frame stats */
313+ for( int i = 0; i < 3; i++ )
314+ {
315+ frame->i_pixel_sum[i] = 0;
316+ frame->i_pixel_ssd[i] = 0;
317 }
318
319- if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
320+ /* Degenerate cases */
321+ if( h->param.rc.i_aq_mode == X264_AQ_NONE || h->param.rc.f_aq_strength == 0 )
322 {
323- float avg_adj_pow2 = 0.f;
324- for( int mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
325- for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
326- {
327- uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
328- float qp_adj = powf( energy + 1, 0.125f );
329- frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
330- avg_adj += qp_adj;
331- avg_adj_pow2 += qp_adj * qp_adj;
332- }
333- avg_adj /= h->mb.i_mb_count;
334- avg_adj_pow2 /= h->mb.i_mb_count;
335- strength = h->param.rc.f_aq_strength * avg_adj;
336- avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
337+ /* Need to init it anyways for MB tree */
338+ if( h->param.rc.f_aq_strength == 0 )
339+ {
340+ memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
341+ memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
342+ if( h->frames.b_have_lowres )
343+ for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
344+ frame->i_inv_qscale_factor[mb_xy] = 256;
345+ }
346+ /* Need variance data for weighted prediction */
347+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
348+ {
349+ for( int mb_y = 0; mb_y < height; mb_y++ )
350+ for( int mb_x = 0; mb_x < width; mb_x++ )
351+ x264_ac_energy_mb( h, mb_x, mb_y, frame );
352+ }
353+ else
354+ return;
355 }
356+ /* Actual adaptive quantization */
357 else
358- strength = h->param.rc.f_aq_strength * 1.0397f;
359-
360- for( int mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
361- for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
362+ {
363+ if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
364 {
365- float qp_adj;
366- if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
367- {
368- qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
369- qp_adj = strength * (qp_adj - avg_adj);
370- }
371- else
372+ float avg_adj_pow2 = 0.f;
373+ for( int mb_y = 0; mb_y < height; mb_y++ )
374+ for( int mb_x = 0; mb_x < width; mb_x++ )
375+ {
376+ uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
377+ float qp_adj = powf( energy + 1, 0.125f );
378+ frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
379+ avg_adj += qp_adj;
380+ avg_adj_pow2 += qp_adj * qp_adj;
381+ }
382+ avg_adj /= h->mb.i_mb_count;
383+ avg_adj_pow2 /= h->mb.i_mb_count;
384+ strength = h->param.rc.f_aq_strength * avg_adj;
385+ avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
386+ }
387+ else
388+ strength = h->param.rc.f_aq_strength * 1.0397f;
389+
390+ for( int mb_y = 0; mb_y < height; mb_y++ )
391+ for( int mb_x = 0; mb_x < width; mb_x++ )
392 {
393- uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
394- qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
395+ float qp_adj;
396+ if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
397+ {
398+ qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
399+ qp_adj = strength * (qp_adj - avg_adj);
400+ }
401+ else
402+ {
403+ uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
404+ qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
405+ }
406+ frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
407+ frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
408+ if( h->frames.b_have_lowres )
409+ frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
410 }
411- frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
412- frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
413- if( h->frames.b_have_lowres )
414- frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
415- }
416+ }
417+
418+ /* Remove mean from SSD calculation */
419+ for( int i = 0; i < 3; i++ )
420+ {
421+ uint64_t ssd = frame->i_pixel_ssd[i];
422+ uint64_t sum = frame->i_pixel_sum[i];
423+ int w = width*16>>!!i;
424+ int h = height*16>>!!i;
425+ frame->i_pixel_ssd[i] = ssd - (sum * sum + w * h / 2) / (w * h);
426+ }
427 }
428
429 int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
430diff --git a/encoder/slicetype.c b/encoder/slicetype.c
431index 9352367..e454e12 100644
432--- a/encoder/slicetype.c
433+++ b/encoder/slicetype.c
434@@ -67,25 +67,6 @@ static void x264_weight_get_h264( unsigned int weight_nonh264, int offset, x264_
435 w->i_scale = X264_MIN( w->i_scale, 127 );
436 }
437
438-void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame )
439-{
440- uint32_t sad = 0;
441- uint64_t ssd = 0;
442- uint8_t *p = frame->plane[0];
443- int stride = frame->i_stride[0];
444- int width = frame->i_width[0];
445- int height = frame->i_lines[0];
446- for( int y = 0; y < height>>4; y++, p += stride*16 )
447- for( int x = 0; x < width; x += 16 )
448- {
449- uint64_t res = h->pixf.var[PIXEL_16x16]( p + x, stride );
450- sad += (uint32_t)res;
451- ssd += res >> 32;
452- }
453- frame->i_pixel_sum = sad;
454- frame->i_pixel_ssd = ssd - ((uint64_t)sad * sad + width * height / 2) / (width * height);
455-}
456-
457 static NOINLINE uint8_t *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, uint8_t *dest )
458 {
459 int ref0_distance = fenc->i_frame - ref->i_frame - 1;
460@@ -167,10 +148,10 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
461 int found;
462 x264_weight_t *weights = fenc->weight[0];
463
464- fenc_var = round( sqrt( fenc->i_pixel_ssd ) );
465- ref_var = round( sqrt( ref->i_pixel_ssd ) );
466- fenc_mean = (float)fenc->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
467- ref_mean = (float) ref->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
468+ fenc_var = round( sqrt( fenc->i_pixel_ssd[0] ) );
469+ ref_var = round( sqrt( ref->i_pixel_ssd[0] ) );
470+ fenc_mean = (float)fenc->i_pixel_sum[0] / (fenc->i_lines[0] * fenc->i_width[0]);
471+ ref_mean = (float) ref->i_pixel_sum[0] / (fenc->i_lines[0] * fenc->i_width[0]);
472
473 //early termination
474 if( fabs( ref_mean - fenc_mean ) < 0.5 && fabs( 1 - fenc_var / ref_var ) < epsilon )
475@@ -534,8 +515,8 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
476 do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
477 if( do_search[0] )
478 {
479- if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART
480- || h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
481+ if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART ||
482+ h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
483 {
484 x264_emms();
485 x264_weights_analyse( h, frames[b], frames[p0], 1 );
486--
4871.7.0.4
488
489
490From 0bf2d9e3e55fa6b1cda4ca2b1066c3034c575225 Mon Sep 17 00:00:00 2001
491From: Jason Garrett-Glaser <darkshikari@gmail.com>
492Date: Thu, 27 May 2010 10:42:15 -0700
493Subject: [PATCH 05/10] Add fast skip in lookahead motion search
494 Helps speed very significantly on motionless blocks.
495
496---
497 encoder/slicetype.c | 16 +++++++++++++++-
498 1 files changed, 15 insertions(+), 1 deletions(-)
499
500diff --git a/encoder/slicetype.c b/encoder/slicetype.c
501index e454e12..d7cfe5c 100644
502--- a/encoder/slicetype.c
503+++ b/encoder/slicetype.c
504@@ -379,11 +379,25 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
505 CP32( m[l].mvp, mvc[0] );
506 else
507 x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] );
508- x264_me_search( h, &m[l], mvc, i_mvc );
509
510+ /* Fast skip for cases of near-zero residual. Shortcut: don't bother except in the mv0 case,
511+ * since anything else is likely to have enough residual to not trigger the skip. */
512+ if( !M32( m[l].mvp ) )
513+ {
514+ m[l].cost = h->pixf.mbcmp[PIXEL_8x8]( m[l].p_fenc[0], FENC_STRIDE, m[l].p_fref[0], m[l].i_stride[0] );
515+ if( m[l].cost < 64 )
516+ {
517+ M32( m[l].mv ) = 0;
518+ goto skip_motionest;
519+ }
520+ }
521+
522+ x264_me_search( h, &m[l], mvc, i_mvc );
523 m[l].cost -= 2; // remove mvcost from skip mbs
524 if( M32( m[l].mv ) )
525 m[l].cost += 5;
526+
527+skip_motionest:
528 CP32( fenc_mvs[l], m[l].mv );
529 *fenc_costs[l] = m[l].cost;
530 }
531--
5321.7.0.4
533
534
535From f6abca2c4c0e582d522e135773b88f1ab3d459d2 Mon Sep 17 00:00:00 2001
536From: Jason Garrett-Glaser <darkshikari@gmail.com>
537Date: Thu, 27 May 2010 14:27:32 -0700
538Subject: [PATCH 06/10] x86 assembly code for NAL escaping
539 Up to ~10x faster than C depending on CPU.
540 Helps the most at very high bitrates (e.g. lossless).
541 Also make the C code faster and simpler.
542
543---
544 Makefile | 4 +-
545 common/bitstream.c | 92 ++++++++++++++
546 common/bitstream.h | 299 ++++++++++++++++++++++++++++++++++++++++++++
547 common/bs.h | 291 ------------------------------------------
548 common/common.c | 54 --------
549 common/common.h | 5 +-
550 common/x86/bitstream-a.asm | 112 +++++++++++++++++
551 common/x86/deblock-a.asm | 1 +
552 encoder/encoder.c | 3 +-
553 tools/checkasm.c | 52 ++++++++-
554 10 files changed, 561 insertions(+), 352 deletions(-)
555 create mode 100644 common/bitstream.c
556 create mode 100644 common/bitstream.h
557 delete mode 100644 common/bs.h
558 create mode 100644 common/x86/bitstream-a.asm
559
560diff --git a/Makefile b/Makefile
561index 0b43a3e..519e181 100644
562--- a/Makefile
563+++ b/Makefile
564@@ -8,7 +8,7 @@ SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
565 common/frame.c common/dct.c common/cpu.c common/cabac.c \
566 common/common.c common/mdate.c common/rectangle.c \
567 common/set.c common/quant.c common/deblock.c common/vlc.c \
568- common/mvpred.c \
569+ common/mvpred.c common/bitstream.c \
570 encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
571 encoder/set.c encoder/macroblock.c encoder/cabac.c \
572 encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
573@@ -52,7 +52,7 @@ endif
574 ifneq ($(AS),)
575 X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \
576 mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
577- cpu-a.asm dct-32.asm
578+ cpu-a.asm dct-32.asm bitstream-a.asm
579 X86SRC = $(X86SRC0:%=common/x86/%)
580
581 ifeq ($(ARCH),X86)
582diff --git a/common/bitstream.c b/common/bitstream.c
583new file mode 100644
584index 0000000..0aaac21
585--- /dev/null
586+++ b/common/bitstream.c
587@@ -0,0 +1,92 @@
588+/*****************************************************************************
589+ * bitstream.c: h264 encoder library
590+ *****************************************************************************
591+ * Copyright (C) 2010 x264 project
592+ *
593+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
594+ * Jason Garrett-Glaser <darkshikari@gmail.com>
595+ *
596+ * This program is free software; you can redistribute it and/or modify
597+ * it under the terms of the GNU General Public License as published by
598+ * the Free Software Foundation; either version 2 of the License, or
599+ * (at your option) any later version.
600+ *
601+ * This program is distributed in the hope that it will be useful,
602+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
603+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
604+ * GNU General Public License for more details.
605+ *
606+ * You should have received a copy of the GNU General Public License
607+ * along with this program; if not, write to the Free Software
608+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
609+ *****************************************************************************/
610+
611+#include "common.h"
612+
613+static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
614+{
615+ if( src < end ) *dst++ = *src++;
616+ if( src < end ) *dst++ = *src++;
617+ while( src < end )
618+ {
619+ if( src[0] <= 0x03 && !dst[-2] && !dst[-1] )
620+ *dst++ = 0x03;
621+ *dst++ = *src++;
622+ }
623+ return dst;
624+}
625+
626+#ifdef HAVE_MMX
627+uint8_t *x264_nal_escape_mmxext( uint8_t *dst, uint8_t *src, uint8_t *end );
628+uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
629+#endif
630+
631+/****************************************************************************
632+ * x264_nal_encode:
633+ ****************************************************************************/
634+int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode )
635+{
636+ uint8_t *src = nal->p_payload;
637+ uint8_t *end = nal->p_payload + nal->i_payload;
638+ uint8_t *orig_dst = dst;
639+
640+ if( h->param.b_annexb )
641+ {
642+ if( b_long_startcode )
643+ *dst++ = 0x00;
644+ *dst++ = 0x00;
645+ *dst++ = 0x00;
646+ *dst++ = 0x01;
647+ }
648+ else /* save room for size later */
649+ dst += 4;
650+
651+ /* nal header */
652+ *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
653+
654+ dst = h->bsf.nal_escape( dst, src, end );
655+ int size = (dst - orig_dst) - 4;
656+
657+ /* Write the size header for mp4/etc */
658+ if( !h->param.b_annexb )
659+ {
660+ /* Size doesn't include the size of the header we're writing now. */
661+ orig_dst[0] = size>>24;
662+ orig_dst[1] = size>>16;
663+ orig_dst[2] = size>> 8;
664+ orig_dst[3] = size>> 0;
665+ }
666+
667+ return size+4;
668+}
669+
670+void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
671+{
672+ pf->nal_escape = x264_nal_escape_c;
673+#ifdef HAVE_MMX
674+ if( cpu&X264_CPU_MMXEXT )
675+ pf->nal_escape = x264_nal_escape_mmxext;
676+ if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
677+ pf->nal_escape = x264_nal_escape_sse2;
678+#endif
679+}
680diff --git a/common/bitstream.h b/common/bitstream.h
681new file mode 100644
682index 0000000..d018c7d
683--- /dev/null
684+++ b/common/bitstream.h
685@@ -0,0 +1,299 @@
686+/*****************************************************************************
687+ * bitstream.h: h264 encoder library
688+ *****************************************************************************
689+ * Copyright (C) 2003-2008 x264 project
690+ *
691+ * Authors: Loren Merritt <lorenm@u.washington.edu>
692+ * Jason Garrett-Glaser <darkshikari@gmail.com>
693+ * Laurent Aimar <fenrir@via.ecp.fr>
694+ *
695+ * This program is free software; you can redistribute it and/or modify
696+ * it under the terms of the GNU General Public License as published by
697+ * the Free Software Foundation; either version 2 of the License, or
698+ * (at your option) any later version.
699+ *
700+ * This program is distributed in the hope that it will be useful,
701+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
702+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
703+ * GNU General Public License for more details.
704+ *
705+ * You should have received a copy of the GNU General Public License
706+ * along with this program; if not, write to the Free Software
707+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
708+ *****************************************************************************/
709+
710+#ifndef X264_BS_H
711+#define X264_BS_H
712+
713+typedef struct
714+{
715+ uint8_t i_bits;
716+ uint8_t i_size;
717+} vlc_t;
718+
719+typedef struct
720+{
721+ uint16_t i_bits;
722+ uint8_t i_size;
723+ /* Next level table to use */
724+ uint8_t i_next;
725+} vlc_large_t;
726+
727+typedef struct bs_s
728+{
729+ uint8_t *p_start;
730+ uint8_t *p;
731+ uint8_t *p_end;
732+
733+ intptr_t cur_bits;
734+ int i_left; /* i_count number of available bits */
735+ int i_bits_encoded; /* RD only */
736+} bs_t;
737+
738+typedef struct
739+{
740+ int last;
741+ int16_t level[16];
742+ uint8_t run[16];
743+} x264_run_level_t;
744+
745+extern const vlc_t x264_coeff0_token[5];
746+extern const vlc_t x264_coeff_token[5][16][4];
747+extern const vlc_t x264_total_zeros[15][16];
748+extern const vlc_t x264_total_zeros_dc[3][4];
749+extern const vlc_t x264_run_before[7][16];
750+
751+typedef struct
752+{
753+ uint8_t *(*nal_escape) ( uint8_t *dst, uint8_t *src, uint8_t *end );
754+} x264_bitstream_function_t;
755+
756+int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode );
757+void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );
758+
759+/* A larger level table size theoretically could help a bit at extremely
760+ * high bitrates, but the cost in cache is usually too high for it to be
761+ * useful.
762+ * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
763+ * FIXME: Do further testing? */
764+#define LEVEL_TABLE_SIZE 128
765+extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
766+
767+static inline void bs_init( bs_t *s, void *p_data, int i_data )
768+{
769+ int offset = ((intptr_t)p_data & 3);
770+ s->p = s->p_start = (uint8_t*)p_data - offset;
771+ s->p_end = (uint8_t*)p_data + i_data;
772+ s->i_left = (WORD_SIZE - offset)*8;
773+ s->cur_bits = endian_fix32( M32(s->p) );
774+ s->cur_bits >>= (4-offset)*8;
775+}
776+static inline int bs_pos( bs_t *s )
777+{
778+ return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
779+}
780+
781+/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
782+static inline void bs_flush( bs_t *s )
783+{
784+ M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
785+ s->p += WORD_SIZE - s->i_left / 8;
786+ s->i_left = WORD_SIZE*8;
787+}
788+/* The inverse of bs_flush: prepare the bitstream to be written to again. */
789+static inline void bs_realign( bs_t *s )
790+{
791+ int offset = ((intptr_t)s->p & 3);
792+ if( offset )
793+ {
794+ s->p = (uint8_t*)s->p - offset;
795+ s->i_left = (WORD_SIZE - offset)*8;
796+ s->cur_bits = endian_fix32( M32(s->p) );
797+ s->cur_bits >>= (4-offset)*8;
798+ }
799+}
800+
801+static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
802+{
803+ if( WORD_SIZE == 8 )
804+ {
805+ s->cur_bits = (s->cur_bits << i_count) | i_bits;
806+ s->i_left -= i_count;
807+ if( s->i_left <= 32 )
808+ {
809+#ifdef WORDS_BIGENDIAN
810+ M32( s->p ) = s->cur_bits >> (32 - s->i_left);
811+#else
812+ M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
813+#endif
814+ s->i_left += 32;
815+ s->p += 4;
816+ }
817+ }
818+ else
819+ {
820+ if( i_count < s->i_left )
821+ {
822+ s->cur_bits = (s->cur_bits << i_count) | i_bits;
823+ s->i_left -= i_count;
824+ }
825+ else
826+ {
827+ i_count -= s->i_left;
828+ s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
829+ M32( s->p ) = endian_fix( s->cur_bits );
830+ s->p += 4;
831+ s->cur_bits = i_bits;
832+ s->i_left = 32 - i_count;
833+ }
834+ }
835+}
836+
837+/* Special case to eliminate branch in normal bs_write. */
838+/* Golomb never writes an even-size code, so this is only used in slice headers. */
839+static inline void bs_write32( bs_t *s, uint32_t i_bits )
840+{
841+ bs_write( s, 16, i_bits >> 16 );
842+ bs_write( s, 16, i_bits );
843+}
844+
845+static inline void bs_write1( bs_t *s, uint32_t i_bit )
846+{
847+ s->cur_bits <<= 1;
848+ s->cur_bits |= i_bit;
849+ s->i_left--;
850+ if( s->i_left == WORD_SIZE*8-32 )
851+ {
852+ M32( s->p ) = endian_fix32( s->cur_bits );
853+ s->p += 4;
854+ s->i_left = WORD_SIZE*8;
855+ }
856+}
857+
858+static inline void bs_align_0( bs_t *s )
859+{
860+ bs_write( s, s->i_left&7, 0 );
861+ bs_flush( s );
862+}
863+static inline void bs_align_1( bs_t *s )
864+{
865+ bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 );
866+ bs_flush( s );
867+}
868+static inline void bs_align_10( bs_t *s )
869+{
870+ if( s->i_left&7 )
871+ bs_write( s, s->i_left&7, 1 << ( (s->i_left&7) - 1 ) );
872+}
873+
874+/* golomb functions */
875+
876+static const uint8_t x264_ue_size_tab[256] =
877+{
878+ 1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
879+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
880+ 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
881+ 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
882+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
883+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
884+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
885+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
886+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
887+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
888+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
889+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
890+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
891+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
892+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
893+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
894+};
895+
896+static inline void bs_write_ue_big( bs_t *s, unsigned int val )
897+{
898+ int size = 0;
899+ int tmp = ++val;
900+ if( tmp >= 0x10000 )
901+ {
902+ size = 32;
903+ tmp >>= 16;
904+ }
905+ if( tmp >= 0x100 )
906+ {
907+ size += 16;
908+ tmp >>= 8;
909+ }
910+ size += x264_ue_size_tab[tmp];
911+ bs_write( s, size>>1, 0 );
912+ bs_write( s, (size>>1)+1, val );
913+}
914+
915+/* Only works on values under 255. */
916+static inline void bs_write_ue( bs_t *s, int val )
917+{
918+ bs_write( s, x264_ue_size_tab[val+1], val+1 );
919+}
920+
921+static inline void bs_write_se( bs_t *s, int val )
922+{
923+ int size = 0;
924+ /* Faster than (val <= 0 ? -val*2+1 : val*2) */
925+ /* 4 instructions on x86, 3 on ARM */
926+ int tmp = 1 - val*2;
927+ if( tmp < 0 ) tmp = val*2;
928+ val = tmp;
929+
930+ if( tmp >= 0x100 )
931+ {
932+ size = 16;
933+ tmp >>= 8;
934+ }
935+ size += x264_ue_size_tab[tmp];
936+ bs_write( s, size, val );
937+}
938+
939+static inline void bs_write_te( bs_t *s, int x, int val )
940+{
941+ if( x == 1 )
942+ bs_write1( s, 1^val );
943+ else //if( x > 1 )
944+ bs_write_ue( s, val );
945+}
946+
947+static inline void bs_rbsp_trailing( bs_t *s )
948+{
949+ bs_write1( s, 1 );
950+ bs_write( s, s->i_left&7, 0 );
951+}
952+
953+static ALWAYS_INLINE int bs_size_ue( unsigned int val )
954+{
955+ return x264_ue_size_tab[val+1];
956+}
957+
958+static ALWAYS_INLINE int bs_size_ue_big( unsigned int val )
959+{
960+ if( val < 255 )
961+ return x264_ue_size_tab[val+1];
962+ else
963+ return x264_ue_size_tab[(val+1)>>8] + 16;
964+}
965+
966+static ALWAYS_INLINE int bs_size_se( int val )
967+{
968+ int tmp = 1 - val*2;
969+ if( tmp < 0 ) tmp = val*2;
970+ if( tmp < 256 )
971+ return x264_ue_size_tab[tmp];
972+ else
973+ return x264_ue_size_tab[tmp>>8]+16;
974+}
975+
976+static ALWAYS_INLINE int bs_size_te( int x, int val )
977+{
978+ if( x == 1 )
979+ return 1;
980+ else //if( x > 1 )
981+ return x264_ue_size_tab[val+1];
982+}
983+
984+#endif
985diff --git a/common/bs.h b/common/bs.h
986deleted file mode 100644
987index 343a3c9..0000000
988--- a/common/bs.h
989+++ /dev/null
990@@ -1,291 +0,0 @@
991-/*****************************************************************************
992- * bs.h :
993- *****************************************************************************
994- * Copyright (C) 2003-2008 x264 project
995- *
996- * Authors: Loren Merritt <lorenm@u.washington.edu>
997- * Jason Garrett-Glaser <darkshikari@gmail.com>
998- * Laurent Aimar <fenrir@via.ecp.fr>
999- *
1000- * This program is free software; you can redistribute it and/or modify
1001- * it under the terms of the GNU General Public License as published by
1002- * the Free Software Foundation; either version 2 of the License, or
1003- * (at your option) any later version.
1004- *
1005- * This program is distributed in the hope that it will be useful,
1006- * but WITHOUT ANY WARRANTY; without even the implied warranty of
1007- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1008- * GNU General Public License for more details.
1009- *
1010- * You should have received a copy of the GNU General Public License
1011- * along with this program; if not, write to the Free Software
1012- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
1013- *****************************************************************************/
1014-
1015-#ifndef X264_BS_H
1016-#define X264_BS_H
1017-
1018-typedef struct
1019-{
1020- uint8_t i_bits;
1021- uint8_t i_size;
1022-} vlc_t;
1023-
1024-typedef struct
1025-{
1026- uint16_t i_bits;
1027- uint8_t i_size;
1028- /* Next level table to use */
1029- uint8_t i_next;
1030-} vlc_large_t;
1031-
1032-typedef struct bs_s
1033-{
1034- uint8_t *p_start;
1035- uint8_t *p;
1036- uint8_t *p_end;
1037-
1038- intptr_t cur_bits;
1039- int i_left; /* i_count number of available bits */
1040- int i_bits_encoded; /* RD only */
1041-} bs_t;
1042-
1043-typedef struct
1044-{
1045- int last;
1046- int16_t level[16];
1047- uint8_t run[16];
1048-} x264_run_level_t;
1049-
1050-extern const vlc_t x264_coeff0_token[5];
1051-extern const vlc_t x264_coeff_token[5][16][4];
1052-extern const vlc_t x264_total_zeros[15][16];
1053-extern const vlc_t x264_total_zeros_dc[3][4];
1054-extern const vlc_t x264_run_before[7][16];
1055-
1056-/* A larger level table size theoretically could help a bit at extremely
1057- * high bitrates, but the cost in cache is usually too high for it to be
1058- * useful.
1059- * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
1060- * FIXME: Do further testing? */
1061-#define LEVEL_TABLE_SIZE 128
1062-extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
1063-
1064-static inline void bs_init( bs_t *s, void *p_data, int i_data )
1065-{
1066- int offset = ((intptr_t)p_data & 3);
1067- s->p = s->p_start = (uint8_t*)p_data - offset;
1068- s->p_end = (uint8_t*)p_data + i_data;
1069- s->i_left = (WORD_SIZE - offset)*8;
1070- s->cur_bits = endian_fix32( M32(s->p) );
1071- s->cur_bits >>= (4-offset)*8;
1072-}
1073-static inline int bs_pos( bs_t *s )
1074-{
1075- return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
1076-}
1077-
1078-/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
1079-static inline void bs_flush( bs_t *s )
1080-{
1081- M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
1082- s->p += WORD_SIZE - s->i_left / 8;
1083- s->i_left = WORD_SIZE*8;
1084-}
1085-/* The inverse of bs_flush: prepare the bitstream to be written to again. */
1086-static inline void bs_realign( bs_t *s )
1087-{
1088- int offset = ((intptr_t)s->p & 3);
1089- if( offset )
1090- {
1091- s->p = (uint8_t*)s->p - offset;
1092- s->i_left = (WORD_SIZE - offset)*8;
1093- s->cur_bits = endian_fix32( M32(s->p) );
1094- s->cur_bits >>= (4-offset)*8;
1095- }
1096-}
1097-
1098-static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
1099-{
1100- if( WORD_SIZE == 8 )
1101- {
1102- s->cur_bits = (s->cur_bits << i_count) | i_bits;
1103- s->i_left -= i_count;
1104- if( s->i_left <= 32 )
1105- {
1106-#ifdef WORDS_BIGENDIAN
1107- M32( s->p ) = s->cur_bits >> (32 - s->i_left);
1108-#else
1109- M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
1110-#endif
1111- s->i_left += 32;
1112- s->p += 4;
1113- }
1114- }
1115- else
1116- {
1117- if( i_count < s->i_left )
1118- {
1119- s->cur_bits = (s->cur_bits << i_count) | i_bits;
1120- s->i_left -= i_count;
1121- }
1122- else
1123- {
1124- i_count -= s->i_left;
1125- s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
1126- M32( s->p ) = endian_fix( s->cur_bits );
1127- s->p += 4;
1128- s->cur_bits = i_bits;
1129- s->i_left = 32 - i_count;
1130- }
1131- }
1132-}
1133-
1134-/* Special case to eliminate branch in normal bs_write. */
1135-/* Golomb never writes an even-size code, so this is only used in slice headers. */
1136-static inline void bs_write32( bs_t *s, uint32_t i_bits )
1137-{
1138- bs_write( s, 16, i_bits >> 16 );
1139- bs_write( s, 16, i_bits );
1140-}
1141-
1142-static inline void bs_write1( bs_t *s, uint32_t i_bit )
1143-{
1144- s->cur_bits <<= 1;
1145- s->cur_bits |= i_bit;
1146- s->i_left--;
1147- if( s->i_left == WORD_SIZE*8-32 )
1148- {
1149- M32( s->p ) = endian_fix32( s->cur_bits );
1150- s->p += 4;
1151- s->i_left = WORD_SIZE*8;
1152- }
1153-}
1154-
1155-static inline void bs_align_0( bs_t *s )
1156-{
1157- bs_write( s, s->i_left&7, 0 );
1158- bs_flush( s );
1159-}
1160-static inline void bs_align_1( bs_t *s )
1161-{
1162- bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 );
1163- bs_flush( s );
1164-}
1165-static inline void bs_align_10( bs_t *s )
1166-{
1167- if( s->i_left&7 )
1168- bs_write( s, s->i_left&7, 1 << ( (s->i_left&7) - 1 ) );
1169-}
1170-
1171-/* golomb functions */
1172-
1173-static const uint8_t x264_ue_size_tab[256] =
1174-{
1175- 1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
1176- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
1177- 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
1178- 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
1179- 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1180- 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1181- 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1182- 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
1183- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1184- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1185- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1186- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1187- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1188- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1189- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1190- 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1191-};
1192-
1193-static inline void bs_write_ue_big( bs_t *s, unsigned int val )
1194-{
1195- int size = 0;
1196- int tmp = ++val;
1197- if( tmp >= 0x10000 )
1198- {
1199- size = 32;
1200- tmp >>= 16;
1201- }
1202- if( tmp >= 0x100 )
1203- {
1204- size += 16;
1205- tmp >>= 8;
1206- }
1207- size += x264_ue_size_tab[tmp];
1208- bs_write( s, size>>1, 0 );
1209- bs_write( s, (size>>1)+1, val );
1210-}
1211-
1212-/* Only works on values under 255. */
1213-static inline void bs_write_ue( bs_t *s, int val )
1214-{
1215- bs_write( s, x264_ue_size_tab[val+1], val+1 );
1216-}
1217-
1218-static inline void bs_write_se( bs_t *s, int val )
1219-{
1220- int size = 0;
1221- /* Faster than (val <= 0 ? -val*2+1 : val*2) */
1222- /* 4 instructions on x86, 3 on ARM */
1223- int tmp = 1 - val*2;
1224- if( tmp < 0 ) tmp = val*2;
1225- val = tmp;
1226-
1227- if( tmp >= 0x100 )
1228- {
1229- size = 16;
1230- tmp >>= 8;
1231- }
1232- size += x264_ue_size_tab[tmp];
1233- bs_write( s, size, val );
1234-}
1235-
1236-static inline void bs_write_te( bs_t *s, int x, int val )
1237-{
1238- if( x == 1 )
1239- bs_write1( s, 1^val );
1240- else //if( x > 1 )
1241- bs_write_ue( s, val );
1242-}
1243-
1244-static inline void bs_rbsp_trailing( bs_t *s )
1245-{
1246- bs_write1( s, 1 );
1247- bs_write( s, s->i_left&7, 0 );
1248-}
1249-
1250-static ALWAYS_INLINE int bs_size_ue( unsigned int val )
1251-{
1252- return x264_ue_size_tab[val+1];
1253-}
1254-
1255-static ALWAYS_INLINE int bs_size_ue_big( unsigned int val )
1256-{
1257- if( val < 255 )
1258- return x264_ue_size_tab[val+1];
1259- else
1260- return x264_ue_size_tab[(val+1)>>8] + 16;
1261-}
1262-
1263-static ALWAYS_INLINE int bs_size_se( int val )
1264-{
1265- int tmp = 1 - val*2;
1266- if( tmp < 0 ) tmp = val*2;
1267- if( tmp < 256 )
1268- return x264_ue_size_tab[tmp];
1269- else
1270- return x264_ue_size_tab[tmp>>8]+16;
1271-}
1272-
1273-static ALWAYS_INLINE int bs_size_te( int x, int val )
1274-{
1275- if( x == 1 )
1276- return 1;
1277- else //if( x > 1 )
1278- return x264_ue_size_tab[val+1];
1279-}
1280-
1281-#endif
1282diff --git a/common/common.c b/common/common.c
1283index fccf2b0..2458f65 100644
1284--- a/common/common.c
1285+++ b/common/common.c
1286@@ -1027,60 +1027,6 @@ void x264_picture_clean( x264_picture_t *pic )
1287 }
1288
1289 /****************************************************************************
1290- * x264_nal_encode:
1291- ****************************************************************************/
1292-int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode )
1293-{
1294- uint8_t *src = nal->p_payload;
1295- uint8_t *end = nal->p_payload + nal->i_payload;
1296- uint8_t *orig_dst = dst;
1297- int i_count = 0, size;
1298-
1299- if( b_annexb )
1300- {
1301- if( b_long_startcode )
1302- *dst++ = 0x00;
1303- *dst++ = 0x00;
1304- *dst++ = 0x00;
1305- *dst++ = 0x01;
1306- }
1307- else /* save room for size later */
1308- dst += 4;
1309-
1310- /* nal header */
1311- *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
1312-
1313- while( src < end )
1314- {
1315- if( i_count == 2 && *src <= 0x03 )
1316- {
1317- *dst++ = 0x03;
1318- i_count = 0;
1319- }
1320- if( *src == 0 )
1321- i_count++;
1322- else
1323- i_count = 0;
1324- *dst++ = *src++;
1325- }
1326- size = (dst - orig_dst) - 4;
1327-
1328- /* Write the size header for mp4/etc */
1329- if( !b_annexb )
1330- {
1331- /* Size doesn't include the size of the header we're writing now. */
1332- orig_dst[0] = size>>24;
1333- orig_dst[1] = size>>16;
1334- orig_dst[2] = size>> 8;
1335- orig_dst[3] = size>> 0;
1336- }
1337-
1338- return size+4;
1339-}
1340-
1341-
1342-
1343-/****************************************************************************
1344 * x264_malloc:
1345 ****************************************************************************/
1346 void *x264_malloc( int i_size )
1347diff --git a/common/common.h b/common/common.h
1348index 539ea65..93712fe 100644
1349--- a/common/common.h
1350+++ b/common/common.h
1351@@ -137,7 +137,7 @@ static const int x264_scan8[16+2*4+3] =
1352 */
1353
1354 #include "x264.h"
1355-#include "bs.h"
1356+#include "bitstream.h"
1357 #include "set.h"
1358 #include "predict.h"
1359 #include "pixel.h"
1360@@ -166,8 +166,6 @@ int64_t x264_mdate( void );
1361 * the encoding options */
1362 char *x264_param2string( x264_param_t *p, int b_res );
1363
1364-int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode );
1365-
1366 /* log */
1367 void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
1368
1369@@ -796,6 +794,7 @@ struct x264_t
1370 x264_zigzag_function_t zigzagf;
1371 x264_quant_function_t quantf;
1372 x264_deblock_function_t loopf;
1373+ x264_bitstream_function_t bsf;
1374
1375 #ifdef HAVE_VISUALIZE
1376 struct visualize_t *visualize;
1377diff --git a/common/x86/bitstream-a.asm b/common/x86/bitstream-a.asm
1378new file mode 100644
1379index 0000000..1fb4cea
1380--- /dev/null
1381+++ b/common/x86/bitstream-a.asm
1382@@ -0,0 +1,112 @@
1383+;*****************************************************************************
1384+;* bitstream-a.asm: h264 encoder library
1385+;*****************************************************************************
1386+;* Copyright (C) 2010 x264 project
1387+;*
1388+;* Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
1389+;*
1390+;* This program is free software; you can redistribute it and/or modify
1391+;* it under the terms of the GNU General Public License as published by
1392+;* the Free Software Foundation; either version 2 of the License, or
1393+;* (at your option) any later version.
1394+;*
1395+;* This program is distributed in the hope that it will be useful,
1396+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
1397+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1398+;* GNU General Public License for more details.
1399+;*
1400+;* You should have received a copy of the GNU General Public License
1401+;* along with this program; if not, write to the Free Software
1402+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
1403+;*****************************************************************************
1404+
1405+%include "x86inc.asm"
1406+%include "x86util.asm"
1407+
1408+SECTION .text
1409+
1410+;-----------------------------------------------------------------------------
1411+; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
1412+;-----------------------------------------------------------------------------
1413+
1414+%macro NAL_LOOP 2
1415+ALIGN 16
1416+%1:
1417+ mova m0, [r1+r2]
1418+ mova m1, m0
1419+%if mmsize == 8
1420+ psrlq m0, 8
1421+%else
1422+ psrldq m0, 1
1423+%endif
1424+ %2 [r0+r1], m1
1425+ por m1, m0
1426+ pcmpeqb m1, m2
1427+ pmovmskb r3d, m1
1428+ test r3d, r3d
1429+ jnz .escape
1430+ add r1, mmsize
1431+ jl %1
1432+%endmacro
1433+
1434+%macro NAL_ESCAPE 1
1435+
1436+cglobal nal_escape_%1, 3,5
1437+ pxor m2, m2
1438+ sub r1, r2 ; r1 = offset of current src pointer from end of src
1439+ sub r0, r1 ; r0 = projected end of dst, assuming no more escapes
1440+
1441+ mov r3w, [r1+r2]
1442+ mov [r0+r1], r3w
1443+ add r1, 2
1444+ jge .ret
1445+
1446+ ; Start off by jumping into the escape loop in
1447+ ; case there's an escape at the start.
1448+ ; And do a few more in scalar until src is aligned again.
1449+ lea r4d, [r1+r2]
1450+ or r4d, -mmsize
1451+ neg r4d
1452+ jmp .escapeloop
1453+
1454+ NAL_LOOP .loop_aligned, mova
1455+%if mmsize==16
1456+ NAL_LOOP .loop_unaligned, movu
1457+%endif
1458+
1459+.ret:
1460+ movifnidn rax, r0
1461+ RET
1462+ALIGN 16
1463+.escape:
1464+ mov r4d, mmsize
1465+.escapeloop:
1466+ mov r3b, [r1+r2]
1467+ cmp r3b, 3
1468+ jna .escape_check
1469+.copy:
1470+ mov [r0+r1], r3b
1471+ inc r1
1472+ jge .ret
1473+ dec r4d
1474+ jg .escapeloop
1475+ cmp byte [r1+r2-1], 0 ; Don't go back to the main loop until we're out of a zero-run.
1476+ jz .escape
1477+%if mmsize==16
1478+ lea r4d, [r0+r1]
1479+ test r4d, mmsize-1
1480+ jnz .loop_unaligned
1481+%endif
1482+ jmp .loop_aligned
1483+.escape_check:
1484+ cmp word [r0+r1-2], 0
1485+ jnz .copy
1486+ mov byte [r0+r1], 3
1487+ inc r0
1488+ jmp .copy
1489+%endmacro
1490+
1491+INIT_MMX
1492+NAL_ESCAPE mmxext
1493+INIT_XMM
1494+NAL_ESCAPE sse2
1495diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
1496index aedd688..3a31e26 100644
1497--- a/common/x86/deblock-a.asm
1498+++ b/common/x86/deblock-a.asm
1499@@ -4,6 +4,7 @@
1500 ;* Copyright (C) 2005-2008 x264 project
1501 ;*
1502 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
1503+;* Jason Garrett-Glaser <darkshikari@gmail.com>
1504 ;*
1505 ;* This program is free software; you can redistribute it and/or modify
1506 ;* it under the terms of the GNU General Public License as published by
1507diff --git a/encoder/encoder.c b/encoder/encoder.c
1508index 6e0dc54..32db82a 100644
1509--- a/encoder/encoder.c
1510+++ b/encoder/encoder.c
1511@@ -986,6 +986,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
1512 x264_mc_init( h->param.cpu, &h->mc );
1513 x264_quant_init( h, h->param.cpu, &h->quantf );
1514 x264_deblock_init( h->param.cpu, &h->loopf );
1515+ x264_bitstream_init( h->param.cpu, &h->bsf );
1516 x264_dct_init_weights();
1517
1518 mbcmp_init( h );
1519@@ -1272,7 +1273,7 @@ static int x264_encoder_encapsulate_nals( x264_t *h, int start )
1520 for( int i = start; i < h->out.i_nal; i++ )
1521 {
1522 int long_startcode = !i || h->out.nal[i].i_type == NAL_SPS || h->out.nal[i].i_type == NAL_PPS;
1523- int size = x264_nal_encode( nal_buffer, &h->out.nal[i], h->param.b_annexb, long_startcode );
1524+ int size = x264_nal_encode( h, nal_buffer, &h->out.nal[i], long_startcode );
1525 h->out.nal[i].i_payload = size;
1526 h->out.nal[i].p_payload = nal_buffer;
1527 nal_buffer += size;
1528diff --git a/tools/checkasm.c b/tools/checkasm.c
1529index a0a9d54..ea6f209 100644
1530--- a/tools/checkasm.c
1531+++ b/tools/checkasm.c
1532@@ -1661,6 +1661,55 @@ static int check_cabac( int cpu_ref, int cpu_new )
1533 return ret;
1534 }
1535
1536+static int check_bitstream( int cpu_ref, int cpu_new )
1537+{
1538+ x264_bitstream_function_t bs_c;
1539+ x264_bitstream_function_t bs_ref;
1540+ x264_bitstream_function_t bs_a;
1541+
1542+ int ret = 0, ok = 1, used_asm = 0;
1543+
1544+ x264_bitstream_init( 0, &bs_c );
1545+ x264_bitstream_init( cpu_ref, &bs_ref );
1546+ x264_bitstream_init( cpu_new, &bs_a );
1547+ if( bs_a.nal_escape != bs_ref.nal_escape )
1548+ {
1549+ int size = 0x4000;
1550+ uint8_t *input = malloc(size+100);
1551+ uint8_t *output1 = malloc(size*2);
1552+ uint8_t *output2 = malloc(size*2);
1553+ used_asm = 1;
1554+ set_func_name( "nal_escape" );
1555+ for( int i = 0; i < 100; i++ )
1556+ {
1557+ /* Test corner-case sizes */
1558+ int test_size = i < 10 ? i+1 : rand() & 0x3fff;
1559+ for( int j = 0; j < test_size; j++ )
1560+ input[j] = (rand()&1) * rand();
1561+ uint8_t *end_c = (uint8_t*)call_c1( bs_c.nal_escape, output1, input, input+test_size );
1562+ uint8_t *end_a = (uint8_t*)call_a1( bs_a.nal_escape, output2, input, input+test_size );
1563+ int size_c = end_c-output1;
1564+ int size_a = end_a-output2;
1565+ if( size_c != size_a || memcmp( output1, output2, size_c ) )
1566+ {
1567+ fprintf( stderr, "nal_escape : [FAILED] %d %d\n", size_c, size_a );
1568+ ok = 0;
1569+ break;
1570+ }
1571+ }
1572+ for( int j = 0; j < size; j++ )
1573+ input[j] = rand();
1574+ call_c2( bs_c.nal_escape, output1, input, input+size );
1575+ call_a2( bs_a.nal_escape, output2, input, input+size );
1576+ free(input);
1577+ free(output1);
1578+ free(output2);
1579+ }
1580+ report( "nal escape:" );
1581+
1582+ return ret;
1583+}
1584+
1585 static int check_all_funcs( int cpu_ref, int cpu_new )
1586 {
1587 return check_pixel( cpu_ref, cpu_new )
1588@@ -1669,7 +1718,8 @@ static int check_all_funcs( int cpu_ref, int cpu_new )
1589 + check_intra( cpu_ref, cpu_new )
1590 + check_deblock( cpu_ref, cpu_new )
1591 + check_quant( cpu_ref, cpu_new )
1592- + check_cabac( cpu_ref, cpu_new );
1593+ + check_cabac( cpu_ref, cpu_new )
1594+ + check_bitstream( cpu_ref, cpu_new );
1595 }
1596
1597 static int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
1598--
15991.7.0.4
1600
1601
1602From 790c0bcb4d96894969ab3dab6df670eafcbbcd85 Mon Sep 17 00:00:00 2001
1603From: Jason Garrett-Glaser <darkshikari@gmail.com>
1604Date: Fri, 28 May 2010 14:30:07 -0700
1605Subject: [PATCH 07/10] Re-enable i8x8 merged SATD
1606 Accidentally got disabled when intra_sad_x3 was added.
1607
1608---
1609 encoder/encoder.c | 1 +
1610 1 files changed, 1 insertions(+), 0 deletions(-)
1611
1612diff --git a/encoder/encoder.c b/encoder/encoder.c
1613index 32db82a..2f9e7f6 100644
1614--- a/encoder/encoder.c
1615+++ b/encoder/encoder.c
1616@@ -810,6 +810,7 @@ static void mbcmp_init( x264_t *h )
1617 memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
1618 h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
1619 h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c;
1620+ h->pixf.intra_mbcmp_x3_8x8 = satd ? h->pixf.intra_sa8d_x3_8x8 : h->pixf.intra_sad_x3_8x8;
1621 h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4;
1622 satd &= h->param.analyse.i_me_method == X264_ME_TESA;
1623 memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
1624--
16251.7.0.4
1626
1627
1628From 6e549ed124a0a84d77c51baa39984fb36ab49123 Mon Sep 17 00:00:00 2001
1629From: Jason Garrett-Glaser <darkshikari@gmail.com>
1630Date: Fri, 28 May 2010 14:27:22 -0700
1631Subject: [PATCH 08/10] Add API tool to apply arbitrary quantizer offsets
1632 The calling application can now pass a "map" of quantizer offsets to apply to each frame.
1633 An optional callback to free the map can also be included.
1634 This allows all kinds of flexible region-of-interest coding and similar.
1635
1636---
1637 common/common.c | 2 +-
1638 encoder/encoder.c | 7 +++++--
1639 encoder/ratecontrol.c | 36 +++++++++++++++++++++++++-----------
1640 encoder/ratecontrol.h | 4 ++--
1641 x264.h | 20 +++++++++++++++++++-
1642 5 files changed, 52 insertions(+), 17 deletions(-)
1643
1644diff --git a/common/common.c b/common/common.c
1645index 2458f65..48e1bbc 100644
1646--- a/common/common.c
1647+++ b/common/common.c
1648@@ -998,6 +998,7 @@ static void x264_log_default( void *p_unused, int i_level, const char *psz_fmt,
1649 ****************************************************************************/
1650 int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
1651 {
1652+ memset( pic, 0, sizeof( x264_picture_t ) );
1653 pic->i_type = X264_TYPE_AUTO;
1654 pic->i_qpplus1 = 0;
1655 pic->img.i_csp = i_csp;
1656@@ -1010,7 +1011,6 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh
1657 pic->img.i_stride[0] = i_width;
1658 pic->img.i_stride[1] = i_width / 2;
1659 pic->img.i_stride[2] = i_width / 2;
1660- pic->param = NULL;
1661 pic->i_pic_struct = PIC_STRUCT_AUTO;
1662 return 0;
1663 }
1664diff --git a/encoder/encoder.c b/encoder/encoder.c
1665index 2f9e7f6..89107a3 100644
1666--- a/encoder/encoder.c
1667+++ b/encoder/encoder.c
1668@@ -2250,11 +2250,14 @@ int x264_encoder_encode( x264_t *h,
1669
1670 if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
1671 {
1672- if( x264_macroblock_tree_read( h, fenc ) )
1673+ if( x264_macroblock_tree_read( h, fenc, pic_in->prop.quant_offsets ) )
1674 return -1;
1675 }
1676 else
1677- x264_adaptive_quant_frame( h, fenc );
1678+ x264_adaptive_quant_frame( h, fenc, pic_in->prop.quant_offsets );
1679+
1680+ if( pic_in->prop.quant_offsets_free )
1681+ pic_in->prop.quant_offsets_free( pic_in->prop.quant_offsets );
1682
1683 if( h->frames.b_have_lowres )
1684 x264_frame_init_lowres( h, fenc );
1685diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
1686index bf0a400..d09de98 100644
1687--- a/encoder/ratecontrol.c
1688+++ b/encoder/ratecontrol.c
1689@@ -235,7 +235,7 @@ static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_
1690 return var;
1691 }
1692
1693-void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
1694+void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets )
1695 {
1696 /* constants chosen to result in approximately the same overall bitrate as without AQ.
1697 * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
1698@@ -256,11 +256,22 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
1699 /* Need to init it anyways for MB tree */
1700 if( h->param.rc.f_aq_strength == 0 )
1701 {
1702- memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
1703- memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
1704- if( h->frames.b_have_lowres )
1705+ if( quant_offsets )
1706+ {
1707 for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
1708- frame->i_inv_qscale_factor[mb_xy] = 256;
1709+ frame->f_qp_offset[mb_xy] = frame->f_qp_offset_aq[mb_xy] = quant_offsets[mb_xy];
1710+ if( h->frames.b_have_lowres )
1711+ for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
1712+ frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8( frame->f_qp_offset[mb_xy] );
1713+ }
1714+ else
1715+ {
1716+ memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
1717+ memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
1718+ if( h->frames.b_have_lowres )
1719+ for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
1720+ frame->i_inv_qscale_factor[mb_xy] = 256;
1721+ }
1722 }
1723 /* Need variance data for weighted prediction */
1724 if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
1725@@ -299,9 +310,10 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
1726 for( int mb_x = 0; mb_x < width; mb_x++ )
1727 {
1728 float qp_adj;
1729+ int mb_xy = mb_x + mb_y*h->mb.i_mb_stride;
1730 if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
1731 {
1732- qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
1733+ qp_adj = frame->f_qp_offset[mb_xy];
1734 qp_adj = strength * (qp_adj - avg_adj);
1735 }
1736 else
1737@@ -309,10 +321,12 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
1738 uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
1739 qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
1740 }
1741- frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
1742- frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
1743+ if( quant_offsets )
1744+ qp_adj += quant_offsets[mb_xy];
1745+ frame->f_qp_offset[mb_xy] =
1746+ frame->f_qp_offset_aq[mb_xy] = qp_adj;
1747 if( h->frames.b_have_lowres )
1748- frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
1749+ frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8(qp_adj);
1750 }
1751 }
1752
1753@@ -327,7 +341,7 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
1754 }
1755 }
1756
1757-int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
1758+int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets )
1759 {
1760 x264_ratecontrol_t *rc = h->rc;
1761 uint8_t i_type_actual = rc->entry[frame->i_frame].pict_type;
1762@@ -363,7 +377,7 @@ int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
1763 rc->qpbuf_pos--;
1764 }
1765 else
1766- x264_adaptive_quant_frame( h, frame );
1767+ x264_adaptive_quant_frame( h, frame, quant_offsets );
1768 return 0;
1769 fail:
1770 x264_log(h, X264_LOG_ERROR, "Incomplete MB-tree stats file.\n");
1771diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h
1772index e052b2a..dd139eb 100644
1773--- a/encoder/ratecontrol.h
1774+++ b/encoder/ratecontrol.h
1775@@ -29,8 +29,8 @@ void x264_ratecontrol_delete( x264_t * );
1776
1777 void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init );
1778
1779-void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame );
1780-int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame );
1781+void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets );
1782+int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets );
1783 int x264_reference_build_list_optimal( x264_t *h );
1784 void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
1785 void x264_ratecontrol_start( x264_t *, int i_force_qp, int overhead );
1786diff --git a/x264.h b/x264.h
1787index 95efd88..a4b3400 100644
1788--- a/x264.h
1789+++ b/x264.h
1790@@ -35,7 +35,7 @@
1791
1792 #include <stdarg.h>
1793
1794-#define X264_BUILD 96
1795+#define X264_BUILD 97
1796
1797 /* x264_t:
1798 * opaque handler for encoder */
1799@@ -508,6 +508,22 @@ typedef struct
1800
1801 typedef struct
1802 {
1803+ /* In: an array of quantizer offsets to be applied to this image during encoding.
1804+ * These are added on top of the decisions made by x264.
1805+ * Offsets can be fractional; they are added before QPs are rounded to integer.
1806+ * Adaptive quantization must be enabled to use this feature. Behavior if quant
1807+ * offsets differ between encoding passes is undefined.
1808+ *
1809+ * Array contains one offset per macroblock, in raster scan order. In interlaced
1810+ * mode, top-field MBs and bottom-field MBs are interleaved at the row level. */
1811+ float *quant_offsets;
1812+ /* In: optional callback to free quant_offsets when used.
1813+ * Useful if one wants to use a different quant_offset array for each frame. */
1814+ void (*quant_offsets_free)( void* );
1815+} x264_image_properties_t;
1816+
1817+typedef struct
1818+{
1819 /* In: force picture type (if not auto)
1820 * If x264 encoding parameters are violated in the forcing of picture types,
1821 * x264 will correct the input picture type and log a warning.
1822@@ -537,6 +553,8 @@ typedef struct
1823 x264_param_t *param;
1824 /* In: raw data */
1825 x264_image_t img;
1826+ /* In: optional information to modify encoder decisions for this frame */
1827+ x264_image_properties_t prop;
1828 /* Out: HRD timing information. Output only when i_nal_hrd is set. */
1829 x264_hrd_t hrd_timing;
1830 /* private user data. libx264 doesn't touch this,
1831--
18321.7.0.4
1833
1834
1835From ef05902684b7f2fdfcb07b900740b61248a097e1 Mon Sep 17 00:00:00 2001
1836From: Henrik Gramner <hengar-6@student.ltu.se>
1837Date: Thu, 27 May 2010 22:18:38 +0200
1838Subject: [PATCH 09/10] Optimize out some x264_scan8 reads
1839
1840---
1841 encoder/analyse.c | 15 ++++-----
1842 encoder/macroblock.c | 82 ++++++++++++++++++++++++++++++--------------------
1843 encoder/me.c | 25 ++++++++-------
1844 3 files changed, 70 insertions(+), 52 deletions(-)
1845
1846diff --git a/encoder/analyse.c b/encoder/analyse.c
1847index a128a70..9e85e89 100644
1848--- a/encoder/analyse.c
1849+++ b/encoder/analyse.c
1850@@ -907,8 +907,6 @@ static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
1851 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
1852 {
1853 uint8_t *p_dst = h->mb.pic.p_fdec[0];
1854-
1855- int x, y;
1856 uint64_t i_satd, i_best;
1857 h->mb.i_skip_intra = 0;
1858
1859@@ -1031,8 +1029,9 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
1860 int i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1861
1862 i_best = COST_MAX64;
1863- x = idx&1;
1864- y = idx>>1;
1865+ int x = idx&1;
1866+ int y = idx>>1;
1867+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
1868
1869 p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1870 predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
1871@@ -1061,8 +1060,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
1872 if( !(idx&1) )
1873 for( int j = 0; j < 7; j++ )
1874 pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1875- i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
1876- i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
1877+ i_nnz[0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8] );
1878+ i_nnz[1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8] );
1879 }
1880 }
1881 a->i_cbp_i8x8_luma = cbp_luma_new;
1882@@ -1070,8 +1069,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
1883 if( !(idx&1) )
1884 for( int j = 0; j < 7; j++ )
1885 p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1886- M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
1887- M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
1888+ M16( &h->mb.cache.non_zero_count[s8 + 0*8] ) = i_nnz[0];
1889+ M16( &h->mb.cache.non_zero_count[s8 + 1*8] ) = i_nnz[1];
1890
1891 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1892 }
1893diff --git a/encoder/macroblock.c b/encoder/macroblock.c
1894index 984f8a8..cdc4563 100644
1895--- a/encoder/macroblock.c
1896+++ b/encoder/macroblock.c
1897@@ -135,11 +135,12 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
1898 }
1899 }
1900
1901-#define STORE_8x8_NNZ(idx,nz)\
1902+#define STORE_8x8_NNZ( s8, nz )\
1903+do\
1904 {\
1905- M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] ) = nz * 0x0101;\
1906- M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] ) = nz * 0x0101;\
1907-}
1908+ M16( &h->mb.cache.non_zero_count[(s8) + 0*8] ) = (nz) * 0x0101;\
1909+ M16( &h->mb.cache.non_zero_count[(s8) + 1*8] ) = (nz) * 0x0101;\
1910+} while(0)
1911
1912 #define CLEAR_16x16_NNZ \
1913 {\
1914@@ -151,17 +152,18 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
1915
1916 void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
1917 {
1918- int x = 8 * (idx&1);
1919- int y = 8 * (idx>>1);
1920+ int x = idx&1;
1921+ int y = idx>>1;
1922+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
1923 int nz;
1924- uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
1925- uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
1926+ uint8_t *p_src = &h->mb.pic.p_fenc[0][8*x + 8*y*FENC_STRIDE];
1927+ uint8_t *p_dst = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
1928 ALIGNED_ARRAY_16( int16_t, dct8x8,[64] );
1929
1930 if( h->mb.b_lossless )
1931 {
1932 nz = h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
1933- STORE_8x8_NNZ(idx,nz);
1934+ STORE_8x8_NNZ( s8, nz );
1935 h->mb.i_cbp_luma |= nz<<idx;
1936 return;
1937 }
1938@@ -175,10 +177,10 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
1939 h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
1940 h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
1941 h->dctf.add8x8_idct8( p_dst, dct8x8 );
1942- STORE_8x8_NNZ(idx,1);
1943+ STORE_8x8_NNZ( s8, 1 );
1944 }
1945 else
1946- STORE_8x8_NNZ(idx,0);
1947+ STORE_8x8_NNZ( s8, 0 );
1948 }
1949
1950 static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
1951@@ -728,12 +730,13 @@ void x264_macroblock_encode( x264_t *h )
1952 if( h->mb.b_transform_8x8 )
1953 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
1954 {
1955- int x = 8*(i8x8&1);
1956- int y = 8*(i8x8>>1);
1957- nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8],
1958- h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
1959- h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
1960- STORE_8x8_NNZ(i8x8,nz);
1961+ int x = i8x8&1;
1962+ int y = i8x8>>1;
1963+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
1964+
1965+ nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8], h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE,
1966+ h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE );
1967+ STORE_8x8_NNZ( s8, nz );
1968 h->mb.i_cbp_luma |= nz << i8x8;
1969 }
1970 else
1971@@ -783,14 +786,18 @@ void x264_macroblock_encode( x264_t *h )
1972 {
1973 for( int idx = 0; idx < 4; idx++ )
1974 {
1975+ int x = idx&1;
1976+ int y = idx>>1;
1977+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
1978+
1979 if( h->mb.i_cbp_luma&(1<<idx) )
1980 {
1981 h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
1982- h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*FDEC_STRIDE], dct8x8[idx] );
1983- STORE_8x8_NNZ(idx,1);
1984+ h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], dct8x8[idx] );
1985+ STORE_8x8_NNZ( s8, 1 );
1986 }
1987 else
1988- STORE_8x8_NNZ(idx,0);
1989+ STORE_8x8_NNZ( s8, 0 );
1990 }
1991 }
1992 }
1993@@ -825,18 +832,24 @@ void x264_macroblock_encode( x264_t *h )
1994 }
1995 }
1996
1997+ int x = i8x8&1;
1998+ int y = i8x8>>1;
1999+
2000 /* decimate this 8x8 block */
2001 i_decimate_mb += i_decimate_8x8;
2002 if( b_decimate )
2003 {
2004 if( i_decimate_8x8 < 4 )
2005- STORE_8x8_NNZ(i8x8,0)
2006+ {
2007+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
2008+ STORE_8x8_NNZ( s8, 0 );
2009+ }
2010 else
2011 h->mb.i_cbp_luma |= 1<<i8x8;
2012 }
2013 else if( cbp )
2014 {
2015- h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
2016+ h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], &dct4x4[i8x8*4] );
2017 h->mb.i_cbp_luma |= 1<<i8x8;
2018 }
2019 }
2020@@ -1045,8 +1058,11 @@ void x264_noise_reduction_update( x264_t *h )
2021 void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
2022 {
2023 int i_qp = h->mb.i_qp;
2024- uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE;
2025- uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
2026+ int x = i8&1;
2027+ int y = i8>>1;
2028+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
2029+ uint8_t *p_fenc = h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE;
2030+ uint8_t *p_fdec = h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE;
2031 int b_decimate = h->mb.b_dct_decimate;
2032 int nnz8x8 = 0;
2033 int nz;
2034@@ -1059,7 +1075,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
2035 if( h->mb.b_transform_8x8 )
2036 {
2037 nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
2038- STORE_8x8_NNZ(i8,nnz8x8);
2039+ STORE_8x8_NNZ( s8, nnz8x8 );
2040 }
2041 else
2042 {
2043@@ -1075,8 +1091,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
2044 for( int ch = 0; ch < 2; ch++ )
2045 {
2046 int16_t dc;
2047- p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
2048- p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
2049+ p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
2050+ p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
2051 nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec, &dc );
2052 h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
2053 }
2054@@ -1099,13 +1115,13 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
2055 {
2056 h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
2057 h->dctf.add8x8_idct8( p_fdec, dct8x8 );
2058- STORE_8x8_NNZ(i8,1);
2059+ STORE_8x8_NNZ( s8, 1 );
2060 }
2061 else
2062- STORE_8x8_NNZ(i8,0);
2063+ STORE_8x8_NNZ( s8, 0 );
2064 }
2065 else
2066- STORE_8x8_NNZ(i8,0);
2067+ STORE_8x8_NNZ( s8, 0 );
2068 }
2069 else
2070 {
2071@@ -1132,7 +1148,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
2072 if( nnz8x8 )
2073 h->dctf.add8x8_idct( p_fdec, dct4x4 );
2074 else
2075- STORE_8x8_NNZ(i8,0);
2076+ STORE_8x8_NNZ( s8, 0 );
2077 }
2078
2079 i_qp = h->mb.i_chroma_qp;
2080@@ -1140,8 +1156,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
2081 for( int ch = 0; ch < 2; ch++ )
2082 {
2083 ALIGNED_ARRAY_16( int16_t, dct4x4,[16] );
2084- p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
2085- p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
2086+ p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
2087+ p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
2088
2089 h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
2090 dct4x4[0] = 0;
2091diff --git a/encoder/me.c b/encoder/me.c
2092index 77073cc..40d0650 100644
2093--- a/encoder/me.c
2094+++ b/encoder/me.c
2095@@ -937,8 +937,11 @@ int x264_iter_kludge = 0;
2096
2097 static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2, int rd )
2098 {
2099- int16_t *cache0_mv = h->mb.cache.mv[0][x264_scan8[i8*4]];
2100- int16_t *cache1_mv = h->mb.cache.mv[1][x264_scan8[i8*4]];
2101+ int x = i8&1;
2102+ int y = i8>>1;
2103+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
2104+ int16_t *cache0_mv = h->mb.cache.mv[0][s8];
2105+ int16_t *cache1_mv = h->mb.cache.mv[1][s8];
2106 const int i_pixel = m0->i_pixel;
2107 const int bw = x264_pixel_size[i_pixel].w;
2108 const int bh = x264_pixel_size[i_pixel].h;
2109@@ -946,11 +949,11 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
2110 ALIGNED_ARRAY_8( uint8_t, pixu_buf,[2],[9][8*8] );
2111 ALIGNED_ARRAY_8( uint8_t, pixv_buf,[2],[9][8*8] );
2112 uint8_t *src[2][9];
2113- uint8_t *pix = &h->mb.pic.p_fdec[0][(i8>>1)*8*FDEC_STRIDE+(i8&1)*8];
2114- uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
2115- uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
2116- const int ref0 = h->mb.cache.ref[0][x264_scan8[i8*4]];
2117- const int ref1 = h->mb.cache.ref[1][x264_scan8[i8*4]];
2118+ uint8_t *pix = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
2119+ uint8_t *pixu = &h->mb.pic.p_fdec[1][4*x + 4*y*FDEC_STRIDE];
2120+ uint8_t *pixv = &h->mb.pic.p_fdec[2][4*x + 4*y*FDEC_STRIDE];
2121+ int ref0 = h->mb.cache.ref[0][s8];
2122+ int ref1 = h->mb.cache.ref[1][s8];
2123 const int mv0y_offset = h->mb.b_interlaced & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
2124 const int mv1y_offset = h->mb.b_interlaced & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
2125 int stride[2][9];
2126@@ -1058,13 +1061,13 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
2127
2128 if( rd )
2129 {
2130- x264_macroblock_cache_mv ( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) );
2131+ x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) );
2132 amvd = pack8to16( X264_MIN(abs(bm0x - m0->mvp[0]),33), X264_MIN(abs(bm0y - m0->mvp[1]),33) );
2133- x264_macroblock_cache_mvd( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 0, amvd );
2134+ x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 0, amvd );
2135
2136- x264_macroblock_cache_mv ( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) );
2137+ x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) );
2138 amvd = pack8to16( X264_MIN(abs(bm1x - m1->mvp[0]),33), X264_MIN(abs(bm1y - m1->mvp[1]),33) );
2139- x264_macroblock_cache_mvd( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 1, amvd );
2140+ x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 1, amvd );
2141 }
2142
2143 m0->mv[0] = bm0x;
2144--
21451.7.0.4
2146
2147
2148From c949405e834a2cbe35f3fb460eae061447dc386b Mon Sep 17 00:00:00 2001
2149From: Henrik Gramner <hengar-6@student.ltu.se>
2150Date: Sun, 30 May 2010 22:45:14 +0200
2151Subject: [PATCH 10/10] Some deblocking-related optimizations
2152
2153---
2154 common/deblock.c | 8 ++++----
2155 common/macroblock.c | 43 +++++++++++++++++++++++--------------------
2156 2 files changed, 27 insertions(+), 24 deletions(-)
2157
2158diff --git a/common/deblock.c b/common/deblock.c
2159index 27c73ae..3296dbf 100644
2160--- a/common/deblock.c
2161+++ b/common/deblock.c
2162@@ -299,7 +299,7 @@ static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2,
2163 void x264_frame_deblock_row( x264_t *h, int mb_y )
2164 {
2165 int b_interlaced = h->sh.b_mbaff;
2166- int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
2167+ int qp_thresh = 15 - X264_MIN( h->sh.i_alpha_c0_offset, h->sh.i_beta_offset ) - X264_MAX( 0, h->param.analyse.i_chroma_qp_offset );
2168 int stridey = h->fdec->i_stride[0];
2169 int stride2y = stridey << b_interlaced;
2170 int strideuv = h->fdec->i_stride[1];
2171@@ -318,7 +318,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
2172 uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
2173 uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
2174 uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
2175- if( b_interlaced && (mb_y&1) )
2176+ if( mb_y & b_interlaced )
2177 {
2178 pixy -= 15*stridey;
2179 pixu -= 7*strideuv;
2180@@ -366,12 +366,12 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
2181 int qp_top = (qp + qpt + 1) >> 1;
2182 int qpc_top = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpt] + 1) >> 1;
2183 int intra_top = IS_INTRA( h->mb.type[h->mb.i_mb_top_xy] );
2184- if( !b_interlaced && (intra_cur || intra_top) )
2185+ if( ~b_interlaced & (intra_cur | intra_top) )
2186 FILTER( _intra, 1, 0, qp_top, qpc_top );
2187 else
2188 {
2189 if( intra_top )
2190- memset( bs[1][0], 3, sizeof(bs[1][0]) );
2191+ M32( bs[1][0] ) = 0x03030303;
2192 FILTER( , 1, 0, qp_top, qpc_top );
2193 }
2194 }
2195diff --git a/common/macroblock.c b/common/macroblock.c
2196index 1b2d37b..7180e8f 100644
2197--- a/common/macroblock.c
2198+++ b/common/macroblock.c
2199@@ -400,9 +400,27 @@ void x264_macroblock_slice_init( x264_t *h )
2200 }
2201 }
2202 }
2203- if( h->sh.i_type == SLICE_TYPE_P )
2204+ else if( h->sh.i_type == SLICE_TYPE_P )
2205+ {
2206 memset( h->mb.cache.skip, 0, sizeof( h->mb.cache.skip ) );
2207
2208+ if( h->sh.i_disable_deblocking_filter_idc != 1 && h->param.analyse.i_weighted_pred )
2209+ {
2210+ deblock_ref_table(-2) = -2;
2211+ deblock_ref_table(-1) = -1;
2212+ for( int i = 0; i < h->i_ref0 << h->sh.b_mbaff; i++ )
2213+ {
2214+ /* Mask off high bits to avoid frame num collisions with -1/-2.
2215+ * In current x264 frame num values don't cover a range of more
2216+ * than 32, so 6 bits is enough for uniqueness. */
2217+ if( !h->mb.b_interlaced )
2218+ deblock_ref_table(i) = h->fref0[i]->i_frame_num&63;
2219+ else
2220+ deblock_ref_table(i) = ((h->fref0[i>>1]->i_frame_num&63)<<1) + (i&1);
2221+ }
2222+ }
2223+ }
2224+
2225 /* init with not available (for top right idx=7,15) */
2226 memset( h->mb.cache.ref, -2, sizeof( h->mb.cache.ref ) );
2227
2228@@ -418,19 +436,6 @@ void x264_macroblock_slice_init( x264_t *h )
2229 h->fdec->inv_ref_poc[field] = (256 + delta/2) / delta;
2230 }
2231
2232- deblock_ref_table(-2) = -2;
2233- deblock_ref_table(-1) = -1;
2234- for( int i = 0; i < h->i_ref0 << h->sh.b_mbaff; i++ )
2235- {
2236- /* Mask off high bits to avoid frame num collisions with -1/-2.
2237- * In current x264 frame num values don't cover a range of more
2238- * than 32, so 6 bits is enough for uniqueness. */
2239- if( !h->mb.b_interlaced )
2240- deblock_ref_table(i) = h->fref0[i]->i_frame_num&63;
2241- else
2242- deblock_ref_table(i) = ((h->fref0[i>>1]->i_frame_num&63)<<1) + (i&1);
2243- }
2244-
2245 h->mb.i_neighbour4[6] =
2246 h->mb.i_neighbour4[9] =
2247 h->mb.i_neighbour4[12] =
2248@@ -894,7 +899,6 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
2249 void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
2250 {
2251 int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
2252- int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
2253
2254 h->mb.i_neighbour = 0;
2255 h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
2256@@ -906,9 +910,9 @@ void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_
2257 h->mb.i_neighbour |= MB_LEFT;
2258 }
2259
2260- if( top >= 0 )
2261+ if( mb_y > h->mb.b_interlaced )
2262 {
2263- h->mb.i_mb_top_xy = top;
2264+ h->mb.i_mb_top_xy = h->mb.i_mb_xy - (h->mb.i_mb_stride << h->mb.b_interlaced);
2265 if( deblock_on_slice_edges || h->mb.slice_table[h->mb.i_mb_top_xy] == h->mb.slice_table[h->mb.i_mb_xy] )
2266 h->mb.i_neighbour |= MB_TOP;
2267 }
2268@@ -930,8 +934,6 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
2269 h->mb.i_neighbour &= ~old_neighbour;
2270 if( h->mb.i_neighbour )
2271 {
2272- int left = h->mb.i_mb_left_xy;
2273- int top = h->mb.i_mb_top_xy;
2274 int top_y = mb_y - (1 << h->mb.b_interlaced);
2275 int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
2276 int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
2277@@ -941,10 +943,11 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
2278 uint8_t (*nnz)[24] = h->mb.non_zero_count;
2279
2280 if( h->mb.i_neighbour & MB_TOP )
2281- CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[top][12] );
2282+ CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[h->mb.i_mb_top_xy][12] );
2283
2284 if( h->mb.i_neighbour & MB_LEFT )
2285 {
2286+ int left = h->mb.i_mb_left_xy;
2287 h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
2288 h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
2289 h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
2290--
22911.7.0.4