# HG changeset patch
# User Steve Borho <steve@borho.org>
# Date 1345520003 18000
# Node ID e5e4a79d3f21a849f65a059ab5b42332a7b4a672
# Parent bdffc2c1e85bd6cfbef707910bd7a14d844c95c9
OpenCL Lookahead

8diff -r bdffc2c1e85b -r e5e4a79d3f21 Makefile
9--- a/Makefile Wed Jul 18 08:33:41 2012 -0700
10+++ b/Makefile Mon Aug 20 22:33:23 2012 -0500
11@@ -8,6 +8,8 @@
12 vpath %.asm $(SRCPATH)
13 vpath %.rc $(SRCPATH)
14
15+GENERATED =
16+
17 all: default
18 default:
19
20@@ -145,6 +147,37 @@
21 endif
22 endif
23
24+QUOTED_CFLAGS := $(CFLAGS)
25+
26+ifeq ($(HAVE_OPENCL),yes)
27+empty:=
28+space:=$(empty) $(empty)
29+escaped:=\ $(empty)
30+open:=(
31+escopen:=\(
32+close:=)
33+escclose:=\)
34+SAFE_INC_DIR := $(subst $(space),$(escaped),$(OPENCL_INC_DIR))
35+SAFE_INC_DIR := $(subst $(open),$(escopen),$(SAFE_INC_DIR))
36+SAFE_INC_DIR := $(subst $(close),$(escclose),$(SAFE_INC_DIR))
37+SAFE_LIB_DIR := $(subst $(space),$(escaped),$(OPENCL_LIB_DIR))
38+SAFE_LIB_DIR := $(subst $(open),$(escopen),$(SAFE_LIB_DIR))
39+SAFE_LIB_DIR := $(subst $(close),$(escclose),$(SAFE_LIB_DIR))
40+# For normal CFLAGS and LDFLAGS, we must escape spaces with a backslash to
41+# make gcc happy
42+CFLAGS += -I$(SAFE_INC_DIR) -DCL_USE_DEPRECATED_OPENCL_1_1_APIS
43+LDFLAGS += -l$(OPENCL_LIB) -L$(SAFE_LIB_DIR)
44+# For the CFLAGS used by the .depend rule, we must add quotes because
45+# the rule does an extra level of shell expansions
46+QUOTED_CFLAGS += -I"$(OPENCL_INC_DIR)" -DCL_USE_DEPRECATED_OPENCL_1_1_APIS
47+common/oclobj.h: common/opencl/x264-cl.h $(wildcard common/opencl/*.cl)
48+ echo "static const char x264_opencl_source [] = {" > $@
49+ cat $^ | xxd -i >> $@
50+ echo ",0x00 };" >> $@
51+GENERATED += common/oclobj.h
52+SRCS += common/opencl.c encoder/slicetype-cl.c
53+endif
54+
55 OBJS += $(SRCS:%.c=%.o)
56 OBJCLI += $(SRCCLI:%.c=%.o)
57 OBJSO += $(SRCSO:%.c=%.o)
58@@ -164,17 +197,22 @@
59 $(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
60
61 ifneq ($(EXE),)
62-.PHONY: x264 checkasm
63+.PHONY: x264 checkasm test-opencl
64 x264: x264$(EXE)
65 checkasm: checkasm$(EXE)
66+test-opencl: test-opencl$(EXE)
67 endif
68
69-x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
70+x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
71 $(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
72
73-checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
74+checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBX264)
75 $(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
76
77+OBJOCL = tools/test-opencl.o
78+test-opencl$(EXE): .depend ${OBJOCL} $(LIBX264)
79+ $(LD)$@ $(OBJOCL) $(LIBX264) $(LDFLAGS)
80+
81 $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend
82
83 %.o: %.asm
84@@ -193,7 +231,7 @@
85
86 .depend: config.mak
87 @rm -f .depend
88- @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
89+ @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(QUOTED_CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
90
91 config.mak:
92 ./configure
93@@ -231,7 +269,7 @@
94
95 clean:
96 rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
97- rm -f checkasm checkasm.exe $(OBJCHK)
98+ rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) x264_lookahead.clbin
99 rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
100
101 distclean: clean
102diff -r bdffc2c1e85b -r e5e4a79d3f21 common/common.c
103--- a/common/common.c Wed Jul 18 08:33:41 2012 -0700
104+++ b/common/common.c Mon Aug 20 22:33:23 2012 -0500
105@@ -171,6 +171,10 @@
106 param->b_pic_struct = 0;
107 param->b_fake_interlaced = 0;
108 param->i_frame_packing = -1;
109+#if HAVE_OPENCL
110+ param->b_opencl = 0;
111+ param->psz_clbin_file = NULL;
112+#endif
113 }
114
115 static int x264_param_apply_preset( x264_param_t *param, const char *preset )
116@@ -1029,6 +1033,10 @@
117 p->b_fake_interlaced = atobool(value);
118 OPT("frame-packing")
119 p->i_frame_packing = atoi(value);
120+ OPT("opencl")
121+ p->b_opencl = atobool( value );
122+ OPT("clbin-file")
123+ p->psz_clbin_file = strdup( value );
124 else
125 return X264_PARAM_BAD_NAME;
126 #undef OPT
127diff -r bdffc2c1e85b -r e5e4a79d3f21 common/common.h
128--- a/common/common.h Wed Jul 18 08:33:41 2012 -0700
129+++ b/common/common.h Mon Aug 20 22:33:23 2012 -0500
130@@ -93,6 +93,11 @@
131 #include <assert.h>
132 #include <limits.h>
133
134+#if HAVE_OPENCL
135+# include <CL/cl.h>
136+# include "opencl.h"
137+#endif
138+
139 #if HAVE_INTERLACED
140 # define MB_INTERLACED h->mb.b_interlaced
141 # define SLICE_MBAFF h->sh.b_mbaff
142@@ -947,8 +952,19 @@
143 struct visualize_t *visualize;
144 #endif
145 x264_lookahead_t *lookahead;
146+
147+#if HAVE_OPENCL
148+ x264_opencl_t opencl;
149+#endif
150 };
151
152+#if HAVE_OPENCL
153+int x264_opencl_init( x264_t *h );
154+int x264_opencl_init_lookahead( x264_t *h );
155+void x264_opencl_free( x264_t *h );
156+void x264_opencl_frame_delete( x264_frame_t *frame );
157+#endif
158+
159 // included at the end because it needs x264_t
160 #include "macroblock.h"
161
162diff -r bdffc2c1e85b -r e5e4a79d3f21 common/frame.c
163--- a/common/frame.c Wed Jul 18 08:33:41 2012 -0700
164+++ b/common/frame.c Mon Aug 20 22:33:23 2012 -0500
165@@ -302,6 +302,9 @@
166 x264_free( frame->ref[1] );
167 x264_pthread_mutex_destroy( &frame->mutex );
168 x264_pthread_cond_destroy( &frame->cv );
169+#if HAVE_OPENCL
170+ x264_opencl_frame_delete( frame );
171+#endif
172 }
173 x264_free( frame );
174 }
175diff -r bdffc2c1e85b -r e5e4a79d3f21 common/frame.h
176--- a/common/frame.h Wed Jul 18 08:33:41 2012 -0700
177+++ b/common/frame.h Mon Aug 20 22:33:23 2012 -0500
178@@ -170,6 +170,10 @@
179 /* user frame properties */
180 uint8_t *mb_info;
181 void (*mb_info_free)( void* );
182+
183+#if HAVE_OPENCL
184+ x264_frame_opencl_t opencl;
185+#endif
186 } x264_frame_t;
187
188 /* synchronized frame list */
189diff -r bdffc2c1e85b -r e5e4a79d3f21 common/opencl.c
190--- /dev/null Thu Jan 01 00:00:00 1970 +0000
191+++ b/common/opencl.c Mon Aug 20 22:33:23 2012 -0500
192@@ -0,0 +1,456 @@
193+/*****************************************************************************
194+ * opencl.c: OpenCL initialization and kernel compilation
195+ *****************************************************************************
196+ * Copyright (C) 2012 x264 project
197+ *
198+ * Authors: Steve Borho <sborho@multicorewareinc.com>
199+ *
200+ * This program is free software; you can redistribute it and/or modify
201+ * it under the terms of the GNU General Public License as published by
202+ * the Free Software Foundation; either version 2 of the License, or
203+ * (at your option) any later version.
204+ *
205+ * This program is distributed in the hope that it will be useful,
206+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
207+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
208+ * GNU General Public License for more details.
209+ *
210+ * You should have received a copy of the GNU General Public License
211+ * along with this program; if not, write to the Free Software
212+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
213+ *
214+ * This program is also available under a commercial proprietary license.
215+ * For more information, contact us at licensing@x264.com.
216+ *****************************************************************************/
217+
218+#include "common.h"
219+
220+#if HAVE_OPENCL
221+
222+#include "oclobj.h"
223+
224+/* Try to load the cached compiled program binary, verify the device context is
225+ * still valid before reuse */
226+static cl_program x264_opencl_cache_load( x264_t *h, char *devname, char *devvendor, char *driverversion )
227+{
228+ cl_program program = NULL;
229+ cl_int status;
230+
231+ /* try to load cached program binary */
232+ FILE *fp = fopen( h->param.psz_clbin_file, "rb" );
233+ if( !fp )
234+ return NULL;
235+
236+ fseek( fp, 0L, SEEK_END );
237+ size_t size = ftell( fp );
238+ rewind( fp );
239+ uint8_t *binary = x264_malloc( size );
240+ if( !binary )
241+ goto fail;
242+
243+ fread( binary, 1, size, fp );
244+ const uint8_t *ptr = (const uint8_t*)binary;
245+
246+#define CHECK_STRING( STR )\
247+ do {\
248+ size_t len = strlen( STR );\
249+ if( size <= len || strncmp( (char*)ptr, STR, len ) )\
250+ goto fail;\
251+ else {\
252+ size -= (len+1); ptr += (len+1);\
253+ }\
254+ } while( 0 )
255+
256+ CHECK_STRING( devname );
257+ CHECK_STRING( devvendor );
258+ CHECK_STRING( driverversion );
259+#undef CHECK_STRING
260+
261+ program = clCreateProgramWithBinary( h->opencl.context, 1, &h->opencl.device, &size, &ptr, NULL, &status );
262+ if( status != CL_SUCCESS )
263+ program = NULL;
264+
265+fail:
266+ fclose( fp );
267+ x264_free( binary );
268+ return program;
269+}
270+
271+/* Save the compiled program binary to a file for later reuse. Device context
272+ * is also saved in the cache file so we do not reuse stale binaries */
273+static void x264_opencl_cache_save( x264_t *h, cl_program program, char *devname, char *devvendor, char *driverversion )
274+{
275+ FILE *fp = fopen( h->param.psz_clbin_file, "wb" );
276+ if( !fp )
277+ return;
278+
279+ size_t size;
280+ cl_int status = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL );
281+ if( status == CL_SUCCESS )
282+ {
283+ unsigned char *binary = x264_malloc( size );
284+ status = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &binary, NULL );
285+ if( status == CL_SUCCESS )
286+ {
287+ fwrite( devname, 1, strlen( devname ), fp );
288+ fwrite( "\n", 1, 1, fp );
289+ fwrite( devvendor, 1, strlen( devvendor ), fp );
290+ fwrite( "\n", 1, 1, fp );
291+ fwrite( driverversion, 1, strlen( driverversion ), fp );
292+ fwrite( "\n", 1, 1, fp );
293+ fwrite( binary, 1, size, fp );
294+ }
295+ x264_free( binary );
296+ }
297+ fclose( fp );
298+}
299+
300+static int x264_detect_AMD_GPU( cl_device_id device, int *b_is_SI )
301+{
302+ char extensions[512];
303+ char boardname[128];
304+ char devname[64];
305+
306+ *b_is_SI = 0;
307+
308+ cl_int status = clGetDeviceInfo( device, CL_DEVICE_EXTENSIONS, sizeof(extensions), extensions, NULL );
309+ if( status != CL_SUCCESS || !strstr( extensions, "cl_amd_media_ops" ) )
310+ return 0;
311+
312+ /* Detect whether GPU is a SouthernIsland based device */
313+#define CL_DEVICE_BOARD_NAME_AMD 0x4038
314+ status = clGetDeviceInfo( device, CL_DEVICE_BOARD_NAME_AMD, sizeof(boardname), boardname, NULL );
315+ if( status == CL_SUCCESS )
316+ {
317+ boardname[15] = boardname[16] = boardname[17] = 'X';
318+ *b_is_SI = !strcmp( boardname, "AMD Radeon HD 7XXX Series" );
319+ return 1;
320+ }
321+
322+ /* Fall back to checking the device name */
323+ status = clGetDeviceInfo( device, CL_DEVICE_NAME, sizeof(devname), devname, NULL );
324+ if( status != CL_SUCCESS )
325+ return 1;
326+
327+ const char *tahiti_names[] = { "Tahiti", "Pitcairn", "Capeverde", "Bali", NULL };
328+ for( int i = 0; tahiti_names[i]; i++ )
329+ if( !strcmp( devname, tahiti_names[i] ) )
330+ {
331+ *b_is_SI = 1;
332+ return 1;
333+ }
334+ return 1;
335+}
336+
337+/* The OpenCL source under common/opencl will be merged into encoder/oclobj.h by
338+ * the Makefile. It defines a x264_opencl_source byte array which we will pass
339+ * to clCreateProgramWithSource(). We also attempt to use a cache file for the
340+ * compiled binary, stored in the current working folder. */
341+static cl_program x264_opencl_compile( x264_t *h )
342+{
343+ cl_program program;
344+ cl_int status;
345+
346+ char devname[64];
347+ char devvendor[64];
348+ char driverversion[64];
349+ status = clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME, sizeof(devname), devname, NULL );
350+ status |= clGetDeviceInfo( h->opencl.device, CL_DEVICE_VENDOR, sizeof(devvendor), devvendor, NULL );
351+ status |= clGetDeviceInfo( h->opencl.device, CL_DRIVER_VERSION, sizeof(driverversion), driverversion, NULL );
352+ if( status != CL_SUCCESS )
353+ return NULL;
354+
355+ int b_isamd = x264_detect_AMD_GPU( h->opencl.device, &h->opencl.b_device_AMD_SI );
356+
357+ x264_log( h, X264_LOG_INFO, "OpenCL acceleration enabled with %s %s %s\n", devvendor, devname, h->opencl.b_device_AMD_SI ? "(SI)" : "" );
358+
359+ program = x264_opencl_cache_load( h, devname, devvendor, driverversion );
360+ if( !program )
361+ {
362+ /* clCreateProgramWithSource() requires a pointer variable, you cannot just use &x264_opencl_source */
363+ x264_log( h, X264_LOG_INFO, "Compiling OpenCL kernels...\n" );
364+ const char *strptr = (const char*)x264_opencl_source;
365+ size_t size = sizeof(x264_opencl_source);
366+ program = clCreateProgramWithSource( h->opencl.context, 1, &strptr, &size, &status );
367+ if( status != CL_SUCCESS || !program )
368+ {
369+ x264_log( h, X264_LOG_WARNING, "OpenCL: unable to create program\n" );
370+ return NULL;
371+ }
372+ }
373+
374+ /* Build the program binary for the OpenCL device */
375+ const char *buildopts = "";
376+ if( b_isamd && !h->opencl.b_device_AMD_SI )
377+ buildopts = "-DVECTORIZE=1";
378+ status = clBuildProgram( program, 1, &h->opencl.device, buildopts, NULL, NULL );
379+ if( status == CL_SUCCESS )
380+ {
381+ x264_opencl_cache_save( h, program, devname, devvendor, driverversion );
382+ return program;
383+ }
384+
385+ /* Compile failure, should not happen with production code. */
386+
387+ size_t build_log_len = 0;
388+
389+ status = clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, NULL, &build_log_len );
390+ if( status != CL_SUCCESS )
391+ {
392+ x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to query build log\n" );
393+ return NULL;
394+ }
395+
396+ char *build_log = x264_malloc( build_log_len );
397+ if( !build_log )
398+ {
399+ x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to alloc build log\n" );
400+ return NULL;
401+ }
402+
403+ status = clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, build_log, NULL );
404+ if( status != CL_SUCCESS )
405+ {
406+ x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to get build log\n" );
407+ x264_free( build_log );
408+ return NULL;
409+ }
410+
411+ FILE *lg = fopen( "x264_kernel_build_log.txt", "w" );
412+ if( lg )
413+ {
414+ fwrite( build_log, 1, build_log_len, lg );
415+ fclose( lg );
416+ x264_log( h, X264_LOG_WARNING, "OpenCL: kernel build errors written to x264_kernel_build_log.txt\n" );
417+ }
418+
419+ x264_free( build_log );
420+ return NULL;
421+}
422+
423+static void x264_opencl_free_lookahead( x264_t *h )
424+{
425+#define RELEASE( a, f ) if( a ) f( a );
426+ RELEASE( h->opencl.intra_kernel, clReleaseKernel )
427+ RELEASE( h->opencl.rowsum_intra_kernel, clReleaseKernel )
428+ RELEASE( h->opencl.downscale_kernel1, clReleaseKernel )
429+ RELEASE( h->opencl.downscale_kernel2, clReleaseKernel )
430+ RELEASE( h->opencl.downscale_hpel_kernel, clReleaseKernel )
431+ RELEASE( h->opencl.weightp_hpel_kernel, clReleaseKernel )
432+ RELEASE( h->opencl.weightp_scaled_images_kernel, clReleaseKernel )
433+ RELEASE( h->opencl.memset_kernel, clReleaseKernel )
434+ RELEASE( h->opencl.hme_kernel, clReleaseKernel )
435+ RELEASE( h->opencl.subpel_refine_kernel, clReleaseKernel )
436+ RELEASE( h->opencl.mode_select_kernel, clReleaseKernel )
437+ RELEASE( h->opencl.rowsum_inter_kernel, clReleaseKernel )
438+ RELEASE( h->opencl.lookahead_program, clReleaseProgram )
439+ RELEASE( h->opencl.row_satds[0], clReleaseMemObject )
440+ RELEASE( h->opencl.row_satds[1], clReleaseMemObject )
441+ RELEASE( h->opencl.frame_stats[0], clReleaseMemObject )
442+ RELEASE( h->opencl.frame_stats[1], clReleaseMemObject )
443+ RELEASE( h->opencl.mv_buffers[0], clReleaseMemObject )
444+ RELEASE( h->opencl.mv_buffers[1], clReleaseMemObject )
445+ RELEASE( h->opencl.mvp_buffer, clReleaseMemObject )
446+ RELEASE( h->opencl.luma_16x16_image[0], clReleaseMemObject )
447+ RELEASE( h->opencl.luma_16x16_image[1], clReleaseMemObject )
448+ RELEASE( h->opencl.lowres_mv_costs, clReleaseMemObject )
449+ RELEASE( h->opencl.lowres_costs[0], clReleaseMemObject )
450+ RELEASE( h->opencl.lowres_costs[1], clReleaseMemObject )
451+ RELEASE( h->opencl.page_locked_buffer, clReleaseMemObject )
452+ RELEASE( h->opencl.weighted_luma_hpel, clReleaseMemObject )
453+ for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
454+ {
455+ RELEASE( h->opencl.weighted_scaled_images[i], clReleaseMemObject )
456+ }
457+#undef RELEASE
458+}
459+
460+int x264_opencl_init_lookahead( x264_t *h )
461+{
462+ if( h->param.rc.i_lookahead == 0 )
463+ return -1;
464+
465+ char *kernelnames[] = {
466+ "mb_intra_cost_satd_8x8",
467+ "sum_intra_cost",
468+ "downscale_hpel",
469+ "downscale1",
470+ "downscale2",
471+ "memset_int16",
472+ "weightp_scaled_images",
473+ "weightp_hpel",
474+ "hierarchical_motion",
475+ "subpel_refine",
476+ "mode_selection",
477+ "sum_inter_cost"
478+ };
479+ cl_kernel *kernels[] = {
480+ &h->opencl.intra_kernel,
481+ &h->opencl.rowsum_intra_kernel,
482+ &h->opencl.downscale_hpel_kernel,
483+ &h->opencl.downscale_kernel1,
484+ &h->opencl.downscale_kernel2,
485+ &h->opencl.memset_kernel,
486+ &h->opencl.weightp_scaled_images_kernel,
487+ &h->opencl.weightp_hpel_kernel,
488+ &h->opencl.hme_kernel,
489+ &h->opencl.subpel_refine_kernel,
490+ &h->opencl.mode_select_kernel,
491+ &h->opencl.rowsum_inter_kernel
492+ };
493+ cl_int status;
494+
495+ h->opencl.lookahead_program = x264_opencl_compile( h );
496+ if( !h->opencl.lookahead_program )
497+ {
498+ x264_opencl_free_lookahead( h );
499+ return -1;
500+ }
501+
502+ for( int i = 0; i < sizeof(kernelnames)/sizeof(char*); i++ )
503+ {
504+ *kernels[i] = clCreateKernel( h->opencl.lookahead_program, kernelnames[i], &status );
505+ if( status != CL_SUCCESS )
506+ {
507+ x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to compile kernel '%s' (%d)\n", kernelnames[i], status );
508+ x264_opencl_free_lookahead( h );
509+ return -1;
510+ }
511+ }
512+
513+ h->opencl.page_locked_buffer = clCreateBuffer( h->opencl.context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, PAGE_LOCKED_BUF_SIZE, NULL, &status );
514+ if( status != CL_SUCCESS )
515+ {
516+ x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to allocate page-locked buffer, error '%d'\n", status );
517+ return -1;
518+ }
519+ h->opencl.page_locked_ptr = clEnqueueMapBuffer( h->opencl.queue, h->opencl.page_locked_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
520+ 0, PAGE_LOCKED_BUF_SIZE, 0, NULL, NULL, &status );
521+ if( status != CL_SUCCESS )
522+ {
523+ x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to map page-locked buffer, error '%d'\n", status );
524+ return -1;
525+ }
526+
527+ return 0;
528+}
529+
530+static void x264_opencl_error_notify( const char *errinfo, const void *private_info, size_t cb, void *user_data )
531+{
532+ /* Any error notification can be assumed to be fatal to the OpenCL context.
533+ * We need to stop using it immediately and switch over to the CPU for
534+ * lookahead
535+ */
536+ x264_t *h = (x264_t*)user_data;
537+ h->param.b_opencl = 0;
538+ x264_log( h, X264_LOG_ERROR, "OpenCL: %s\n", errinfo );
539+}
540+
541+int x264_opencl_init( x264_t *h )
542+{
543+ cl_int status;
544+ cl_uint numPlatforms;
545+ int ret = -1;
546+
547+ status = clGetPlatformIDs( 0, NULL, &numPlatforms );
548+ if( status != CL_SUCCESS || numPlatforms == 0 )
549+ {
550+ x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n");
551+ return -1;
552+ }
553+
554+ cl_platform_id *platforms = (cl_platform_id*)x264_malloc( numPlatforms * sizeof(cl_platform_id) );
555+ status = clGetPlatformIDs( numPlatforms, platforms, NULL );
556+ if( status != CL_SUCCESS )
557+ {
558+ x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n");
559+ x264_free( platforms );
560+ return -1;
561+ }
562+
563+ /* Select the first OpenCL platform that supports a GPU device that supports our
564+ * required image (texture) formats
565+ */
566+ for( cl_uint i = 0; i < numPlatforms; ++i )
567+ {
568+ cl_uint gpu_count;
569+ status = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &gpu_count );
570+ if( status == CL_SUCCESS && gpu_count > 0 )
571+ {
572+ /* take GPU 0 */
573+ status = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 1, &h->opencl.device, NULL );
574+ if( status != CL_SUCCESS )
575+ continue;
576+
577+ h->opencl.context = clCreateContext( NULL, 1, &h->opencl.device, (void*)x264_opencl_error_notify, (void*)h, &status );
578+ if( status != CL_SUCCESS )
579+ continue;
580+
581+ cl_bool image_support;
582+ clGetDeviceInfo( h->opencl.device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL );
583+ if( !image_support )
584+ continue;
585+
586+#define MAX_IMAGE_TYPES 100
587+ cl_uint count = 0;
588+ cl_image_format imageType[MAX_IMAGE_TYPES];
589+ clGetSupportedImageFormats( h->opencl.context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, MAX_IMAGE_TYPES, imageType, &count );
590+ count = X264_MIN( count, MAX_IMAGE_TYPES );
591+
592+ int b_has_r = 0;
593+ int b_has_rgba = 0;
594+ for( cl_uint j = 0; j < count; j++ )
595+ {
596+ if( imageType[j].image_channel_order == CL_R )
597+ b_has_r = 1;
598+ else if( imageType[j].image_channel_order == CL_RGBA )
599+ b_has_rgba = 1;
600+ }
601+ if( !b_has_r || !b_has_rgba )
602+ continue;
603+
604+ h->opencl.queue = clCreateCommandQueue( h->opencl.context, h->opencl.device, 0, &status );
605+ if( status != CL_SUCCESS )
606+ continue;
607+
608+ ret = 0;
609+ break;
610+ }
611+ }
612+
613+ x264_free( platforms );
614+
615+ if( !h->param.psz_clbin_file )
616+ h->param.psz_clbin_file = "x264_lookahead.clbin";
617+
618+ if( !ret )
619+ ret = x264_opencl_init_lookahead( h );
620+
621+ return ret;
622+}
623+
624+void x264_opencl_frame_delete( x264_frame_t *frame )
625+{
626+#define RELEASEBUF(mem) if( mem ) clReleaseMemObject( mem );
627+ for( int j = 0; j < NUM_IMAGE_SCALES; j++ )
628+ RELEASEBUF( frame->opencl.scaled_image2Ds[j] );
629+ RELEASEBUF( frame->opencl.luma_hpel );
630+ RELEASEBUF( frame->opencl.inv_qscale_factor );
631+ RELEASEBUF( frame->opencl.intra_cost );
632+ RELEASEBUF( frame->opencl.lowres_mvs0 );
633+ RELEASEBUF( frame->opencl.lowres_mvs1 );
634+ RELEASEBUF( frame->opencl.lowres_mv_costs0 );
635+ RELEASEBUF( frame->opencl.lowres_mv_costs1 );
636+#undef RELEASEBUF
637+}
638+
639+void x264_opencl_free( x264_t *h )
640+{
641+ x264_opencl_free_lookahead( h );
642+
643+ if( h->opencl.queue )
644+ clReleaseCommandQueue( h->opencl.queue );
645+ if( h->opencl.context )
646+ clReleaseContext( h->opencl.context );
647+}
648+#endif /* HAVE_OPENCL */
649diff -r bdffc2c1e85b -r e5e4a79d3f21 common/opencl.h
650--- /dev/null Thu Jan 01 00:00:00 1970 +0000
651+++ b/common/opencl.h Mon Aug 20 22:33:23 2012 -0500
@@ -0,0 +1,103 @@
+/*****************************************************************************
+ * opencl.h: OpenCL structures and defines
+ *****************************************************************************
+ * Copyright (C) 2012 x264 project
+ *
+ * Authors: Steve Borho <sborho@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_OPENCL_H
+#define X264_OPENCL_H
+
+#include "common/common.h"
+
+#define NUM_IMAGE_SCALES 4
+
+#define MAX_FINISH_COPIES 1024
+#define PAGE_LOCKED_BUF_SIZE 32 * 1024 * 1024
+
+typedef struct
+{
+ cl_context context;
+ cl_device_id device;
+ cl_command_queue queue;
+
+ cl_program lookahead_program;
+ cl_int last_buf;
+
+ cl_mem page_locked_buffer;
+ char *page_locked_ptr;
+ int pl_occupancy;
+
+ struct
+ {
+ void *src;
+ void *dest;
+ int bytes;
+ } copies[MAX_FINISH_COPIES];
+ int num_copies;
+
+ int b_device_AMD_SI;
+
+ /* downscale lowres luma */
+ cl_kernel downscale_hpel_kernel;
+ cl_kernel downscale_kernel1;
+ cl_kernel downscale_kernel2;
+ cl_mem luma_16x16_image[2];
+
+ /* weightp filtering */
+ cl_kernel weightp_hpel_kernel;
+ cl_kernel weightp_scaled_images_kernel;
+ cl_mem weighted_scaled_images[NUM_IMAGE_SCALES];
+ cl_mem weighted_luma_hpel;
+
+ /* intra */
+ cl_kernel memset_kernel;
+ cl_kernel intra_kernel;
+ cl_kernel rowsum_intra_kernel;
+ cl_mem row_satds[2];
+
+ /* hierarchical motion estimation */
+ cl_kernel hme_kernel;
+ cl_kernel subpel_refine_kernel;
+ cl_mem mv_buffers[2];
+ cl_mem lowres_mv_costs;
+ cl_mem mvp_buffer;
+
+ /* bidir */
+ cl_kernel mode_select_kernel;
+ cl_kernel rowsum_inter_kernel;
+ cl_mem lowres_costs[2];
+ cl_mem frame_stats[2]; /* cost_est, cost_est_aq, intra_mbs */
+} x264_opencl_t;
+
+typedef struct
+{
+ cl_mem scaled_image2Ds[NUM_IMAGE_SCALES];
+ cl_mem luma_hpel;
+ cl_mem inv_qscale_factor;
+ cl_mem intra_cost;
+ cl_mem lowres_mvs0;
+ cl_mem lowres_mvs1;
+ cl_mem lowres_mv_costs0;
+ cl_mem lowres_mv_costs1;
+} x264_frame_opencl_t;
+
+#endif
859diff -r bdffc2c1e85b -r e5e4a79d3f21 common/opencl/bidir.cl
860--- /dev/null Thu Jan 01 00:00:00 1970 +0000
861+++ b/common/opencl/bidir.cl Mon Aug 20 22:33:23 2012 -0500
862@@ -0,0 +1,259 @@
863+
864+/* Four threads cooperatively measure 8x8 BIDIR cost with SATD */
865+int bidir_satd_8x8_ii_coop4(
866+ read_only image2d_t fenc_lowres, int2 fencpos,
867+ read_only image2d_t fref0_planes, int2 qpos0,
868+ read_only image2d_t fref1_planes, int2 qpos1,
869+ int weight,
870+ local sum2_t *tmpp,
871+ int idx )
872+{
873+ volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
874+ sum2_t b0, b1, b2, b3;
875+ sum2_t sum = 0;
876+
877+ // fencpos is full-pel position of original MB
878+ // qpos0 is qpel position within reference frame 0
879+ // qpos1 is qpel position within reference frame 1
880+
881+ int2 fref0Apos = (int2)(qpos0.x>>2, qpos0.y>>2);
882+ int hpel0A = ((qpos0.x&2)>>1) + (qpos0.y&2);
883+
884+ int2 qpos0B = (int2)qpos0 + (int2)(((qpos0.x&1)<<1), ((qpos0.y&1)<<1));
885+ int2 fref0Bpos = (int2)(qpos0B.x>>2, qpos0B.y>>2);
886+ int hpel0B = ((qpos0B.x&2)>>1) + (qpos0B.y&2);
887+
888+ int2 fref1Apos = (int2)(qpos1.x>>2, qpos1.y>>2);
889+ int hpel1A = ((qpos1.x&2)>>1) + (qpos1.y&2);
890+
891+ int2 qpos1B = (int2)qpos1 + (int2)(((qpos1.x&1)<<1), ((qpos1.y&1)<<1));
892+ int2 fref1Bpos = (int2)(qpos1B.x>>2, qpos1B.y>>2);
893+ int hpel1B = ((qpos1B.x&2)>>1) + (qpos1B.y&2);
894+
895+ uint mask_shift0A = 8 * hpel0A, mask_shift0B = 8 * hpel0B;
896+ uint mask_shift1A = 8 * hpel1A, mask_shift1B = 8 * hpel1B;
897+
898+ uint vA, vB;
899+ uint enc, ref0, ref1;
900+ uint a0, a1;
901+ const int weight2 = 64 - weight;
902+
903+#define READ_BIDIR_DIFF( OUT, X )\
904+ enc = read_imageui( fenc_lowres, sampler, fencpos + (int2)(X, idx) ).s0;\
905+ vA = (read_imageui( fref0_planes, sampler, fref0Apos + (int2)(X, idx) ).s0 >> mask_shift0A) & 0xFF;\
906+ vB = (read_imageui( fref0_planes, sampler, fref0Bpos + (int2)(X, idx) ).s0 >> mask_shift0B) & 0xFF;\
907+ ref0 = rhadd( vA, vB );\
908+ vA = (read_imageui( fref1_planes, sampler, fref1Apos + (int2)(X, idx) ).s0 >> mask_shift1A) & 0xFF;\
909+ vB = (read_imageui( fref1_planes, sampler, fref1Bpos + (int2)(X, idx) ).s0 >> mask_shift1B) & 0xFF;\
910+ ref1 = rhadd( vA, vB );\
911+ OUT = enc - ((ref0 * weight + ref1 * weight2 + (1 << 5)) >> 6);
912+
913+#define READ_DIFF_EX( OUT, a, b )\
914+ READ_BIDIR_DIFF( a0, a );\
915+ READ_BIDIR_DIFF( a1, b );\
916+ OUT = a0 + (a1<<BITS_PER_SUM);
917+
918+#define ROW_8x4_SATD( a, b, c )\
919+ fencpos.y += a;\
920+ fref0Apos.y += b;\
921+ fref0Bpos.y += b;\
922+ fref1Apos.y += c;\
923+ fref1Bpos.y += c;\
924+ READ_DIFF_EX( b0, 0, 4 );\
925+ READ_DIFF_EX( b1, 1, 5 );\
926+ READ_DIFF_EX( b2, 2, 6 );\
927+ READ_DIFF_EX( b3, 3, 7 );\
928+ HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
929+ HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
930+ sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );
931+
932+ ROW_8x4_SATD( 0, 0, 0 );
933+ ROW_8x4_SATD( 4, 4, 4 );
934+
935+#undef READ_BIDIR_DIFF
936+#undef READ_DIFF_EX
937+#undef ROW_8x4_SATD
938+
939+ return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
940+}
941+
942+/*
943+ * mode selection - pick the least cost partition type for each 8x8 macroblock.
944+ * Intra, list0 or list1. If encoding a B frame, test three bidir
945+ * possibilities.
946+ *
947+ * fenc_lowres_mvs[0|1] and fenc_lowres_mv_costs[0|1] are large buffers that
948+ * hold many frames worth of motion vectors. We must offset into the correct
949+ * location for this frame's vectors:
950+ *
951+ * CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1]
952+ * GPU equivalent: fenc_lowres_mvs0[(b - p0 - 1) * mb_count]
953+ *
954+ * global launch dimensions: [mb_width, mb_height]
955+ *
956+ * If this is a B frame, launch dims are [mb_width * 4, mb_height]
957+ */
958+kernel void mode_selection(
959+ read_only image2d_t fenc_lowres,
960+ read_only image2d_t fref0_planes,
961+ read_only image2d_t fref1_planes,
962+ const global short2 *fenc_lowres_mvs0,
963+ const global short2 *fenc_lowres_mvs1,
964+ const global short2 *fref1_lowres_mvs0,
965+ const global int16_t *fenc_lowres_mv_costs0,
966+ const global int16_t *fenc_lowres_mv_costs1,
967+ const global uint16_t *fenc_intra_cost,
968+ global uint16_t *lowres_costs,
969+ global int *frame_stats,
970+ local int16_t *cost_local,
971+ local sum2_t *satd_local,
972+ int mb_width,
973+ int bipred_weight,
974+ int dist_scale_factor,
975+ int b,
976+ int p0,
977+ int p1,
978+ int lambda )
979+{
980+ int mb_x = get_global_id( 0 );
981+ int b_bidir = b < p1;
982+ if( b_bidir )
983+ {
+ /* B frames launch 4 cooperating work-items per MB (see header comment);
+ * recover the true MB x coordinate and drop the overshoot lanes. */
984+ mb_x >>= 2;
985+ if( mb_x >= mb_width )
986+ return;
987+ }
988+ int mb_y = get_global_id( 1 );
989+ int mb_height = get_global_size( 1 );
990+ int mb_count = mb_width * mb_height;
991+ int mb_xy = mb_x + mb_y * mb_width;
992+
993+ /* Initialize int frame_stats[4] for next kernel (sum_inter_cost) */
994+ if( mb_x < 4 && mb_y == 0 )
995+ frame_stats[mb_x] = 0;
996+
+ /* bcost/list_used track the cheapest mode per MB:
+ * 0 = intra, 1 = list0 MV, 2 = list1 MV, 3 = bidir */
997+ int bcost = COST_MAX;
998+ int list_used = 0;
999+
1000+ if( !b_bidir )
1001+ {
1002+ int icost = fenc_intra_cost[mb_xy];
1003+ COPY2_IF_LT( bcost, icost, list_used, 0 );
1004+ }
1005+ if( b != p0 )
1006+ {
1007+ int mv_cost0 = fenc_lowres_mv_costs0[(b - p0 - 1) * mb_count + mb_xy];
1008+ COPY2_IF_LT( bcost, mv_cost0, list_used, 1 );
1009+ }
1010+ if( b != p1 )
1011+ {
1012+ int mv_cost1 = fenc_lowres_mv_costs1[(p1 - b - 1) * mb_count + mb_xy];
1013+ COPY2_IF_LT( bcost, mv_cost1, list_used, 2 );
1014+ }
1015+
1016+ if( b_bidir )
1017+ {
1018+ int2 coord = (int2)(mb_x << 3, mb_y << 3);
1019+ int mb_i = get_global_id( 0 ) & 3;
1020+ int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
1021+ cost_local += mb_in_group * 4;
1022+ satd_local += mb_in_group * 16;
1023+
+ /* The 4 cooperating work-items each SATD one quarter of the 8x8 via
+ * bidir_satd_8x8_ii_coop4; the partial costs meet in cost_local[0..3]. */
1024+#define TRY_BIDIR( mv0, mv1, penalty )\
1025+ {\
1026+ int2 qpos0 = (int2)((coord.x<<2) + mv0.x, (coord.y<<2) + mv0.y);\
1027+ int2 qpos1 = (int2)((coord.x<<2) + mv1.x, (coord.y<<2) + mv1.y);\
1028+ cost_local[mb_i] = bidir_satd_8x8_ii_coop4( fenc_lowres, coord, fref0_planes, qpos0, fref1_planes, qpos1, bipred_weight, satd_local, mb_i );\
1029+ int cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\
1030+ COPY2_IF_LT( bcost, penalty * lambda + cost, list_used, 3 );\
1031+ }
1032+ /* temporal prediction */
1033+ short2 dmv0, dmv1;
1034+ short2 mvr = fref1_lowres_mvs0[mb_xy];
1035+ dmv0.x = (mvr.x * dist_scale_factor + 128) >> 8;
1036+ dmv0.y = (mvr.y * dist_scale_factor + 128) >> 8;
1037+ dmv1.x = dmv0.x - mvr.x;
1038+ dmv1.y = dmv0.y - mvr.y;
1039+ TRY_BIDIR( dmv0, dmv1, 0 )
1040+
+ /* If the temporal MV pair was nonzero, also try the all-zero pair. */
1041+ if( as_uint( dmv0 ) || as_uint( dmv1 ) )
1042+ {
1043+ /* B-direct prediction */
1044+ dmv0 = 0; dmv1 = 0;
1045+ TRY_BIDIR( dmv0, dmv1, 0 );
1046+ }
1047+
1048+ /* L0+L1 prediction */
1049+ dmv0 = fenc_lowres_mvs0[(b - p0 - 1) * mb_count + mb_xy];
1050+ dmv1 = fenc_lowres_mvs1[(p1 - b - 1) * mb_count + mb_xy];
1051+ TRY_BIDIR( dmv0, dmv1, 5 );
1052+#undef TRY_BIDIR
1053+ }
1054+
+ /* Pack the winner: cost in the low bits, chosen list in the high bits. */
1055+ lowres_costs[mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
1056+}
1057+
1058+/*
1059+ * parallel sum inter costs
1060+ *
1061+ * global launch dimensions: [256, mb_height]
1062+ */
1063+kernel void sum_inter_cost(
1064+ const global uint16_t *fenc_lowres_costs,
1065+ const global uint16_t *inv_qscale_factor,
1066+ global int *fenc_row_satds,
1067+ global int *frame_stats,
1068+ int mb_width,
1069+ int bframe_bias,
1070+ int b,
1071+ int p0,
1072+ int p1 )
1073+{
+ /* One row of 256 lanes (see launch dims) strides across the MB row,
+ * accumulating per-lane partial sums that are reduced below. */
1074+ int y = get_global_id( 1 );
1075+ int mb_height = get_global_size( 1 );
1076+
1077+ int row_satds = 0;
1078+ int cost_est = 0;
1079+ int cost_est_aq = 0;
1080+ int intra_mbs = 0;
1081+
1082+ for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
1083+ {
1084+ int mb_xy = x + y * mb_width;
1085+ int cost = fenc_lowres_costs[mb_xy] & LOWRES_COST_MASK;
1086+ int list = fenc_lowres_costs[mb_xy] >> LOWRES_COST_SHIFT;
+ /* Border MBs are excluded from the frame-level estimate unless the
+ * frame is tiny (<= 2 MBs in either dimension). */
1087+ int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
1088+
+ /* list 0 == intra winner in mode_selection */
1089+ if( list == 0 && b_frame_score_mb )
1090+ intra_mbs++;
1091+
1092+ int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
1093+
1094+ row_satds += cost_aq;
1095+
1096+ if( b_frame_score_mb )
1097+ {
1098+ cost_est += cost;
1099+ cost_est_aq += cost_aq;
1100+ }
1101+ }
1102+
1103+ local int buffer[256];
1104+ int x = get_global_id( 0 );
1105+
1106+ row_satds = parallel_sum( row_satds, x, buffer );
1107+ cost_est = parallel_sum( cost_est, x, buffer );
1108+ cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
1109+ intra_mbs = parallel_sum( intra_mbs, x, buffer );
1110+
+ /* b != p1 means this is a B frame; down-weight its cost estimate by
+ * bframe_bias — presumably mirroring the CPU lookahead, confirm vs slicetype.c. */
1111+ if( b != p1 )
1112+ cost_est = (int)((float)cost_est * 100 / (120 + bframe_bias));
1113+
+ /* Lane 0 publishes the reduced sums for the whole row. */
1114+ if( get_global_id( 0 ) == 0 )
1115+ {
1116+ fenc_row_satds[y] = row_satds;
1117+ atomic_add( frame_stats + COST_EST, cost_est );
1118+ atomic_add( frame_stats + COST_EST_AQ, cost_est_aq );
1119+ atomic_add( frame_stats + INTRA_MBS, intra_mbs );
1120+ }
1121+}
1122diff -r bdffc2c1e85b -r e5e4a79d3f21 common/opencl/downscale.cl
1123--- /dev/null Thu Jan 01 00:00:00 1970 +0000
1124+++ b/common/opencl/downscale.cl Mon Aug 20 22:33:23 2012 -0500
1125@@ -0,0 +1,136 @@
1126+/*
1127+ * downscale lowres luma: full-res buffer to down scale image, and to packed hpel image
1128+ *
1129+ * --
1130+ *
1131+ * fenc_img is an output image (area of memory referenced through a texture
1132+ * cache). A read of any pixel location (x,y) returns four pixel values:
1133+ *
1134+ * val.s0 = P(x,y)
1135+ * val.s1 = P(x+1,y)
1136+ * val.s2 = P(x+2,y)
1137+ * val.s3 = P(x+3,y)
1138+ *
1139+ * This is a 4x replication of the lowres pixels, a trade-off between memory
1140+ * size and read latency.
1141+ *
1142+ * --
1143+ *
1144+ * hpel_planes is an output image that contains the four HPEL planes used for
1145+ * subpel refinement. A read of any pixel location (x,y) returns a UInt32 with
1146+ * the four planar values C | V | H | F
1147+ *
1148+ * launch dimensions: [lowres-width, lowres-height]
1149+ */
1150+kernel void downscale_hpel(
1151+ const global pixel *fenc,
1152+ write_only image2d_t fenc_img,
1153+ write_only image2d_t hpel_planes,
1154+ int stride )
1155+{
1156+ int x = get_global_id( 0 );
1157+ int y = get_global_id( 1 );
1158+ uint4 values;
1159+
+ /* Three consecutive full-res rows; src2 clamps at the bottom edge. */
1160+ fenc += y * stride * 2;
1161+ const global pixel *src1 = fenc + stride;
1162+ const global pixel *src2 = (y == get_global_size( 1 )-1) ? src1 : src1 + stride;
1163+ int2 pos = (int2)(x, y);
1164+ pixel right, left;
1165+
+ /* Each plane pixel is a 2x2 box filter (two rhadd stages) at the four
+ * half-pel phases F/H/V/C.
+ * NOTE(review): 'right' is loaded from the left column and 'left' from the
+ * right column — the names are swapped, but the averaged result is the same. */
1166+ right = rhadd( fenc[x*2], src1[x*2] );
1167+ left = rhadd( fenc[x*2+1], src1[x*2+1] );
1168+ values.s0 = rhadd( right, left ); // F
1169+
1170+ right = rhadd( fenc[2*x+1], src1[2*x+1] );
1171+ left = rhadd( fenc[2*x+2], src1[2*x+2] );
1172+ values.s1 = rhadd( right, left ); // H
1173+
1174+ right = rhadd( src1[2*x], src2[2*x] );
1175+ left = rhadd( src1[2*x+1], src2[2*x+1] );
1176+ values.s2 = rhadd( right, left ); // V
1177+
1178+ right = rhadd( src1[2*x+1], src2[2*x+1] );
1179+ left = rhadd( src1[2*x+2], src2[2*x+2] );
1180+ values.s3 = rhadd( right, left ); // C
1181+
+ /* Pack the four planes into one uint as C|V|H|F (see header comment). */
1182+ uint4 val = (uint4) ((values.s3 & 0xff) << 24) | ((values.s2 & 0xff) << 16) | ((values.s1 & 0xff) << 8) | (values.s0 & 0xff);
1183+ write_imageui( hpel_planes, pos, val );
1184+
+ /* Fill s1..s3 with the next three lowres (F) pixels, clamped at the right
+ * edge, producing the 4x-replicated fenc image described above. */
1185+ x = x+1 < get_global_size( 0 ) ? x+1 : x;
1186+ right = rhadd( fenc[x*2], src1[x*2] );
1187+ left = rhadd( fenc[x*2+1], src1[x*2+1] );
1188+ values.s1 = rhadd( right, left );
1189+
1190+ x = x+1 < get_global_size( 0 ) ? x+1 : x;
1191+ right = rhadd( fenc[x*2], src1[x*2] );
1192+ left = rhadd( fenc[x*2+1], src1[x*2+1] );
1193+ values.s2 = rhadd( right, left );
1194+
1195+ x = x+1 < get_global_size( 0 ) ? x+1 : x;
1196+ right = rhadd( fenc[x*2], src1[x*2] );
1197+ left = rhadd( fenc[x*2+1], src1[x*2+1] );
1198+ values.s3 = rhadd( right, left );
1199+
1200+ write_imageui( fenc_img, pos, values );
1201+}
1202+
1203+/*
1204+ * downscale lowres hierarchical motion search image, copy from one image to
1205+ * another decimated image. This kernel is called iteratively to generate all
1206+ * of the downscales.
1207+ *
1208+ * launch dimensions: [lower_res width, lower_res height]
1209+ */
1210+kernel void downscale1( read_only image2d_t higher_res, write_only image2d_t lower_res )
1211+{
1212+ int x = get_global_id( 0 );
1213+ int y = get_global_id( 1 );
1214+ int2 pos = (int2)(x, y);
1215+ int gs = get_global_size( 0 );
1216+ uint4 top, bot, values;
+ /* Each output component is a 2x2 box filter (nested rhadd) of the
+ * higher-res image; one image read yields 4 adjacent pixels. */
1217+ top = read_imageui( higher_res, sampler, (int2)( x*2, 2*y ) );
1218+ bot = read_imageui( higher_res, sampler, (int2)( x*2, 2*y+1 ) );
1219+ values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
1220+
1221+ /* these select statements appear redundant, and they should be, but tests break when
1222+ * they are not here. I believe this was caused by a driver bug
1223+ */
1224+ values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
1225+ top = read_imageui( higher_res, sampler, (int2)( x*2+4, 2*y ) );
1226+ bot = read_imageui( higher_res, sampler, (int2)( x*2+4, 2*y+1 ) );
1227+ values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
1228+ values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
1229+ write_imageui( lower_res, pos, (uint4)(values) );
1230+}
1231+
1232+/*
1233+ * Second copy of downscale kernel, no differences. This is a (no perf loss)
1234+ * workaround for a scheduling bug in current Tahiti drivers. This bug has
1235+ * theoretically been fixed in the July 2012 driver release from AMD.
1236+ */
+/* Byte-for-byte duplicate of downscale1; kept separate only as the driver
+ * scheduling workaround described in the preceding comment. Keep the two
+ * bodies in sync if either is ever changed. */
1237+kernel void downscale2( read_only image2d_t higher_res, write_only image2d_t lower_res )
1238+{
1239+ int x = get_global_id( 0 );
1240+ int y = get_global_id( 1 );
1241+ int2 pos = (int2)(x, y);
1242+ int gs = get_global_size( 0 );
1243+ uint4 top, bot, values;
1244+ top = read_imageui( higher_res, sampler, (int2)( x*2, 2*y ) );
1245+ bot = read_imageui( higher_res, sampler, (int2)( x*2, 2*y+1 ) );
1246+ values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
1247+
1248+ // see comment in above function copy
1249+ values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
1250+ top = read_imageui( higher_res, sampler, (int2)( x*2+4, 2*y ) );
1251+ bot = read_imageui( higher_res, sampler, (int2)( x*2+4, 2*y+1 ) );
1252+ values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
1253+ values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
1254+ write_imageui( lower_res, pos, (uint4)(values) );
1255+}
1256+
1257+/* OpenCL 1.2 finally added a memset command, but we're not targeting 1.2 */
+/* One work-item stores one int16_t element of buf. */
1258+kernel void memset_int16( global int16_t *buf, int16_t value )
1259+{
1260+ buf[get_global_id( 0 )] = value;
1261+}
1262diff -r bdffc2c1e85b -r e5e4a79d3f21 common/opencl/intra.cl
1263--- /dev/null Thu Jan 01 00:00:00 1970 +0000
1264+++ b/common/opencl/intra.cl Mon Aug 20 22:33:23 2012 -0500
1265@@ -0,0 +1,1073 @@
1266+/* Lookahead lowres intra analysis
1267+ *
1268+ * Each intra analysis function has been implemented twice, once for scalar GPUs
1269+ * (NV) and once for vectorized GPUs (AMD pre-Southern Islands). x264 detects
1270+ * the GPU type and sets the -DVECTORIZE compile flag accordingly.
1271+ *
1272+ * All the intra analysis functions were based on their C versions.
1273+ */
1274+
1275+#if VECTORIZE
+/* SATD of an 8x4 block of 'data' against four 8-wide predicted rows
+ * (pr0..pr3), fully in registers: per-row 4-point Hadamard on the residual,
+ * then a vertical pass over the four rows, sum of absolute values, halved
+ * (the >>1 at the end) per the usual SATD normalization. */
1276+int satd_8x4_intra_lr( const local pixel *data, int data_stride, int8 pr0, int8 pr1, int8 pr2, int8 pr3 )
1277+{
1278+ int8 a_v, d_v;
1279+ int2 tmp00, tmp01, tmp02, tmp03, tmp10, tmp11, tmp12, tmp13;
1280+ int2 tmp20, tmp21, tmp22, tmp23, tmp30, tmp31, tmp32, tmp33;
1281+
+ /* .s04152637 interleaves even/odd residual columns before the horizontal
+ * Hadamard — assumed to pair columns for the two 4-wide transforms; TODO confirm
+ * against HADAMARD4V's lane convention. */
1282+ d_v = convert_int8( vload8( 0, data ) );
1283+ a_v.s01234567 = (d_v - pr0).s04152637;
1284+ HADAMARD4V( tmp00, tmp01, tmp02, tmp03, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
1285+
1286+ data += data_stride;
1287+ d_v = convert_int8( vload8( 0, data ) );
1288+ a_v.s01234567 = (d_v - pr1).s04152637;
1289+ HADAMARD4V( tmp10, tmp11, tmp12, tmp13, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
1290+
1291+ data += data_stride;
1292+ d_v = convert_int8( vload8( 0, data ) );
1293+ a_v.s01234567 = (d_v - pr2).s04152637;
1294+ HADAMARD4V( tmp20, tmp21, tmp22, tmp23, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
1295+
1296+ data += data_stride;
1297+ d_v = convert_int8( vload8( 0, data ) );
1298+ a_v.s01234567 = (d_v - pr3).s04152637;
1299+ HADAMARD4V( tmp30, tmp31, tmp32, tmp33, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
1300+
1301+ uint8 sum_v;
1302+
+ /* Vertical Hadamard across the four rows, accumulating |coefficients|. */
1303+ HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp00, tmp10, tmp20, tmp30 );
1304+ sum_v = abs( a_v );
1305+
1306+ HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp01, tmp11, tmp21, tmp31 );
1307+ sum_v += abs( a_v );
1308+
1309+ HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp02, tmp12, tmp22, tmp32 );
1310+ sum_v += abs( a_v );
1311+
1312+ HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp03, tmp13, tmp23, tmp33 );
1313+ sum_v += abs( a_v );
1314+
+ /* Horizontal reduction of the 8 lanes to one scalar. */
1315+ uint4 sum2 = sum_v.hi + sum_v.lo;
1316+ uint2 sum3 = sum2.hi + sum2.lo;
1317+ return ( sum3.hi + sum3.lo ) >> 1;
1318+}
1319+#else
+/* NOTE(review): the scalar branch generates a function named satd_8x4_lp
+ * while the vector branch defines satd_8x4_intra_lr — presumably the
+ * non-VECTORIZE predictors (not visible here) call satd_8x4_lp; confirm. */
1320+SATD_C_8x4_Q( satd_8x4_lp, const local, private )
1321+#endif
1322+
1323+/****************************************************************************
1324+ * 8x8 prediction for intra luma block
1325+ ****************************************************************************/
1326+
+/* F1 = rounding average of two neighbors; F2 = (a + 2b + c + 2) >> 2, the
+ * standard 3-tap intra filter.
+ * NOTE(review): F2's expansion is not fully parenthesized — safe in the
+ * 'x = F2(...)' uses in this file, but would mis-bind inside a larger
+ * expression. */
1327+#define F1 rhadd
1328+#define F2( a, b, c ) ( a+2*b+c+2 )>>2
1329+
1330+#if VECTORIZE
+/* SATD cost of the 8x8 DDL (diagonal down-left) intra mode: the predicted
+ * rows are built in registers from the 'top' neighbors with the (1,2,1)/4
+ * filter and SATD'd against the source, one 8x4 half at a time — the
+ * prediction is never written to memory. */
1331+int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
1332+{
1333+ int8 pr0, pr1, pr2, pr3;
1334+
1335+ // Upper half of pred[]
1336+ pr0.s0 = ( 2 + top[0] + 2*top[1] + top[2] ) >> 2;
1337+ pr0.s1 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
1338+ pr0.s2 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
1339+ pr0.s3 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
1340+ pr0.s4 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
1341+ pr0.s5 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
1342+ pr0.s6 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
1343+ pr0.s7 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
1344+
1345+ pr1.s0 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
1346+ pr1.s1 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
1347+ pr1.s2 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
1348+ pr1.s3 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
1349+ pr1.s4 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
1350+ pr1.s5 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
1351+ pr1.s6 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
1352+ pr1.s7 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
1353+
1354+ pr2.s0 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
1355+ pr2.s1 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
1356+ pr2.s2 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
1357+ pr2.s3 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
1358+ pr2.s4 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
1359+ pr2.s5 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
1360+ pr2.s6 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
1361+ pr2.s7 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
1362+
1363+ pr3.s0 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
1364+ pr3.s1 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
1365+ pr3.s2 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
1366+ pr3.s3 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
1367+ pr3.s4 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
1368+ pr3.s5 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
1369+ pr3.s6 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
1370+ pr3.s7 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
1371+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
1372+
1373+ // Lower half of pred[]
1374+ pr0.s0 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
1375+ pr0.s1 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
1376+ pr0.s2 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
1377+ pr0.s3 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
1378+ pr0.s4 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
1379+ pr0.s5 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
1380+ pr0.s6 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
1381+ pr0.s7 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
1382+
1383+ pr1.s0 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
1384+ pr1.s1 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
1385+ pr1.s2 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
1386+ pr1.s3 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
1387+ pr1.s4 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
1388+ pr1.s5 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
1389+ pr1.s6 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
1390+ pr1.s7 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
1391+
1392+ pr2.s0 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
1393+ pr2.s1 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
1394+ pr2.s2 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
1395+ pr2.s3 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
1396+ pr2.s4 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
1397+ pr2.s5 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
1398+ pr2.s6 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
1399+ pr2.s7 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
1400+
1401+ pr3.s0 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
1402+ pr3.s1 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
1403+ pr3.s2 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
1404+ pr3.s3 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
1405+ pr3.s4 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
1406+ pr3.s5 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
1407+ pr3.s6 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
+ /* Corner: top[15] is repeated (a + 3b form) at the block's far edge. */
1408+ pr3.s7 = ( 2 + top[14] + 3*top[15] ) >> 2;
1409+
1410+ return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
1411+}
1412+
+/* SATD cost of the 8x8 DDR (diagonal down-right) intra mode; prediction is
+ * built from 'left', 'left_top' and 'top' neighbors with F2, and evaluated
+ * in two 8x4 halves. */
1413+int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
1414+{
1415+ int8 pr0, pr1, pr2, pr3;
1416+
1417+ // Upper half of pred[]
1418+ pr3.s0 = F2( left[1], left[2], left[3] );
1419+ pr2.s0 = pr3.s1 = F2( left[0], left[1], left[2] );
1420+ pr1.s0 = pr2.s1 = pr3.s2 = F2( left[1], left[0], left_top );
1421+ pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
1422+ pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
1423+ pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
1424+ pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
1425+ pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
1426+ pr0.s5 = pr1.s6 = pr2.s7 = F2( top[3], top[4], top[5] );
1427+ pr0.s6 = pr1.s7 = F2( top[4], top[5], top[6] );
1428+ pr0.s7 = F2( top[5], top[6], top[7] );
1429+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
1430+
1431+ // Lower half of pred[]
1432+ pr3.s0 = F2( left[5], left[6], left[7] );
1433+ pr2.s0 = pr3.s1 = F2( left[4], left[5], left[6] );
1434+ pr1.s0 = pr2.s1 = pr3.s2 = F2( left[3], left[4], left[5] );
1435+ pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[2], left[3], left[4] );
1436+ pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left[1], left[2], left[3] );
1437+ pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( left[0], left[1], left[2] );
1438+ pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( left[1], left[0], left_top );
1439+ pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( left[0], left_top, top[0] );
1440+ pr0.s5 = pr1.s6 = pr2.s7 = F2( left_top, top[0], top[1] );
1441+ pr0.s6 = pr1.s7 = F2( top[0], top[1], top[2] );
1442+ pr0.s7 = F2( top[1], top[2], top[3] );
1443+ return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
1444+}
1445+
+/* SATD cost of the 8x8 VR (vertical-right) intra mode, two 8x4 halves;
+ * even rows use F1 (2-tap), odd rows F2 (3-tap), along the diagonal. */
1446+int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
1447+{
1448+ int8 pr0, pr1, pr2, pr3;
1449+
1450+ // Upper half of pred[]
1451+ pr2.s0 = F2( left[1], left[0], left_top );
1452+ pr3.s0 = F2( left[2], left[1], left[0] );
1453+ pr1.s0 = pr3.s1 = F2( left[0], left_top, top[0] );
1454+ pr0.s0 = pr2.s1 = F1( left_top, top[0] );
1455+ pr1.s1 = pr3.s2 = F2( left_top, top[0], top[1] );
1456+ pr0.s1 = pr2.s2 = F1( top[0], top[1] );
1457+ pr1.s2 = pr3.s3 = F2( top[0], top[1], top[2] );
1458+ pr0.s2 = pr2.s3 = F1( top[1], top[2] );
1459+ pr1.s3 = pr3.s4 = F2( top[1], top[2], top[3] );
1460+ pr0.s3 = pr2.s4 = F1( top[2], top[3] );
1461+ pr1.s4 = pr3.s5 = F2( top[2], top[3], top[4] );
1462+ pr0.s4 = pr2.s5 = F1( top[3], top[4] );
1463+ pr1.s5 = pr3.s6 = F2( top[3], top[4], top[5] );
1464+ pr0.s5 = pr2.s6 = F1( top[4], top[5] );
1465+ pr1.s6 = pr3.s7 = F2( top[4], top[5], top[6] );
1466+ pr0.s6 = pr2.s7 = F1( top[5], top[6] );
1467+ pr1.s7 = F2( top[5], top[6], top[7] );
1468+ pr0.s7 = F1( top[6], top[7] );
1469+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
1470+
1471+ // Lower half of pred[]
1472+ pr2.s0 = F2( left[5], left[4], left[3] );
1473+ pr3.s0 = F2( left[6], left[5], left[4] );
1474+ pr0.s0 = pr2.s1 = F2( left[3], left[2], left[1] );
1475+ pr1.s0 = pr3.s1 = F2( left[4], left[3], left[2] );
1476+ pr0.s1 = pr2.s2 = F2( left[1], left[0], left_top );
1477+ pr1.s1 = pr3.s2 = F2( left[2], left[1], left[0] );
1478+ pr1.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
1479+ pr0.s2 = pr2.s3 = F1( left_top, top[0] );
1480+ pr1.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
1481+ pr0.s3 = pr2.s4 = F1( top[0], top[1] );
1482+ pr1.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
1483+ pr0.s4 = pr2.s5 = F1( top[1], top[2] );
1484+ pr1.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
1485+ pr0.s5 = pr2.s6 = F1( top[2], top[3] );
1486+ pr1.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
1487+ pr0.s6 = pr2.s7 = F1( top[3], top[4] );
1488+ pr1.s7 = F2( top[3], top[4], top[5] );
1489+ pr0.s7 = F1( top[4], top[5] );
1490+ return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
+/* NOTE(review): no matching '#define PRED' exists in this function — this
+ * #undef is a leftover and is harmless, but could be removed. */
1491+#undef PRED
1492+}
1493+
+/* SATD cost of the 8x8 HD (horizontal-down) intra mode, two 8x4 halves. */
1494+int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
1495+{
1496+ int8 pr0, pr1, pr2, pr3;
1497+
1498+ // Upper half of pred[]
1499+ pr0.s0 = F1( left_top, left[0] ); pr0.s1 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
1500+ pr0.s2 = F2( top[1], top[0], left_top ); pr0.s3 = F2( top[2], top[1], top[0] );
1501+ pr0.s4 = F2( top[3], top[2], top[1] ); pr0.s5 = F2( top[4], top[3], top[2] );
1502+ pr0.s6 = F2( top[5], top[4], top[3] ); pr0.s7 = F2( top[6], top[5], top[4] );
1503+
1504+ pr1.s0 = F1( left[0], left[1] ); pr1.s1 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
1505+ pr1.s2 = F1( left_top, left[0] ); pr1.s3 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
1506+ pr1.s4 = F2( top[1], top[0], left_top ); pr1.s5 = F2( top[2], top[1], top[0] );
1507+ pr1.s6 = F2( top[3], top[2], top[1] ); pr1.s7 = F2( top[4], top[3], top[2] );
1508+
1509+ pr2.s0 = F1( left[1], left[2] ); pr2.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
1510+ pr2.s2 = F1( left[0], left[1] ); pr2.s3 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
1511+ pr2.s4 = F1( left_top, left[0] ); pr2.s5 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
1512+ pr2.s6 = F2( top[1], top[0], left_top ); pr2.s7 = F2( top[2], top[1], top[0] );
1513+
1514+ pr3.s0 = F1( left[2], left[3] ); pr3.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
1515+ pr3.s2 = F1( left[1], left[2] ); pr3.s3 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
1516+ pr3.s4 = F1( left[0], left[1] ); pr3.s5 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
1517+ pr3.s6 = F1( left_top, left[0] ); pr3.s7 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
1518+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
1519+
1520+ // Lower half of pred[]
1521+ pr0.s0 = F1( left[3], left[4] ); pr0.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
1522+ pr0.s2 = F1( left[2], left[3] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
1523+ pr0.s4 = F1( left[1], left[2] ); pr0.s5 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
1524+ pr0.s6 = F1( left[0], left[1] ); pr0.s7 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
1525+
1526+ pr1.s0 = F1( left[4], left[5] ); pr1.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
1527+ pr1.s2 = F1( left[3], left[4] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
1528+ pr1.s4 = F1( left[2], left[3] ); pr1.s5 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
1529+ pr1.s6 = F1( left[1], left[2] ); pr1.s7 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
1530+
1531+ pr2.s0 = F1( left[5], left[6] ); pr2.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
1532+ pr2.s2 = F1( left[4], left[5] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
1533+ pr2.s4 = F1( left[3], left[4] ); pr2.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
1534+ pr2.s6 = F1( left[2], left[3] ); pr2.s7 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
1535+
1536+ pr3.s0 = F1( left[6], left[7] ); pr3.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
1537+ pr3.s2 = F1( left[5], left[6] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
1538+ pr3.s4 = F1( left[4], left[5] ); pr3.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
1539+ pr3.s6 = F1( left[3], left[4] ); pr3.s7 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
1540+ return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
1541+}
1542+
+/* SATD cost of the 8x8 VL (vertical-left) intra mode, built entirely from
+ * 'top' neighbors (F1 on even rows, F2 on odd rows), two 8x4 halves. */
1543+int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
1544+{
1545+ int8 pr0, pr1, pr2, pr3;
1546+
1547+ // Upper half of pred[]
1548+ pr0.s0 = F1( top[0], top[1] );
1549+ pr1.s0 = F2( top[0], top[1], top[2] );
1550+ pr2.s0 = pr0.s1 = F1( top[1], top[2] );
1551+ pr3.s0 = pr1.s1 = F2( top[1], top[2], top[3] );
1552+ pr2.s1 = pr0.s2 = F1( top[2], top[3] );
1553+ pr3.s1 = pr1.s2 = F2( top[2], top[3], top[4] );
1554+ pr2.s2 = pr0.s3 = F1( top[3], top[4] );
1555+ pr3.s2 = pr1.s3 = F2( top[3], top[4], top[5] );
1556+ pr2.s3 = pr0.s4 = F1( top[4], top[5] );
1557+ pr3.s3 = pr1.s4 = F2( top[4], top[5], top[6] );
1558+ pr2.s4 = pr0.s5 = F1( top[5], top[6] );
1559+ pr3.s4 = pr1.s5 = F2( top[5], top[6], top[7] );
1560+ pr2.s5 = pr0.s6 = F1( top[6], top[7] );
1561+ pr3.s5 = pr1.s6 = F2( top[6], top[7], top[8] );
1562+ pr2.s6 = pr0.s7 = F1( top[7], top[8] );
1563+ pr3.s6 = pr1.s7 = F2( top[7], top[8], top[9] );
1564+ pr2.s7 = F1( top[8], top[9] );
1565+ pr3.s7 = F2( top[8], top[9], top[10] );
1566+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
1567+
1568+ // Lower half of pred[]
1569+ pr0.s0 = F1( top[2], top[3] );
1570+ pr1.s0 = F2( top[2], top[3], top[4] );
1571+ pr2.s0 = pr0.s1 = F1( top[3], top[4] );
1572+ pr3.s0 = pr1.s1 = F2( top[3], top[4], top[5] );
1573+ pr2.s1 = pr0.s2 = F1( top[4], top[5] );
1574+ pr3.s1 = pr1.s2 = F2( top[4], top[5], top[6] );
1575+ pr2.s2 = pr0.s3 = F1( top[5], top[6] );
1576+ pr3.s2 = pr1.s3 = F2( top[5], top[6], top[7] );
1577+ pr2.s3 = pr0.s4 = F1( top[6], top[7] );
1578+ pr3.s3 = pr1.s4 = F2( top[6], top[7], top[8] );
1579+ pr2.s4 = pr0.s5 = F1( top[7], top[8] );
1580+ pr3.s4 = pr1.s5 = F2( top[7], top[8], top[9] );
1581+ pr2.s5 = pr0.s6 = F1( top[8], top[9] );
1582+ pr3.s5 = pr1.s6 = F2( top[8], top[9], top[10] );
1583+ pr2.s6 = pr0.s7 = F1( top[9], top[10] );
1584+ pr3.s6 = pr1.s7 = F2( top[9], top[10], top[11] );
1585+ pr2.s7 = F1( top[10], top[11] );
1586+ pr3.s7 = F2( top[10], top[11], top[12] );
1587+ return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
1588+}
1589+
+/* SATD cost of the 8x8 HU (horizontal-up) intra mode; prediction positions
+ * past the last 'left' neighbor saturate to left[7], two 8x4 halves. */
1590+int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
1591+{
1592+ int8 pr0, pr1, pr2, pr3;
1593+
1594+ // Upper half of pred[]
1595+ pr0.s0 = F1( left[0], left[1] ); pr0.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
1596+ pr0.s2 = F1( left[1], left[2] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
1597+ pr0.s4 = F1( left[2], left[3] ); pr0.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
1598+ pr0.s6 = F1( left[3], left[4] ); pr0.s7 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
1599+
1600+ pr1.s0 = F1( left[1], left[2] ); pr1.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
1601+ pr1.s2 = F1( left[2], left[3] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
1602+ pr1.s4 = F1( left[3], left[4] ); pr1.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
1603+ pr1.s6 = F1( left[4], left[5] ); pr1.s7 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
1604+
1605+ pr2.s0 = F1( left[2], left[3] ); pr2.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
1606+ pr2.s2 = F1( left[3], left[4] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
1607+ pr2.s4 = F1( left[4], left[5] ); pr2.s5 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
1608+ pr2.s6 = F1( left[5], left[6] ); pr2.s7 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
1609+
1610+ pr3.s0 = F1( left[3], left[4] ); pr3.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
1611+ pr3.s2 = F1( left[4], left[5] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
1612+ pr3.s4 = F1( left[5], left[6] ); pr3.s5 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
1613+ pr3.s6 = F1( left[6], left[7] ); pr3.s7 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
1614+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
1615+
1616+ // Lower half of pred[]
1617+ pr0.s0 = F1( left[4], left[5] ); pr0.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
1618+ pr0.s2 = F1( left[5], left[6] ); pr0.s3 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
1619+ pr0.s4 = F1( left[6], left[7] ); pr0.s5 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
+ /* From here on the diagonal has run out of neighbors: replicate left[7]. */
1620+ pr0.s6 = left[7]; pr0.s7 = left[7];
1621+
1622+ pr1.s0 = F1( left[5], left[6] ); pr1.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
1623+ pr1.s2 = F1( left[6], left[7] ); pr1.s3 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
1624+ pr1.s4 = left[7]; pr1.s5 = left[7];
1625+ pr1.s6 = left[7]; pr1.s7 = left[7];
1626+
1627+ pr2.s0 = F1( left[6], left[7] ); pr2.s1 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
1628+ pr2.s2 = left[7]; pr2.s3 = left[7];
1629+ pr2.s4 = left[7]; pr2.s5 = left[7];
1630+ pr2.s6 = left[7]; pr2.s7 = left[7];
1631+
1632+ pr3 = (int8)left[7];
1633+
1634+ return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
1635+}
1636+
+/* SATD cost of 8x8 horizontal intra prediction: each predicted row is the
+ * pixel immediately to its left (src[-1]) broadcast across the row. */
1637+int x264_predict_8x8c_h( const local pixel *src, int src_stride )
1638+{
1639+ const local pixel *src_l = src;
1640+ int8 pr0, pr1, pr2, pr3;
1641+
1642+ // Upper half of pred[]
1643+ pr0 = (int8)src[-1]; src += src_stride;
1644+ pr1 = (int8)src[-1]; src += src_stride;
1645+ pr2 = (int8)src[-1]; src += src_stride;
1646+ pr3 = (int8)src[-1]; src += src_stride;
1647+ int satd = satd_8x4_intra_lr( src_l, src_stride, pr0, pr1, pr2, pr3 );
1648+
1649+ //Lower half of pred[]
1650+ pr0 = (int8)src[-1]; src += src_stride;
1651+ pr1 = (int8)src[-1]; src += src_stride;
1652+ pr2 = (int8)src[-1]; src += src_stride;
1653+ pr3 = (int8)src[-1];
1654+ return satd + satd_8x4_intra_lr( src_l + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
1655+}
1656+
+/* SATD cost of 8x8 vertical intra prediction: all eight rows replicate the
+ * row of pixels directly above the block (src[-src_stride]). */
1657+int x264_predict_8x8c_v( const local pixel *src, int src_stride )
1658+{
1659+ int8 pred = convert_int8( vload8( 0, &src[-src_stride] ));
1660+ return satd_8x4_intra_lr( src, src_stride, pred, pred, pred, pred ) +
1661+ satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pred, pred, pred, pred );
1662+}
1663+
1664+int x264_predict_8x8c_p( const local pixel *src, int src_stride )
1665+{
1666+ int H = 0, V = 0;
1667+ for( int i = 0; i < 4; i++ )
1668+ {
1669+ H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
1670+ V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
1671+ }
1672+
1673+ int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
1674+ int b = (17 * H + 16) >> 5;
1675+ int c = (17 * V + 16) >> 5;
1676+ int i00 = a - 3 * b - 3 * c + 16;
1677+
1678+ // Upper half of pred[]
1679+ int pix = i00;
1680+ int8 pr0, pr1, pr2, pr3;
1681+ pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
1682+ pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
1683+ pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
1684+ pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
1685+ pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
1686+ pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
1687+ pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
1688+ pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
1689+
1690+ pix = i00;
1691+ pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
1692+ pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
1693+ pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
1694+ pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
1695+ pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
1696+ pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
1697+ pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
1698+ pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
1699+
1700+ pix = i00;
1701+ pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
1702+ pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
1703+ pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
1704+ pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
1705+ pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
1706+ pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
1707+ pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
1708+ pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
1709+
1710+ pix = i00;
1711+ pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
1712+ pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
1713+ pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
1714+ pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
1715+ pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
1716+ pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
1717+ pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
1718+ pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
1719+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
1720+
1721+ //Lower half of pred[]
1722+ pix = i00;
1723+ pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
1724+ pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
1725+ pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
1726+ pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
1727+ pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
1728+ pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
1729+ pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
1730+ pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
1731+
1732+ pix = i00;
1733+ pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
1734+ pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
1735+ pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
1736+ pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
1737+ pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
1738+ pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
1739+ pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
1740+ pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
1741+
1742+ pix = i00;
1743+ pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
1744+ pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
1745+ pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
1746+ pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
1747+ pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
1748+ pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
1749+ pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
1750+ pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
1751+
1752+ pix = i00;
1753+ pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
1754+ pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
1755+ pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
1756+ pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
1757+ pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
1758+ pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
1759+ pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
1760+ pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
1761+ return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
1762+}
1763+
1764+int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
1765+{
1766+ int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
1767+ for( int i = 0; i < 4; i++ )
1768+ {
1769+ s0 += src[i - src_stride];
1770+ s1 += src[i + 4 - src_stride];
1771+ s2 += src[-1 + i * src_stride];
1772+ s3 += src[-1 + (i+4)*src_stride];
1773+ }
1774+
1775+ // Upper half of pred[]
1776+ int8 dc0;
1777+ dc0.lo = (int4)( (s0 + s2 + 4) >> 3 );
1778+ dc0.hi = (int4)( (s1 + 2) >> 2 );
1779+ int satd = satd_8x4_intra_lr( src, src_stride, dc0, dc0, dc0, dc0 );
1780+
1781+ // Lower half of pred[]
1782+ dc0.lo = (int4)( (s3 + 2) >> 2 );
1783+ dc0.hi = (int4)( (s1 + s3 + 4) >> 3 );
1784+ return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, dc0, dc0, dc0, dc0 );
1785+}
1786+
1787+#else /* not vectorized: private is cheap, registers are scarce */
1788+
1789+int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
1790+{
1791+ private pixel pred[32];
1792+ int x_plus_y;
1793+
1794+ // Upper half of pred[]
1795+ for( int y = 0; y < 4; y++ )
1796+ {
1797+ for( int x = 0; x < 8; x++ )
1798+ {
1799+ x_plus_y = clamp( x + y, 0, 13 );
1800+ pred[x + y*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
1801+ }
1802+ }
1803+ int satd = satd_8x4_lp( src, src_stride, pred, 8 );
1804+ //Lower half of pred[]
1805+ for( int y = 4; y < 8; y++ )
1806+ {
1807+ for( int x = 0; x < 8; x++ )
1808+ {
1809+ x_plus_y = clamp( x + y, 0, 13 );
1810+ pred[x + ( y - 4 )*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
1811+ }
1812+ }
1813+ pred[31] = ( 2 + top[14] + 3*top[15] ) >> 2;
1814+ satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
1815+ return satd;
1816+}
1817+
1818+int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
1819+{
1820+ private pixel pred[32];
1821+#define PRED( x, y ) pred[(x) + (y)*8]
1822+ // Upper half of pred[]
1823+ PRED( 0, 3 ) = F2( left[1], left[2], left[3] );
1824+ PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[0], left[1], left[2] );
1825+ PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[1], left[0], left_top );
1826+ PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
1827+ PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
1828+ PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
1829+ PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
1830+ PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
1831+ PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( top[3], top[4], top[5] );
1832+ PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[4], top[5], top[6] );
1833+ PRED( 7, 0 ) = F2( top[5], top[6], top[7] );
1834+ int satd = satd_8x4_lp( src, src_stride, pred, 8 );
1835+
1836+ // Lower half of pred[]
1837+ PRED( 0, 3 ) = F2( left[5], left[6], left[7] );
1838+ PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[4], left[5], left[6] );
1839+ PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[3], left[4], left[5] );
1840+ PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[2], left[3], left[4] );
1841+ PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left[1], left[2], left[3] );
1842+ PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( left[0], left[1], left[2] );
1843+ PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( left[1], left[0], left_top );
1844+ PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( left[0], left_top, top[0] );
1845+ PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( left_top, top[0], top[1] );
1846+ PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[0], top[1], top[2] );
1847+ PRED( 7, 0 ) = F2( top[1], top[2], top[3] );
1848+ satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
1849+ return satd;
1850+#undef PRED
1851+}
1852+
1853+int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
1854+{
1855+ private pixel pred[32];
1856+#define PRED( x, y ) pred[(x) + (y)*8]
1857+ // Upper half of pred[]
1858+ PRED( 0, 2 ) = F2( left[1], left[0], left_top );
1859+ PRED( 0, 3 ) = F2( left[2], left[1], left[0] );
1860+ PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[0], left_top, top[0] );
1861+ PRED( 0, 0 ) = PRED( 1, 2 ) = F1( left_top, top[0] );
1862+ PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left_top, top[0], top[1] );
1863+ PRED( 1, 0 ) = PRED( 2, 2 ) = F1( top[0], top[1] );
1864+ PRED( 2, 1 ) = PRED( 3, 3 ) = F2( top[0], top[1], top[2] );
1865+ PRED( 2, 0 ) = PRED( 3, 2 ) = F1( top[1], top[2] );
1866+ PRED( 3, 1 ) = PRED( 4, 3 ) = F2( top[1], top[2], top[3] );
1867+ PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[2], top[3] );
1868+ PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[2], top[3], top[4] );
1869+ PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[3], top[4] );
1870+ PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[3], top[4], top[5] );
1871+ PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[4], top[5] );
1872+ PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[4], top[5], top[6] );
1873+ PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[5], top[6] );
1874+ PRED( 7, 1 ) = F2( top[5], top[6], top[7] );
1875+ PRED( 7, 0 ) = F1( top[6], top[7] );
1876+ int satd = satd_8x4_lp( src, src_stride, pred, 8 );
1877+
1878+ //Lower half of pred[]
1879+ PRED( 0, 2 ) = F2( left[5], left[4], left[3] );
1880+ PRED( 0, 3 ) = F2( left[6], left[5], left[4] );
1881+ PRED( 0, 0 ) = PRED( 1, 2 ) = F2( left[3], left[2], left[1] );
1882+ PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[4], left[3], left[2] );
1883+ PRED( 1, 0 ) = PRED( 2, 2 ) = F2( left[1], left[0], left_top );
1884+ PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left[2], left[1], left[0] );
1885+ PRED( 2, 1 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
1886+ PRED( 2, 0 ) = PRED( 3, 2 ) = F1( left_top, top[0] );
1887+ PRED( 3, 1 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
1888+ PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[0], top[1] );
1889+ PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
1890+ PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[1], top[2] );
1891+ PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
1892+ PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[2], top[3] );
1893+ PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
1894+ PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[3], top[4] );
1895+ PRED( 7, 1 ) = F2( top[3], top[4], top[5] );
1896+ PRED( 7, 0 ) = F1( top[4], top[5] );
1897+ satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
1898+ return satd;
1899+#undef PRED
1900+}
1901+
1902+inline uint32_t pack16to32( uint32_t a, uint32_t b )
1903+{
1904+ return a + (b << 16);
1905+}
1906+
1907+inline uint32_t pack8to16( uint32_t a, uint32_t b )
1908+{
1909+ return a + (b << 8);
1910+}
1911+
1912+int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
1913+{
1914+ private pixel pred[32];
1915+ int satd;
1916+ int p1 = pack8to16( (F1( left[6], left[7] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
1917+ int p2 = pack8to16( (F1( left[5], left[6] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
1918+ int p3 = pack8to16( (F1( left[4], left[5] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
1919+ int p4 = pack8to16( (F1( left[3], left[4] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
1920+ int p5 = pack8to16( (F1( left[2], left[3] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
1921+ int p6 = pack8to16( (F1( left[1], left[2] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
1922+ int p7 = pack8to16( (F1( left[0], left[1] )), ((left_top + 2 * left[0] + left[1] + 2) >> 2) );
1923+ int p8 = pack8to16( (F1( left_top, left[0] )), ((left[0] + 2 * left_top + top[0] + 2) >> 2) );
1924+ int p9 = pack8to16( (F2( top[1], top[0], left_top )), (F2( top[2], top[1], top[0] )) );
1925+ int p10 = pack8to16( (F2( top[3], top[2], top[1] )), (F2( top[4], top[3], top[2] )) );
1926+ int p11 = pack8to16( (F2( top[5], top[4], top[3] )), (F2( top[6], top[5], top[4] )) );
1927+ // Upper half of pred[]
1928+ vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[0 + 0 * 8] );
1929+ vstore4( as_uchar4( pack16to32( p10, p11 ) ), 0, &pred[4 + 0 * 8] );
1930+ vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[0 + 1 * 8] );
1931+ vstore4( as_uchar4( pack16to32( p9, p10 ) ), 0, &pred[4 + 1 * 8] );
1932+ vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[0 + 2 * 8] );
1933+ vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[4 + 2 * 8] );
1934+ vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[0 + 3 * 8] );
1935+ vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[4 + 3 * 8] );
1936+ satd = satd_8x4_lp( src, src_stride, pred, 8 );
1937+ // Lower half of pred[]
1938+ vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[0 + 0 * 8] );
1939+ vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[4 + 0 * 8] );
1940+ vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[0 + 1 * 8] );
1941+ vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[4 + 1 * 8] );
1942+ vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[0 + 2 * 8] );
1943+ vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[4 + 2 * 8] );
1944+ vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[0 + 3 * 8] );
1945+ vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[4 + 3 * 8] );
1946+ satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
1947+ return satd;
1948+}
1949+
1950+int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
1951+{
1952+ private pixel pred[32];
1953+ int satd;
1954+#define PRED( x, y ) pred[(x) + (y)*8]
1955+ // Upper half of pred[]
1956+ PRED( 0, 0 ) = F1( top[0], top[1] );
1957+ PRED( 0, 1 ) = F2( top[0], top[1], top[2] );
1958+ PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[1], top[2] );
1959+ PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[1], top[2], top[3] );
1960+ PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[2], top[3] );
1961+ PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[2], top[3], top[4] );
1962+ PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[3], top[4] );
1963+ PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[3], top[4], top[5] );
1964+ PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[4], top[5] );
1965+ PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[4], top[5], top[6] );
1966+ PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[5], top[6] );
1967+ PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[5], top[6], top[7] );
1968+ PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[6], top[7] );
1969+ PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[6], top[7], top[8] );
1970+ PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[7], top[8] );
1971+ PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[7], top[8], top[9] );
1972+ PRED( 7, 2 ) = F1( top[8], top[9] );
1973+ PRED( 7, 3 ) = F2( top[8], top[9], top[10] );
1974+ satd = satd_8x4_lp( src, src_stride, pred, 8 );
1975+ // Lower half of pred[]
1976+ PRED( 0, 0 ) = F1( top[2], top[3] );
1977+ PRED( 0, 1 ) = F2( top[2], top[3], top[4] );
1978+ PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[3], top[4] );
1979+ PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[3], top[4], top[5] );
1980+ PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[4], top[5] );
1981+ PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[4], top[5], top[6] );
1982+ PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[5], top[6] );
1983+ PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[5], top[6], top[7] );
1984+ PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[6], top[7] );
1985+ PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[6], top[7], top[8] );
1986+ PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[7], top[8] );
1987+ PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[7], top[8], top[9] );
1988+ PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[8], top[9] );
1989+ PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[8], top[9], top[10] );
1990+ PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[9], top[10] );
1991+ PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[9], top[10], top[11] );
1992+ PRED( 7, 2 ) = F1( top[10], top[11] );
1993+ PRED( 7, 3 ) = F2( top[10], top[11], top[12] );
1994+ satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
1995+ return satd;
1996+#undef PRED
1997+}
1998+
1999+int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
2000+{
2001+ private pixel pred[32];
2002+ int satd;
2003+ int p1 = pack8to16( (F1( left[0], left[1] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
2004+ int p2 = pack8to16( (F1( left[1], left[2] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
2005+ int p3 = pack8to16( (F1( left[2], left[3] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
2006+ int p4 = pack8to16( (F1( left[3], left[4] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
2007+ int p5 = pack8to16( (F1( left[4], left[5] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
2008+ int p6 = pack8to16( (F1( left[5], left[6] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
2009+ int p7 = pack8to16( (F1( left[6], left[7] )), ((left[6] + 2 * left[7] + left[7] + 2) >> 2) );
2010+ int p8 = pack8to16( left[7], left[7] );
2011+ // Upper half of pred[]
2012+ vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
2013+ vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
2014+ vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
2015+ vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
2016+ vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
2017+ vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
2018+ vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
2019+ vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
2020+ satd = satd_8x4_lp( src, src_stride, pred, 8 );
2021+ // Lower half of pred[]
2022+ vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
2023+ vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
2024+ vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
2025+ vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
2026+ vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
2027+ vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
2028+ vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
2029+ vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
2030+ satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
2031+ return satd;
2032+}
2033+
2034+int x264_predict_8x8c_h( const local pixel *src, int src_stride )
2035+{
2036+ private pixel pred[32];
2037+ const local pixel *src_l = src;
2038+
2039+ // Upper half of pred[]
2040+ vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
2041+ vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
2042+ vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
2043+ vstore8( (uchar8)(src[-1]), 3, pred ); src += src_stride;
2044+ int satd = satd_8x4_lp( src_l, src_stride, pred, 8 );
2045+
2046+ // Lower half of pred[]
2047+ vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
2048+ vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
2049+ vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
2050+ vstore8( (uchar8)(src[-1]), 3, pred );
2051+ return satd + satd_8x4_lp( src_l + ( src_stride << 2 ), src_stride, pred, 8 );
2052+}
2053+
2054+int x264_predict_8x8c_v( const local pixel *src, int src_stride )
2055+{
2056+ private pixel pred[32];
2057+ uchar16 v16;
2058+ v16.lo = vload8( 0, &src[-src_stride] );
2059+ v16.hi = vload8( 0, &src[-src_stride] );
2060+
2061+ vstore16( v16, 0, pred );
2062+ vstore16( v16, 1, pred );
2063+
2064+ return satd_8x4_lp( src, src_stride, pred, 8 ) +
2065+ satd_8x4_lp( src + (src_stride << 2), src_stride, pred, 8 );
2066+}
2067+
2068+int x264_predict_8x8c_p( const local pixel *src, int src_stride )
2069+{
2070+ int H = 0, V = 0;
2071+ private pixel pred[32];
2072+ int satd;
2073+
2074+ for( int i = 0; i < 4; i++ )
2075+ {
2076+ H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
2077+ V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
2078+ }
2079+
2080+ int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
2081+ int b = (17 * H + 16) >> 5;
2082+ int c = (17 * V + 16) >> 5;
2083+ int i00 = a - 3 * b - 3 * c + 16;
2084+
2085+ // Upper half of pred[]
2086+ for( int y = 0; y < 4; y++ )
2087+ {
2088+ int pix = i00;
2089+ for( int x = 0; x < 8; x++ )
2090+ {
2091+ pred[x + y*8] = x264_clip_pixel( pix >> 5 );
2092+ pix += b;
2093+ }
2094+ i00 += c;
2095+ }
2096+ satd = satd_8x4_lp( src, src_stride, pred, 8 );
2097+ // Lower half of pred[]
2098+ for( int y = 0; y < 4; y++ )
2099+ {
2100+ int pix = i00;
2101+ for( int x = 0; x < 8; x++ )
2102+ {
2103+ pred[x + y*8] = x264_clip_pixel( pix >> 5 );
2104+ pix += b;
2105+ }
2106+ i00 += c;
2107+ }
2108+ satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
2109+ return satd;
2110+}
2111+
2112+int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
2113+{
2114+ private pixel pred[32];
2115+ int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
2116+ for( int i = 0; i < 4; i++ )
2117+ {
2118+ s0 += src[i - src_stride];
2119+ s1 += src[i + 4 - src_stride];
2120+ s2 += src[-1 + i * src_stride];
2121+ s3 += src[-1 + (i+4)*src_stride];
2122+ }
2123+
2124+ // Upper half of pred[]
2125+ uchar8 dc0;
2126+ dc0.lo = (uchar4)( (s0 + s2 + 4) >> 3 );
2127+ dc0.hi = (uchar4)( (s1 + 2) >> 2 );
2128+ vstore8( dc0, 0, pred );
2129+ vstore8( dc0, 1, pred );
2130+ vstore8( dc0, 2, pred );
2131+ vstore8( dc0, 3, pred );
2132+ int satd = satd_8x4_lp( src, src_stride, pred, 8 );
2133+
2134+ // Lower half of pred[]
2135+ dc0.lo = (uchar4)( (s3 + 2) >> 2 );
2136+ dc0.hi = (uchar4)( (s1 + s3 + 4) >> 3 );
2137+ vstore8( dc0, 0, pred );
2138+ vstore8( dc0, 1, pred );
2139+ vstore8( dc0, 2, pred );
2140+ vstore8( dc0, 3, pred );
2141+ return satd + satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
2142+}
2143+#endif
2144+
2145+/* Find the least cost intra mode for 32 8x8 macroblocks per workgroup
2146+ *
2147+ * Loads 33 macroblocks plus the pixels directly above them into local memory,
2148+ * padding where necessary with edge pixels. It then cooperatively calculates
2149+ * smoothed top and left pixels for use in some of the analysis.
2150+ *
2151+ * Then groups of 32 threads each calculate a single intra mode for each 8x8
2152+ * block. Since consecutive threads are calculating the same intra mode there
2153+ * is no code-path divergence. 8 intra costs are calculated simultaneously. If
2154+ * the "slow" argument is not zero, the final two (least likely) intra modes are
2155+ * tested in a second pass. The slow mode is only enabled for presets slow,
2156+ * slower, and placebo.
2157+ *
2158+ * Local dimension: [ 32 * 8, 8 ]
2159+ * Global dimensions: [ paddedWidth, height ]
2160+ */
2161+
2162+kernel void mb_intra_cost_satd_8x8(
2163+ read_only image2d_t fenc,
2164+ global uint16_t *fenc_intra_cost,
2165+ global int *frame_stats,
2166+ int lambda,
2167+ int mb_width,
2168+ int slow )
2169+{
2170+#define CACHE_STRIDE (265)
2171+#define BLOCK_OFFSET (266)
2172+ local pixel cache[2385];
2173+ local int cost_buf[32];
2174+ local pixel top[32 * 16];
2175+ local pixel left[32 * 8];
2176+ local pixel left_top[32];
2177+
2178+ int lx = get_local_id( 0 );
2179+ int ly = get_local_id( 1 );
2180+ int gx = get_global_id( 0 );
2181+ int gy = get_global_id( 1 );
2182+ int gidx = get_group_id( 0 );
2183+ int gidy = get_group_id( 1 );
2184+ int linear_id = ly * get_local_size( 0 ) + lx;
2185+ int satd = COST_MAX;
2186+ int basex = gidx << 8;
2187+ int basey = (gidy << 3) - 1;
2188+
2189+ for( int y = 0; y < 9; y++ ) // TODO: maybe unroll this loop later
2190+ {
2191+ for( int x = 4*linear_id; x < (33 * 8); x += 1024 )
2192+ {
2193+ uint4 data = read_imageui( fenc, sampler, (int2)(x + basex, y + basey) );
2194+ cache[y * CACHE_STRIDE + 1 + x] = data.s0;
2195+ cache[y * CACHE_STRIDE + 1 + x + 1] = data.s1;
2196+ cache[y * CACHE_STRIDE + 1 + x + 2] = data.s2;
2197+ cache[y * CACHE_STRIDE + 1 + x + 3] = data.s3;
2198+ }
2199+ }
2200+ if( linear_id < 9 )
2201+ cache[linear_id * CACHE_STRIDE] = read_imageui( fenc, sampler, (int2)( basex - 1, linear_id + basey) ).s0;
2202+
2203+ barrier( CLK_LOCAL_MEM_FENCE );
2204+
2205+ // Cooperatively build the top edge for the macroblock using smoothing function
2206+ int j = ly;
2207+ top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp( j - 1, -1, 15 )] +
2208+ 2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp( j, 0, 15 )] +
2209+ cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp( j + 1, 0, 15 )] + 2 ) >> 2;
2210+ j += 8;
2211+ top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp( j - 1, -1, 15 )] +
2212+ 2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp( j, 0, 15 )] +
2213+ cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp( j + 1, 0, 15 )] + 2 ) >> 2;
2214+ // Cooperatively build the left edge for the macroblock using smoothing function
2215+ left[lx*8 + ly] = ( cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*(ly - 1)] +
2216+ 2*cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*ly] +
2217+ cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*clamp((ly + 1), 0, 7 )] + 2 ) >> 2;
2218+ // One left_top per macroblock
2219+ if( 0 == ly )
2220+ {
2221+ left_top[lx] = ( cache[BLOCK_OFFSET + 8*lx - 1] + 2*cache[BLOCK_OFFSET + 8*lx - 1 - CACHE_STRIDE] +
2222+ cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE] + 2 ) >> 2;
2223+ cost_buf[lx] = COST_MAX;
2224+ }
2225+ barrier( CLK_LOCAL_MEM_FENCE );
2226+
2227+ // each warp/wavefront generates a different prediction type; no divergence
2228+ switch( ly )
2229+ {
2230+ case 0:
2231+ satd = x264_predict_8x8c_h( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
2232+ break;
2233+ case 1:
2234+ satd = x264_predict_8x8c_v( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
2235+ break;
2236+ case 2:
2237+ satd = x264_predict_8x8c_dc( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
2238+ break;
2239+ case 3:
2240+ satd = x264_predict_8x8c_p( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
2241+ break;
2242+ case 4:
2243+ satd = x264_predict_8x8_ddr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
2244+ break;
2245+ case 5:
2246+ satd = x264_predict_8x8_vr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
2247+ break;
2248+ case 6:
2249+ satd = x264_predict_8x8_hd( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
2250+ break;
2251+ case 7:
2252+ satd = x264_predict_8x8_hu( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &left[8*lx] );
2253+ break;
2254+ default:
2255+ break;
2256+ }
2257+ atom_min( &cost_buf[lx], satd );
2258+ if( slow )
2259+ {
2260+ // Do the remaining two (least likely) prediction modes
2261+ switch( ly )
2262+ {
2263+ case 0: // DDL
2264+ satd = x264_predict_8x8_ddl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
2265+ atom_min( &cost_buf[lx], satd );
2266+ break;
2267+ case 1: // VL
2268+ satd = x264_predict_8x8_vl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
2269+ atom_min( &cost_buf[lx], satd );
2270+ break;
2271+ default:
2272+ break;
2273+ }
2274+ }
2275+ barrier( CLK_LOCAL_MEM_FENCE );
2276+
2277+ if( (0 == ly) && (gx < mb_width) )
2278+ {
2279+ fenc_intra_cost[gidy * mb_width + gx] = cost_buf[lx]+ 5*lambda;
2280+ }
2281+
2282+ // initialize the frame_stats[2] buffer for kernel sum_intra_cost().
2283+ if( gx < 2 && gy == 0 )
2284+ frame_stats[gx] = 0;
2285+#undef CACHE_STRIDE
2286+#undef BLOCK_OFFSET
2287+}
2288+
2289+
2290+
2291+/*
2292+ * parallel sum intra costs
2293+ *
2294+ * global launch dimensions: [256, mb_height]
2295+ */
2296+kernel void sum_intra_cost(
2297+ const global uint16_t *fenc_intra_cost,
2298+ const global uint16_t *inv_qscale_factor,
2299+ global int *fenc_row_satds,
2300+ global int *frame_stats,
2301+ int mb_width )
2302+{
2303+ int y = get_global_id( 1 );
2304+ int mb_height = get_global_size( 1 );
2305+
2306+ int row_satds = 0;
2307+ int cost_est = 0;
2308+ int cost_est_aq = 0;
2309+
2310+ for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
2311+ {
2312+ int mb_xy = x + y * mb_width;
2313+ int cost = fenc_intra_cost[mb_xy];
2314+ int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
2315+ int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
2316+
2317+ row_satds += cost_aq;
2318+ if( b_frame_score_mb )
2319+ {
2320+ cost_est += cost;
2321+ cost_est_aq += cost_aq;
2322+ }
2323+ }
2324+
2325+ local int buffer[256];
2326+ int x = get_global_id( 0 );
2327+
2328+ row_satds = parallel_sum( row_satds, x, buffer );
2329+ cost_est = parallel_sum( cost_est, x, buffer );
2330+ cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
2331+
2332+ if( get_global_id( 0 ) == 0 )
2333+ {
2334+ fenc_row_satds[y] = row_satds;
2335+ atomic_add( frame_stats + COST_EST, cost_est );
2336+ atomic_add( frame_stats + COST_EST_AQ, cost_est_aq );
2337+ }
2338+}
2339diff -r bdffc2c1e85b -r e5e4a79d3f21 common/opencl/motionsearch.cl
2340--- /dev/null Thu Jan 01 00:00:00 1970 +0000
2341+++ b/common/opencl/motionsearch.cl Mon Aug 20 22:33:23 2012 -0500
2342@@ -0,0 +1,275 @@
2343+/* Hierarchical (iterative) OpenCL lowres motion search */
2344+
2345+inline int find_downscale_mb_xy( int x, int y, int mb_width, int mb_height )
2346+{
2347+ /* edge macroblocks might not have a direct descendant, use nearest */
2348+ x = (x == mb_width-1) ? (x - (mb_width&1)) >> 1 : x >> 1;
2349+ y = (y == mb_height-1) ? (y - (mb_height&1)) >> 1 : y >> 1;
2350+ return (mb_width>>1) * y + x;
2351+}
2352+
2353+/* Four threads calculate an 8x8 SAD. Each does two rows */
2354+int sad_8x8_ii_coop4( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos, int idx, local int16_t *costs )
2355+{
2356+ frefpos.y += idx << 1;
2357+ fencpos.y += idx << 1;
2358+ int cost = 0;
2359+ if( frefpos.x < 0 )
2360+ {
2361+ /* slow path when MV goes past left edge */
2362+ for( int y = 0; y < 2; y++ )
2363+ {
2364+ for( int x = 0; x < 8; x++ )
2365+ {
2366+ pixel enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
2367+ pixel ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
2368+ cost += abs_diff( enc, ref );
2369+ }
2370+ }
2371+ }
2372+ else
2373+ {
2374+ uint4 enc, ref, costs = 0;
2375+ enc = read_imageui( fenc, sampler, fencpos );
2376+ ref = read_imageui( fref, sampler, frefpos );
2377+ costs += abs_diff( enc, ref );
2378+ enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 0) );
2379+ ref = read_imageui( fref, sampler, frefpos + (int2)(4, 0) );
2380+ costs += abs_diff( enc, ref );
2381+ enc = read_imageui( fenc, sampler, fencpos + (int2)(0, 1) );
2382+ ref = read_imageui( fref, sampler, frefpos + (int2)(0, 1) );
2383+ costs += abs_diff( enc, ref );
2384+ enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 1) );
2385+ ref = read_imageui( fref, sampler, frefpos + (int2)(4, 1) );
2386+ costs += abs_diff( enc, ref );
2387+ cost = costs.s0 + costs.s1 + costs.s2 + costs.s3;
2388+ }
2389+ costs[idx] = cost;
2390+ return costs[0] + costs[1] + costs[2] + costs[3];
2391+}
2392+
2393+/* One thread performs 8x8 SAD */
2394+int sad_8x8_ii( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos )
2395+{
2396+ if( frefpos.x < 0 )
2397+ {
2398+ /* slow path when MV goes past left edge */
2399+ int cost = 0;
2400+ for( int y = 0; y < 8; y++ )
2401+ {
2402+ for( int x = 0; x < 8; x++ )
2403+ {
2404+ uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
2405+ uint ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
2406+ cost += abs_diff( enc, ref );
2407+ }
2408+ }
2409+ return cost;
2410+ }
2411+ else
2412+ {
2413+ uint4 enc, ref, cost = 0;
2414+ for( int y = 0; y < 8; y++ )
2415+ {
2416+ for( int x = 0; x < 8; x += 4 )
2417+ {
2418+ enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) );
2419+ ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) );
2420+ cost += abs_diff( enc, ref );
2421+ }
2422+ }
2423+ return cost.s0 + cost.s1 + cost.s2 + cost.s3;
2424+ }
2425+}
2426+
2427+
2428+/*
2429+ * hierarchical motion estimation
2430+ *
2431+ * Each kernel launch is a single iteration
2432+ *
2433+ * MB per work group is determined by lclx / 4 * lcly
2434+ *
2435+ * global launch dimensions: [mb_width * 4, mb_height]
2436+ */
2437+kernel void hierarchical_motion(
2438+ read_only image2d_t fenc,
2439+ read_only image2d_t fref,
2440+ const global short2 *in_mvs,
2441+ global short2 *out_mvs,
2442+ global int16_t *out_mv_costs,
2443+ global short2 *mvp_buffer,
2444+ local int16_t *cost_local,
2445+ local short2 *mvc_local,
2446+ int mb_width,
2447+ int lambda,
2448+ int me_range,
2449+ int scale,
2450+ int b_shift_index,
2451+ int b_first_iteration,
2452+ int b_reverse_references )
2453+{
2454+ int mb_x = get_global_id( 0 ) >> 2;
2455+ if( mb_x >= mb_width )
2456+ return;
2457+ int mb_height = get_global_size( 1 );
2458+ int mb_i = get_global_id( 0 ) & 3;
2459+ int mb_y = get_global_id( 1 );
2460+ int mb_xy = mb_y * mb_width + mb_x;
2461+ const int mb_size = 8;
2462+ int2 coord;
2463+ coord.x = mb_x * mb_size;
2464+ coord.y = mb_y * mb_size;
2465+
2466+ const int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
2467+ cost_local += 4 * mb_in_group;
2468+
2469+ int i_mvc = 0;
2470+ mvc_local += 4 * mb_in_group;
2471+ mvc_local[mb_i] = 0;
2472+ short2 mvp;
2473+
2474+ if( b_first_iteration )
2475+ {
2476+ mvp.x = 0;
2477+ mvp.y = 0;
2478+ }
2479+ else
2480+ {
2481+#define MVC( DX, DY )\
2482+ {\
2483+ int px = mb_x + DX;\
2484+ int py = mb_y + DY;\
2485+ if( b_shift_index )\
2486+ mvc_local[i_mvc] = in_mvs[find_downscale_mb_xy( px, py, mb_width, mb_height )];\
2487+ else\
2488+ mvc_local[i_mvc] = in_mvs[mb_width * py + px];\
2489+ mvc_local[i_mvc].x >>= scale;\
2490+ mvc_local[i_mvc].y >>= scale;\
2491+ i_mvc++;\
2492+ }
2493+ /* Find MVP from median of MVCs */
2494+ if( b_reverse_references )
2495+ {
2496+ /* odd iterations: derive MVP from down and right */
2497+ if( mb_x < mb_width - 1 )
2498+ MVC( 1, 0 );
2499+ if( mb_y < mb_height - 1 )
2500+ {
2501+ MVC( 0, 1 );
2502+ if( mb_x > b_shift_index )
2503+ MVC( -1, 1 );
2504+ if( mb_x < mb_width - 1 )
2505+ MVC( 1, 1 );
2506+ }
2507+ }
2508+ else
2509+ {
2510+ /* even iterations: derive MVP from up and left */
2511+ if( mb_x > 0 )
2512+ MVC( -1, 0 );
2513+ if( mb_y > 0 )
2514+ {
2515+ MVC( 0, -1 );
2516+ if( mb_x < mb_width - 1 )
2517+ MVC( 1, -1 );
2518+ if( mb_x > b_shift_index )
2519+ MVC( -1, -1 );
2520+ }
2521+ }
2522+ if( i_mvc <= 1 )
2523+ {
2524+ mvp = mvc_local[0];
2525+ }
2526+ else
2527+ mvp = x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
2528+#undef MVC
2529+ }
2530+ //new mvp == old mvp, copy the input mv to the output mv and exit.
2531+ if( (!b_shift_index) && (mvp.x == mvp_buffer[mb_xy].x) && (mvp.y == mvp_buffer[mb_xy].y) )
2532+ {
2533+ out_mvs[mb_xy] = in_mvs[mb_xy];
2534+ return;
2535+ }
2536+ mvp_buffer[mb_xy] = mvp;
2537+ short2 mv_min;
2538+ short2 mv_max;
2539+ mv_min.x = -mb_size * mb_x - 4;
2540+ mv_max.x = mb_size * (mb_width - mb_x - 1) + 4;
2541+ mv_min.y = -mb_size * mb_y - 4;
2542+ mv_max.y = mb_size * (mb_height - mb_y - 1) + 4;
2543+
2544+ short2 bestmv;
2545+ bestmv.x = x264_clip3( mvp.x, mv_min.x, mv_max.x );
2546+ bestmv.y = x264_clip3( mvp.y, mv_min.y, mv_max.y );
2547+
2548+ int2 refcrd;
2549+ refcrd.x = coord.x + bestmv.x;
2550+ refcrd.y = coord.y + bestmv.y;
2551+ /* measure cost at bestmv */
2552+ int bcost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +
2553+ lambda * calc_mv_cost( abs_diff( bestmv.x, mvp.x ) << (2 + scale), abs_diff( bestmv.y, mvp.y ) << (2 + scale) );
2554+
2555+ do
2556+ {
2557+ /* measure costs at offsets from bestmv */
2558+ refcrd.x = coord.x + bestmv.x + dia_offs[mb_i].x;
2559+ refcrd.y = coord.y + bestmv.y + dia_offs[mb_i].y;
2560+ short2 trymv = bestmv + dia_offs[mb_i];
2561+ int cost = sad_8x8_ii( fenc, coord, fref, refcrd ) +
2562+ lambda * calc_mv_cost( abs_diff( trymv.x, mvp.x ) << (2 + scale), abs_diff( trymv.y, mvp.y ) << (2 + scale) );
2563+
2564+ cost_local[mb_i] = (cost<<2) | mb_i;
2565+ cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );
2566+
2567+ if( (cost >> 2) >= bcost )
2568+ break;
2569+
2570+ bestmv += dia_offs[cost&3];
2571+ bcost = cost>>2;
2572+
2573+ if( bestmv.x >= mv_max.x || bestmv.x <= mv_min.x || bestmv.y >= mv_max.y || bestmv.y <= mv_min.y )
2574+ break;
2575+ }
2576+ while( --me_range > 0 );
2577+
2578+ short2 trymv;
2579+
2580+#define COST_MV_NO_PAD( X, Y, L )\
2581+ trymv.x = x264_clip3( X, mv_min.x, mv_max.x );\
2582+ trymv.y = x264_clip3( Y, mv_min.y, mv_max.y );\
2583+ if( abs_diff( mvp.x, trymv.x ) > 1 || abs_diff( mvp.y, trymv.y ) > 1 ) {\
2584+ int2 refcrd = coord; refcrd.x += trymv.x; refcrd.y += trymv.y;\
2585+ int cost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +\
2586+ L * calc_mv_cost( abs_diff( trymv.x, mvp.x ) << (2 + scale), abs_diff( trymv.y, mvp.y ) << (2 + scale) );\
2587+ if( cost < bcost ) { bcost = cost; bestmv = trymv; } }
2588+
2589+ COST_MV_NO_PAD( 0, 0, 0 );
2590+
2591+ if( !b_first_iteration )
2592+ {
2593+ /* try cost at previous iteration's MV, if MVP was too far away */
2594+ short2 prevmv;
2595+ if( b_shift_index )
2596+ prevmv = in_mvs[find_downscale_mb_xy( mb_x, mb_y, mb_width, mb_height )];
2597+ else
2598+ prevmv = in_mvs[mb_xy];
2599+ prevmv.x >>= scale;
2600+ prevmv.y >>= scale;
2601+ COST_MV_NO_PAD( prevmv.x, prevmv.y, lambda );
2602+ }
2603+
2604+ for( int i = 0; i < i_mvc; i++ )
2605+ {
2606+ /* try cost at each candidate MV, if MVP was too far away */
2607+ COST_MV_NO_PAD( mvc_local[i].x, mvc_local[i].y, lambda );
2608+ }
2609+
2610+ if( mb_i == 0 )
2611+ {
2612+ bestmv.x <<= scale;
2613+ bestmv.y <<= scale;
2614+ out_mvs[mb_xy] = bestmv;
2615+ out_mv_costs[mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK );
2616+ }
2617+}
2618diff -r bdffc2c1e85b -r e5e4a79d3f21 common/opencl/subpel.cl
2619--- /dev/null Thu Jan 01 00:00:00 1970 +0000
2620+++ b/common/opencl/subpel.cl Mon Aug 20 22:33:23 2012 -0500
2621@@ -0,0 +1,288 @@
2622+/* OpenCL lowres subpel Refine */
2623+
2624+/* One thread performs 8x8 SAD */
2625+int sad_8x8_ii_hpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
2626+{
2627+ int2 frefpos = (int2)(qpos.x >> 2, qpos.y >> 2);
2628+ int hpel_idx = ((qpos.x & 2) >> 1) + (qpos.y & 2);
2629+ uint mask_shift = 8 * hpel_idx;
2630+
2631+ uint4 cost4 = 0;
2632+
2633+ for( int y = 0; y < 8; y++ )
2634+ {
2635+ uint4 enc, val4;
2636+ enc = read_imageui( fenc, sampler, fencpos + (int2)(0, y));
2637+ val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(0, y)).s0 >> mask_shift) & 0xFF;
2638+ val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(1, y)).s0 >> mask_shift) & 0xFF;
2639+ val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(2, y)).s0 >> mask_shift) & 0xFF;
2640+ val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(3, y)).s0 >> mask_shift) & 0xFF;
2641+ cost4 += abs_diff( enc, val4 );
2642+
2643+ enc = read_imageui( fenc, sampler, fencpos + (int2)(4, y));
2644+ val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(4, y)).s0 >> mask_shift) & 0xFF;
2645+ val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(5, y)).s0 >> mask_shift) & 0xFF;
2646+ val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(6, y)).s0 >> mask_shift) & 0xFF;
2647+ val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(7, y)).s0 >> mask_shift) & 0xFF;
2648+ cost4 += abs_diff( enc, val4 );
2649+ }
2650+
2651+ return cost4.s0 + cost4.s1 + cost4.s2 + cost4.s3;
2652+}
2653+
2654+/* One thread measures 8x8 SAD cost at a QPEL offset into an HPEL plane */
2655+int sad_8x8_ii_qpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
2656+{
2657+ int2 frefApos = qpos >> 2;
2658+ int hpelA = ((qpos.x & 2) >> 1) + (qpos.y & 2);
2659+
2660+ int2 qposB = qpos + ((qpos & 1) << 1);
2661+ int2 frefBpos = qposB >> 2;
2662+ int hpelB = ((qposB.x & 2) >> 1) + (qposB.y & 2);
2663+
2664+ uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB;
2665+
2666+ int cost = 0;
2667+
2668+ for( int y = 0; y < 8; y++ )
2669+ {
2670+ for( int x = 0; x < 8; x++ )
2671+ {
2672+ uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y)).s0;
2673+ uint vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(x, y)).s0 >> mask_shift0) & 0xFF;
2674+ uint vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(x, y)).s0 >> mask_shift1) & 0xFF;
2675+ uint ref = rhadd( vA, vB );
2676+ cost += abs_diff( enc, ref );
2677+ }
2678+ }
2679+
2680+ return cost;
2681+}
2682+
2683+/* Four threads measure 8x8 SATD cost at a QPEL offset into an HPEL plane
2684+ *
2685+ * Each thread collects 1/4 of the rows of diffs and processes one quarter of
2686+ * the transforms
2687+ */
2688+int satd_8x8_ii_qpel_coop4(
2689+ read_only image2d_t fenc, int2 fencpos,
2690+ read_only image2d_t fref_planes, int2 qpos,
2691+ local sum2_t *tmpp,
2692+ int idx )
2693+{
2694+ volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
2695+ sum2_t b0, b1, b2, b3;
2696+
2697+ // fencpos is full-pel position of original MB
2698+ // qpos is qpel position within reference frame
2699+ int2 frefApos = (int2)(qpos.x>>2, qpos.y>>2);
2700+ int hpelA = ((qpos.x&2)>>1) + (qpos.y&2);
2701+
2702+ int2 qposB = (int2)qpos + (int2)(((qpos.x&1)<<1), ((qpos.y&1)<<1));
2703+ int2 frefBpos = (int2)(qposB.x>>2, qposB.y>>2);
2704+ int hpelB = ((qposB.x&2)>>1) + (qposB.y&2);
2705+
2706+ uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB;
2707+
2708+ uint vA, vB;
2709+ uint a0, a1;
2710+ uint enc;
2711+ sum2_t sum = 0;
2712+
2713+
2714+#define READ_DIFF( OUT, X )\
2715+ enc = read_imageui( fenc, sampler, fencpos + (int2)(X, idx) ).s0;\
2716+ vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(X, idx) ).s0 >> mask_shift0) & 0xFF;\
2717+ vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(X, idx) ).s0 >> mask_shift1) & 0xFF;\
2718+ OUT = enc - rhadd( vA, vB );
2719+
2720+#define READ_DIFF_EX( OUT, a, b )\
2721+ {\
2722+ READ_DIFF( a0, a );\
2723+ READ_DIFF( a1, b );\
2724+ OUT = a0 + (a1<<BITS_PER_SUM);\
2725+ }
2726+#define ROW_8x4_SATD( a, b )\
2727+ {\
2728+ fencpos.y += a;\
2729+ frefApos.y += b;\
2730+ frefBpos.y += b;\
2731+ READ_DIFF_EX( b0, 0, 4 );\
2732+ READ_DIFF_EX( b1, 1, 5 );\
2733+ READ_DIFF_EX( b2, 2, 6 );\
2734+ READ_DIFF_EX( b3, 3, 7 );\
2735+ HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
2736+ HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
2737+ sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );\
2738+ }
2739+ ROW_8x4_SATD( 0, 0 );
2740+ ROW_8x4_SATD( 4, 4 );
2741+
2742+#undef READ_DIFF
2743+#undef READ_DIFF_EX
2744+#undef ROW_8x4_SATD
2745+ return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
2746+}
2747+
2748+constant short2 hpoffs[4] =
2749+{
2750+ {0, -2}, {-2, 0}, {2, 0}, {0, 2}
2751+};
2752+
2753+/*
2754+ * sub pixel refinement of motion vectors, output MVs and costs are moved from
2755+ * temporary buffers into final per-frame buffer
2756+ *
2757+ * global launch dimensions: [mb_width * 4, mb_height]
2758+ *
2759+ * With X being the source 16x16 pixels, F is the lowres pixel used by the
2760+ * motion search. We will now utilize the H V and C pixels (stored in separate
2761+ * planes) to search at half-pel increments.
2762+ *
2763+ * X X X X X X
2764+ * F H F H F
2765+ * X X X X X X
2766+ * V C V C V
2767+ * X X X X X X
2768+ * F H F H F
2769+ * X X X X X X
2770+ *
2771+ * The YX HPEL bits of the motion vector selects the plane we search in. The
2772+ * four planes are packed in the fref_planes 2D image buffer. Each sample
2773+ * returns: s0 = F, s1 = H, s2 = V, s3 = C
2774+ */
2775+kernel void subpel_refine(
2776+ read_only image2d_t fenc,
2777+ read_only image2d_t fref_planes,
2778+ const global short2 *in_mvs,
2779+ const global int16_t *in_sad_mv_costs,
2780+ local int16_t *cost_local,
2781+ local sum2_t *satd_local,
2782+ local short2 *mvc_local,
2783+ global short2 *fenc_lowres_mv,
2784+ global int16_t *fenc_lowres_mv_costs,
2785+ int mb_width,
2786+ int lambda,
2787+ int b,
2788+ int ref,
2789+ int b_islist1 )
2790+{
2791+ int mb_x = get_global_id( 0 ) >> 2;
2792+ if( mb_x >= mb_width )
2793+ return;
2794+ int mb_height = get_global_size( 1 );
2795+
2796+ int mb_i = get_global_id( 0 ) & 3;
2797+ int mb_y = get_global_id( 1 );
2798+ int mb_xy = mb_y * mb_width + mb_x;
2799+
2800+ /* fenc_lowres_mv and fenc_lowres_mv_costs are large buffers that
2801+ * hold many frames worth of motion vectors. We must offset into the correct
2802+ * location for this frame's vectors. The kernel will be passed the correct
2803+ * directional buffer for the direction of the search: list1 or list0
2804+ *
2805+ * CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1]
2806+ * GPU equivalent: fenc_lowres_mvs[(b - p0 - 1) * mb_count]
2807+ */
2808+ fenc_lowres_mv += ( b_islist1 ? ( ref - b - 1 ) : ( b - ref - 1 ) ) * mb_width * mb_height;
2809+ fenc_lowres_mv_costs += ( b_islist1 ? ( ref - b - 1 ) : ( b - ref - 1 ) ) * mb_width * mb_height;
2810+
2811+ int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
2812+ cost_local += mb_in_group * 4;
2813+ satd_local += mb_in_group * 16;
2814+ mvc_local += mb_in_group * 4;
2815+
2816+ int i_mvc = 0;
2817+ mvc_local[0] = 0;
2818+ mvc_local[1] = 0;
2819+ mvc_local[2] = 0;
2820+ mvc_local[3] = 0;
2821+ short2 mvp;
2822+#define MVC( DX, DY ) mvc_local[i_mvc++] = in_mvs[mb_width * (mb_y + DY) + (mb_x + DX)];
2823+ if( mb_x > 0 )
2824+ MVC( -1, 0 );
2825+ if( mb_y > 0 )
2826+ {
2827+ MVC( 0, -1 );
2828+ if( mb_x < mb_width - 1 )
2829+ MVC( 1, -1 );
2830+ if( mb_x > 0 )
2831+ MVC( -1, -1 );
2832+ }
2833+ if( i_mvc <= 1 )
2834+ mvp = mvc_local[0];
2835+ else
2836+ mvp = x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
2837+#undef MVC
2838+
2839+ int bcost = in_sad_mv_costs[mb_xy];
2840+ short2 bmv = in_mvs[mb_xy];
2841+
2842+ /* Make mvp a QPEL MV */
2843+ mvp.x <<= 2;
2844+ mvp.y <<= 2;
2845+
2846+ /* Make bmv a QPEL MV */
2847+ bmv.x <<= 2;
2848+ bmv.y <<= 2;
2849+
2850+ int2 coord;
2851+ coord.x = mb_x << 3;
2852+ coord.y = mb_y << 3;
2853+ short2 trymv;
2854+ int2 qpos;
2855+ int cost = 0;
2856+
2857+#define HPEL_QPEL( ARR, FUNC )\
2858+ {\
2859+ trymv = bmv + ARR[mb_i];\
2860+ qpos.x = (coord.x << 2) + trymv.x;\
2861+ qpos.y = (coord.y << 2) + trymv.y;\
2862+ cost = FUNC( fenc, coord, fref_planes, qpos ) + lambda * calc_mv_cost( abs_diff( trymv.x, mvp.x ), abs_diff( trymv.y, mvp.y ) );\
2863+ cost_local[mb_i] = (cost<<2) + mb_i;\
2864+ cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );\
2865+ if( (cost>>2) < bcost )\
2866+ {\
2867+ bmv += ARR[cost&3];\
2868+ bcost = cost>>2;\
2869+ }\
2870+ }
2871+
2872+ HPEL_QPEL( hpoffs, sad_8x8_ii_hpel );
2873+
2874+#define CHEAP_QPEL 1
2875+
2876+#if CHEAP_QPEL
2877+ HPEL_QPEL( dia_offs, sad_8x8_ii_qpel );
2878+#endif
2879+
2880+ /* remeasure with SATD */
2881+ qpos.x = (coord.x << 2) + bmv.x;
2882+ qpos.y = (coord.y << 2) + bmv.y;
2883+ cost_local[mb_i] = satd_8x8_ii_qpel_coop4( fenc, coord, fref_planes, qpos, satd_local, mb_i );
2884+ bcost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];
2885+ bcost += lambda * calc_mv_cost( abs_diff( bmv.x, mvp.x ), abs_diff( bmv.y, mvp.y ) );
2886+ short2 bestmv = 0;
2887+
2888+#if !CHEAP_QPEL
2889+#define QPEL_SATD( ARR, FUNC )\
2890+ {\
2891+ trymv = bmv + ARR;\
2892+ qpos.x = (coord.x << 2) + trymv.x;\
2893+ qpos.y = (coord.y << 2) + trymv.y;\
2894+ cost_local[mb_i] = FUNC( fenc, coord, fref_planes, qpos, satd_local, mb_i );\
2895+ cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\
2896+ cost += lambda * calc_mv_cost( abs_diff( trymv.x, mvp.x ), abs_diff( trymv.y, mvp.y ) );\
2897+ if( cost < bcost )\
2898+ {\
2899+ bestmv = ARR;\
2900+ bcost = cost;\
2901+ }\
2902+ }
2903+ for( int i = 0; i<4; i++ )
2904+ QPEL_SATD( dia_offs[i], satd_8x8_ii_qpel_coop4 );
2905+#endif
2906+
2907+ fenc_lowres_mv[mb_xy] = bmv+bestmv;
2908+ fenc_lowres_mv_costs[mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK );
2909+}
2910diff -r bdffc2c1e85b -r e5e4a79d3f21 common/opencl/weightp.cl
2911--- /dev/null Thu Jan 01 00:00:00 1970 +0000
2912+++ b/common/opencl/weightp.cl Mon Aug 20 22:33:23 2012 -0500
2913@@ -0,0 +1,48 @@
2914+/* Weightp filter a downscaled image into a temporary output buffer.
2915+ * This kernel is launched once for each scale.
2916+ *
2917+ * Launch dimensions: width x height (in pixels)
2918+ */
2919+kernel void weightp_scaled_images( read_only image2d_t in_plane,
2920+ write_only image2d_t out_plane,
2921+ uint offset,
2922+ uint scale,
2923+ uint denom )
2924+{
2925+ int gx = get_global_id( 0 );
2926+ int gy = get_global_id( 1 );
2927+ uint4 input_val;
2928+ uint4 output_val;
2929+
2930+ input_val = read_imageui( in_plane, sampler, (int2)(gx, gy));
2931+ output_val = (uint4)(offset) + ( ( ((uint4)(scale)) * input_val ) >> ((uint4)(denom)) );
2932+ write_imageui( out_plane, (int2)(gx, gy), output_val );
2933+}
2934+
2935+/* Weightp filter for the half-pel interpolated image
2936+ *
2937+ * Launch dimensions: width x height (in pixels)
2938+ */
2939+kernel void weightp_hpel( read_only image2d_t in_plane,
2940+ write_only image2d_t out_plane,
2941+ uint offset,
2942+ uint scale,
2943+ uint denom )
2944+{
2945+ int gx = get_global_id( 0 );
2946+ int gy = get_global_id( 1 );
2947+ uint input_val;
2948+ uint output_val;
2949+
2950+ input_val = read_imageui( in_plane, sampler, (int2)(gx, gy)).s0;
2951+ //Unpack
2952+ uint4 temp;
2953+ temp.s0 = input_val & 0x00ff; temp.s1 = (input_val >> 8) & 0x00ff;
2954+ temp.s2 = (input_val >> 16) & 0x00ff; temp.s3 = (input_val >> 24) & 0x00ff;
2955+
2956+ temp = (uint4)(offset) + ( ( ((uint4)(scale)) * temp ) >> ((uint4)(denom)) );
2957+
2958+ //Pack
2959+ output_val = temp.s0 | (temp.s1 << 8) | (temp.s2 << 16) | (temp.s3 << 24);
2960+ write_imageui( out_plane, (int2)(gx, gy), output_val );
2961+}
2962diff -r bdffc2c1e85b -r e5e4a79d3f21 common/opencl/x264-cl.h
2963--- /dev/null Thu Jan 01 00:00:00 1970 +0000
2964+++ b/common/opencl/x264-cl.h Mon Aug 20 22:33:23 2012 -0500
2965@@ -0,0 +1,161 @@
2966+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
2967+
2968+constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
2969+
2970+/* 7.18.1.1 Exact-width integer types */
2971+typedef signed char int8_t;
2972+typedef unsigned char uint8_t;
2973+typedef short int16_t;
2974+typedef unsigned short uint16_t;
2975+typedef int int32_t;
2976+typedef unsigned uint32_t;
2977+typedef long int64_t;
2978+typedef unsigned long uint64_t;
2979+
2980+typedef union
2981+{
2982+ uint32_t i;
2983+ uint16_t b[2];
2984+ uint8_t c[4];
2985+} x264_union32_t;
2986+
2987+typedef uint8_t pixel;
2988+typedef uint16_t sum_t;
2989+typedef uint32_t sum2_t;
2990+
2991+#define LOWRES_COST_MASK ((1<<14)-1)
2992+#define LOWRES_COST_SHIFT 14
2993+#define COST_MAX (1<<28)
2994+
2995+#define X264_MIN( a, b ) ((a)<(b) ? (a) : (b))
2996+
2997+#define PIXEL_MAX 255
2998+#define BITS_PER_SUM (8 * sizeof(sum_t))
2999+
3000+/* Constants for offsets into frame statistics buffer */
3001+#define COST_EST 0
3002+#define COST_EST_AQ 1
3003+#define INTRA_MBS 2
3004+
3005+#define COPY2_IF_LT( x, y, a, b )\
3006+ if((y)<(x))\
3007+ {\
3008+ (x) = (y);\
3009+ (a) = (b);\
3010+ }
3011+constant short2 dia_offs[4] =
3012+{
3013+ {0, -1}, {-1, 0}, {1, 0}, {0, 1},
3014+};
3015+
3016+inline pixel x264_clip_pixel( int x )
3017+{
3018+ return ((x & ~PIXEL_MAX) ? (-x) >> 31 & PIXEL_MAX : x);
3019+}
3020+
3021+inline int x264_clip3( int v, int i_min, int i_max )
3022+{
3023+ return ((v < i_min) ? i_min : (v > i_max) ? i_max : v);
3024+}
3025+
3026+inline int x264_median( int a, int b, int c )
3027+{
3028+ int t = (a - b) & ((a - b) >> 31);
3029+ a -= t;
3030+ b += t;
3031+ b -= (b - c) & ((b - c) >> 31);
3032+ b += (a - b) & ((a - b) >> 31);
3033+ return b;
3034+}
3035+
3036+inline short2 x264_median_mv( short2 a, short2 b, short2 c )
3037+{
3038+ short2 dst;
3039+ dst.x = x264_median( a.x, b.x, c.x );
3040+ dst.y = x264_median( a.y, b.y, c.y );
3041+ return dst;
3042+}
3043+
3044+inline sum2_t abs2( sum2_t a )
3045+{
3046+ sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);
3047+ return (a + s) ^ s;
3048+}
3049+
3050+#define HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
3051+ sum2_t t0 = s0 + s1;\
3052+ sum2_t t1 = s0 - s1;\
3053+ sum2_t t2 = s2 + s3;\
3054+ sum2_t t3 = s2 - s3;\
3055+ d0 = t0 + t2;\
3056+ d2 = t0 - t2;\
3057+ d1 = t1 + t3;\
3058+ d3 = t1 - t3;\
3059+}
3060+
3061+#define HADAMARD4V( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
3062+ int2 t0 = s0 + s1;\
3063+ int2 t1 = s0 - s1;\
3064+ int2 t2 = s2 + s3;\
3065+ int2 t3 = s2 - s3;\
3066+ d0 = t0 + t2;\
3067+ d2 = t0 - t2;\
3068+ d1 = t1 + t3;\
3069+ d3 = t1 - t3;\
3070+}
3071+
3072+#define SATD_C_8x4_Q( name, q1, q2 )\
3073+ int name( q1 pixel *pix1, int i_pix1, q2 pixel *pix2, int i_pix2 )\
3074+ {\
3075+ sum2_t tmp[4][4];\
3076+ sum2_t a0, a1, a2, a3;\
3077+ sum2_t sum = 0;\
3078+ for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )\
3079+ {\
3080+ a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);\
3081+ a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);\
3082+ a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);\
3083+ a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);\
3084+ HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3 );\
3085+ }\
3086+ for( int i = 0; i < 4; i++ )\
3087+ {\
3088+ HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );\
3089+ sum += abs2( a0 ) + abs2( a1 ) + abs2( a2 ) + abs2( a3 );\
3090+ }\
3091+ return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;\
3092+ }
3093+
3094+/*
3095+ * Utility function to perform a parallel sum reduction of an array of integers
3096+ */
3097+int parallel_sum( int value, int x, volatile local int *array )
3098+{
3099+ array[x] = value;
3100+ barrier( CLK_LOCAL_MEM_FENCE );
3101+
3102+ int dim = get_local_size( 0 );
3103+
3104+ while( dim > 1 )
3105+ {
3106+ dim >>= 1;
3107+
3108+ if( x < dim )
3109+ {
3110+ array[x] += array[x + dim];
3111+ }
3112+
3113+ if( dim > 32 )
3114+ {
3115+ barrier( CLK_LOCAL_MEM_FENCE );
3116+ }
3117+ }
3118+
3119+ return array[0];
3120+}
3121+
3122+int calc_mv_cost( int dx, int dy )
3123+{
3124+ return round( (log2( (float)(dx + 1) ) * 2.0f + 0.718f + !!dx) ) +
3125+ round( (log2( (float)(dy + 1) ) * 2.0f + 0.718f + !!dy) );
3126+}
3127diff -r bdffc2c1e85b -r e5e4a79d3f21 configure
3128--- a/configure Wed Jul 18 08:33:41 2012 -0700
3129+++ b/configure Mon Aug 20 22:33:23 2012 -0500
3130@@ -25,6 +25,7 @@
3131 --system-libx264 use system libx264 instead of internal
3132 --enable-shared build shared library
3133 --enable-static build static library
3134+ --disable-opencl disable OpenCL features
3135 --disable-gpl disable GPL-only features
3136 --disable-thread disable multithreaded encoding
3137 --enable-win32thread use win32threads (windows only)
3138@@ -273,6 +274,7 @@
3139 bit_depth="8"
3140 chroma_format="all"
3141 compiler="GNU"
3142+opencl="yes"
3143
3144 CFLAGS="$CFLAGS -Wall -I. -I\$(SRCPATH)"
3145 LDFLAGS="$LDFLAGS"
3146@@ -381,6 +383,9 @@
3147 --host=*)
3148 host="$optarg"
3149 ;;
3150+ --disable-opencl)
3151+ opencl="no"
3152+ ;;
3153 --cross-prefix=*)
3154 cross_prefix="$optarg"
3155 ;;
3156@@ -1082,6 +1087,37 @@
3157 PROF_USE_LD=$PROF_USE_LD
3158 EOF
3159
3160+if [ "$opencl" = "yes" ]; then
3161+ log_check "looking for xxd"
3162+ if ! $(xxd -v 2>/dev/null); then
3163+ echo 'OpenCL support requires xxd to compile.'
3164+ echo 'use --disable-opencl to compile without OpenCL'
3165+ exit 1
3166+ elif [ "$CUDA_PATH" != "" ]; then
3167+ echo 'HAVE_OPENCL=yes' >> config.mak
3168+ echo 'OPENCL_LIB=OpenCL' >> config.mak
3169+ echo 'OPENCL_INC_DIR=$(CUDA_PATH)include' >> config.mak
3170+ if [ "$ARCH" = "X86" ]; then
3171+ echo 'OPENCL_LIB_DIR=$(CUDA_PATH)lib/Win32' >> config.mak
3172+ else
3173+ echo 'OPENCL_LIB_DIR=$(CUDA_PATH)lib/x64' >> config.mak
3174+ fi
3175+ define HAVE_OPENCL
3176+ elif [ "$AMDAPPSDKROOT" != "" ]; then
3177+ echo 'HAVE_OPENCL=yes' >> config.mak
3178+ echo 'OPENCL_LIB=OpenCL' >> config.mak
3179+ echo 'OPENCL_INC_DIR=$(AMDAPPSDKROOT)/include' >> config.mak
3180+ if [ "$ARCH" = "X86" ]; then
3181+ echo 'OPENCL_LIB_DIR=$(AMDAPPSDKROOT)/lib/x86' >> config.mak
3182+ else
3183+ echo 'OPENCL_LIB_DIR=$(AMDAPPSDKROOT)/lib/x86_64' >> config.mak
3184+ fi
3185+ define HAVE_OPENCL
3186+ else
3187+ opencl="no"
3188+ fi
3189+fi
3190+
3191 if [ $compiler = ICL ]; then
3192 echo '%.o: %.c' >> config.mak
3193 echo ' $(CC) $(CFLAGS) -c -Fo$@ $<' >> config.mak
3194@@ -1192,6 +1228,7 @@
3195 visualize: $vis
3196 bit depth: $bit_depth
3197 chroma format: $chroma_format
3198+opencl: $opencl
3199 EOF
3200
3201 echo >> config.log
3202diff -r bdffc2c1e85b -r e5e4a79d3f21 encoder/encoder.c
3203--- a/encoder/encoder.c Wed Jul 18 08:33:41 2012 -0700
3204+++ b/encoder/encoder.c Mon Aug 20 22:33:23 2012 -0500
3205@@ -37,6 +37,10 @@
3206 #include "common/visualize.h"
3207 #endif
3208
3209+#if HAVE_OPENCL
3210+#include "common/opencl.h"
3211+#endif
3212+
3213 //#define DEBUG_MB_TYPE
3214
3215 #define bs_write_ue bs_write_ue_big
3216@@ -528,6 +532,20 @@
3217 if( h->i_thread_frames > 1 )
3218 h->param.nalu_process = NULL;
3219
3220+#if !HAVE_OPENCL
3221+ if( h->param.b_opencl )
3222+ {
3223+ x264_log( h, X264_LOG_WARNING, "not compiled with OpenCL support\n" );
3224+ h->param.b_opencl = 0;
3225+ }
3226+#elif BIT_DEPTH > 8
3227+ if( h->param.b_opencl )
3228+ {
3229+ x264_log( h, X264_LOG_WARNING, "OpenCL lookahead does not support high bit depth, disabling opencl\n" );
3230+ h->param.b_opencl = 0;
3231+ }
3232+#endif
3233+
3234 h->param.i_keyint_max = x264_clip3( h->param.i_keyint_max, 1, X264_KEYINT_MAX_INFINITE );
3235 if( h->param.i_keyint_max == 1 )
3236 {
3237@@ -1338,6 +1356,11 @@
3238 goto fail;
3239 }
3240
3241+#if HAVE_OPENCL
3242+ if( h->param.b_opencl && x264_opencl_init( h ) < 0 )
3243+ h->param.b_opencl = 0;
3244+#endif
3245+
3246 if( x264_lookahead_init( h, i_slicetype_length ) )
3247 goto fail;
3248
3249@@ -3483,6 +3506,10 @@
3250 || h->stat.i_mb_count[SLICE_TYPE_P][I_PCM]
3251 || h->stat.i_mb_count[SLICE_TYPE_B][I_PCM];
3252
3253+#if HAVE_OPENCL
3254+ x264_opencl_free( h );
3255+#endif
3256+
3257 x264_lookahead_delete( h );
3258
3259 if( h->param.b_sliced_threads )
3260diff -r bdffc2c1e85b -r e5e4a79d3f21 encoder/slicetype-cl.c
3261--- /dev/null Thu Jan 01 00:00:00 1970 +0000
3262+++ b/encoder/slicetype-cl.c Mon Aug 20 22:33:23 2012 -0500
3263@@ -0,0 +1,681 @@
3264+/*****************************************************************************
3265+ * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead)
3266+ *****************************************************************************
3267+ * Copyright (C) 2012 x264 project
3268+ *
3269+ * Authors: Steve Borho <sborho@multicorewareinc.com>
3270+ *
3271+ * This program is free software; you can redistribute it and/or modify
3272+ * it under the terms of the GNU General Public License as published by
3273+ * the Free Software Foundation; either version 2 of the License, or
3274+ * (at your option) any later version.
3275+ *
3276+ * This program is distributed in the hope that it will be useful,
3277+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
3278+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3279+ * GNU General Public License for more details.
3280+ *
3281+ * You should have received a copy of the GNU General Public License
3282+ * along with this program; if not, write to the Free Software
3283+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
3284+ *
3285+ * This program is also available under a commercial proprietary license.
3286+ * For more information, contact us at licensing@x264.com.
3287+ *****************************************************************************/
3288+
3289+#include "common/common.h"
3290+#include "macroblock.h"
3291+#include "me.h"
3292+
3293+#if HAVE_OPENCL
3294+
3295+void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );
3296+
3297+#define OCLCHECK( method, ... )\
3298+ status = method( __VA_ARGS__ ); if( status != CL_SUCCESS ) { h->param.b_opencl = 0;\
3299+ x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status ); return status; }
3300+
3301+void x264_opencl_flush( x264_t *h )
3302+{
3303+ clFinish( h->opencl.queue );
3304+
3305+ /* Finish copies from the GPU by copying from the page-locked buffer to
3306+ * their final destination
3307+ */
3308+ for( int i = 0; i < h->opencl.num_copies; i++ )
3309+ memcpy( h->opencl.copies[i].dest, h->opencl.copies[i].src, h->opencl.copies[i].bytes );
3310+ h->opencl.num_copies = 0;
3311+ h->opencl.pl_occupancy = 0;
3312+}
3313+
3314+static void *x264_opencl_alloc_locked( x264_t *h, int bytes )
3315+{
3316+ if( h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE )
3317+ x264_opencl_flush( h );
3318+ assert( bytes < PAGE_LOCKED_BUF_SIZE );
3319+ char *ptr = h->opencl.page_locked_ptr + h->opencl.pl_occupancy;
3320+ h->opencl.pl_occupancy += bytes;
3321+ return ptr;
3322+}
3323+
3324+int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda )
3325+{
3326+ if( fenc->b_intra_calculated )
3327+ return 0;
3328+ fenc->b_intra_calculated = 1;
3329+
3330+ int luma_length = fenc->i_stride[0] * fenc->i_lines[0];
3331+
3332+#define CREATEBUF( out, flags, size )\
3333+ out = clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\
3334+ if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; }
3335+#define CREATEIMAGE( out, flags, pf, width, height )\
3336+ out = clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\
3337+ if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; }
3338+
3339+ int mb_count = h->mb.i_mb_count;
3340+ cl_int status;
3341+
3342+ if( !h->opencl.lowres_mv_costs )
3343+ {
3344+ /* Allocate shared memory buffers */
3345+ int width = h->mb.i_mb_width * 8 * sizeof(pixel);
3346+ int height = h->mb.i_mb_height * 8 * sizeof(pixel);
3347+
3348+ cl_image_format pixel_format;
3349+ pixel_format.image_channel_order = CL_R;
3350+ pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
3351+ CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );
3352+
3353+ for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
3354+ {
3355+ pixel_format.image_channel_order = CL_RGBA;
3356+ pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
3357+ CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height );
3358+ width >>= 1;
3359+ height >>= 1;
3360+ }
3361+
3362+ CREATEBUF( h->opencl.lowres_mv_costs, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
3363+ CREATEBUF( h->opencl.lowres_costs[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
3364+ CREATEBUF( h->opencl.lowres_costs[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
3365+ CREATEBUF( h->opencl.mv_buffers[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
3366+ CREATEBUF( h->opencl.mv_buffers[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
3367+ CREATEBUF( h->opencl.mvp_buffer, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
3368+ CREATEBUF( h->opencl.frame_stats[0], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
3369+ CREATEBUF( h->opencl.frame_stats[1], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
3370+ CREATEBUF( h->opencl.row_satds[0], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
3371+ CREATEBUF( h->opencl.row_satds[1], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
3372+ CREATEBUF( h->opencl.luma_16x16_image[0], CL_MEM_READ_ONLY, luma_length );
3373+ CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY, luma_length );
3374+ }
3375+
3376+ if( !fenc->opencl.intra_cost )
3377+ {
3378+ /* Allocate per-frame buffers */
3379+ int width = h->mb.i_mb_width * 8 * sizeof(pixel);
3380+ int height = h->mb.i_mb_height * 8 * sizeof(pixel);
3381+
3382+ cl_image_format pixel_format;
3383+ pixel_format.image_channel_order = CL_R;
3384+ pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
3385+ CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );
3386+
3387+ for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
3388+ {
3389+ pixel_format.image_channel_order = CL_RGBA;
3390+ pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
3391+ CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height );
3392+ width >>= 1;
3393+ height >>= 1;
3394+ }
3395+ CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY, mb_count * sizeof(int16_t) );
3396+ CREATEBUF( fenc->opencl.intra_cost, CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) );
3397+ CREATEBUF( fenc->opencl.lowres_mvs0, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
3398+ CREATEBUF( fenc->opencl.lowres_mvs1, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
3399+ CREATEBUF( fenc->opencl.lowres_mv_costs0, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
3400+ CREATEBUF( fenc->opencl.lowres_mv_costs1, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
3401+ }
3402+#undef CREATEBUF
3403+
3404+ /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */
3405+
3406+ char *locked = x264_opencl_alloc_locked( h, luma_length );
3407+ memcpy( locked, fenc->plane[0], luma_length );
3408+ OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL );
3409+
3410+ size_t gdim[2];
3411+ if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor )
3412+ {
3413+ int size = h->mb.i_mb_count * sizeof(int16_t);
3414+ locked = x264_opencl_alloc_locked( h, size );
3415+ memcpy( locked, fenc->i_inv_qscale_factor, size );
3416+ OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL );
3417+ }
3418+ else
3419+ {
3420+ /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */
3421+ cl_uint arg = 0;
3422+ int16_t value = 256;
3423+ OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
3424+ OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value );
3425+ gdim[0] = h->mb.i_mb_count;
3426+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL );
3427+ }
3428+
3429+ int stride = fenc->i_stride[0];
3430+ cl_uint arg = 0;
3431+ OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] );
3432+ OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
3433+ OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel );
3434+ OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride );
3435+ gdim[0] = 8 * h->mb.i_mb_width;
3436+ gdim[1] = 8 * h->mb.i_mb_height;
3437+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL );
3438+
3439+ for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ )
3440+ {
3441+ /* Workaround for AMD Southern Island:
3442+ *
3443+ * Alternate kernel instances. No perf impact to this, so we do it for
3444+ * all GPUs. It prevents the same kernel from being enqueued
3445+ * back-to-back, avoiding a dependency calculation bug in the driver.
3446+ */
3447+ cl_kernel kern = i & 1 ? h->opencl.downscale_kernel1 : h->opencl.downscale_kernel2;
3448+
3449+ arg = 0;
3450+ OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i] );
3451+ OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i+1] );
3452+ gdim[0] >>= 1;
3453+ gdim[1] >>= 1;
3454+ if( gdim[0] < 16 || gdim[1] < 16 )
3455+ break;
3456+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, kern, 2, NULL, gdim, NULL, 0, NULL, NULL );
3457+ }
3458+
3459+ size_t ldim[2];
3460+ gdim[0] = ((h->mb.i_mb_width + 31)>>5)<<5;
3461+ gdim[1] = 8*h->mb.i_mb_height;
3462+ ldim[0] = 32;
3463+ ldim[1] = 8;
3464+ arg = 0;
3465+
3466+ /* For presets slow, slower, and placebo, check all 10 intra modes that the
3467+ * C lookahead supports. For faster presets, only check the most frequent 8
3468+ * modes.
3469+ */
3470+ int slow = h->param.analyse.i_subpel_refine > 7;
3471+ OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
3472+ OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
3473+ OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
3474+ OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &lambda );
3475+ OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
3476+ OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &slow );
3477+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
3478+
3479+ gdim[0] = 256;
3480+ gdim[1] = h->mb.i_mb_height;
3481+ ldim[0] = 256;
3482+ ldim[1] = 1;
3483+ arg = 0;
3484+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
3485+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
3486+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
3487+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
3488+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
3489+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
3490+
3491+ if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
3492+ x264_opencl_flush( h );
3493+
3494+ int size = h->mb.i_mb_count * sizeof(int16_t);
3495+ locked = x264_opencl_alloc_locked( h, size );
3496+ OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.intra_cost, CL_FALSE, 0, size, locked, 0, NULL, NULL );
3497+ h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[0][0];
3498+ h->opencl.copies[h->opencl.num_copies].src = locked;
3499+ h->opencl.copies[h->opencl.num_copies].bytes = size;
3500+ h->opencl.num_copies++;
3501+
3502+ size = h->mb.i_mb_height * sizeof(int);
3503+ locked = x264_opencl_alloc_locked( h, size );
3504+ OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
3505+ h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[0][0];
3506+ h->opencl.copies[h->opencl.num_copies].src = locked;
3507+ h->opencl.copies[h->opencl.num_copies].bytes = size;
3508+ h->opencl.num_copies++;
3509+
3510+ size = sizeof(int) * 4;
3511+ locked = x264_opencl_alloc_locked( h, size );
3512+ OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
3513+ h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[0][0];
3514+ h->opencl.copies[h->opencl.num_copies].src = locked;
3515+ h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
3516+ h->opencl.num_copies++;
3517+ h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[0][0];
3518+ h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
3519+ h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
3520+ h->opencl.num_copies++;
3521+
3522+ h->opencl.last_buf = !h->opencl.last_buf;
3523+ return 0;
3524+}
3525+
3526+/* This function was tested empirically on a number of AMD and NV GPUs. Making a
3527+ * function which returns perfect launch dimensions is impossible; some
3528+ * applications will have self-tuning code to try many possible variables and
3529+ * measure the runtime. Here we simply make an educated guess based on what we
3530+ * know GPUs typically prefer.
3531+ */
3532+static void x264_optimal_launch_dims( size_t *gdims, size_t *ldims, const cl_kernel kernel, const cl_device_id device )
3533+{
3534+ size_t max_work_group = 256; /* reasonable defaults for OpenCL 1.0 devices, the APIs below may fail */
3535+ size_t preferred_multiple = 64;
3536+ cl_uint num_cus = 6;
3537+
3538+ clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group, NULL );
3539+ clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_multiple, NULL );
3540+ clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &num_cus, NULL );
3541+
3542+ ldims[0] = preferred_multiple;
3543+ ldims[1] = 8;
3544+
3545+ /* make ldims[1] an even divisor of gdims[1] */
3546+ while( gdims[1] & (ldims[1] - 1) )
3547+ {
3548+ ldims[0] <<= 1;
3549+ ldims[1] >>= 1;
3550+ }
3551+ /* make total ldims fit under the max work-group dimensions for the device */
3552+ while( ldims[0] * ldims[1] > max_work_group )
3553+ {
3554+ if( (ldims[0] <= preferred_multiple) && (ldims[1] > 1) )
3555+ ldims[1] >>= 1;
3556+ else
3557+ ldims[0] >>= 1;
3558+ }
3559+
3560+ if( ldims[0] > gdims[0] )
3561+ {
3562+ /* remove preferred multiples until we're close to gdims[0] */
3563+ while( gdims[0] + preferred_multiple < ldims[0] )
3564+ ldims[0] -= preferred_multiple;
3565+ gdims[0] = ldims[0];
3566+ }
3567+ else
3568+ {
3569+ /* make gdims an even multiple of ldims */
3570+ gdims[0] = (gdims[0]+ldims[0]-1)/ldims[0];
3571+ gdims[0] *= ldims[0];
3572+ }
3573+
3574+ /* make ldims smaller to spread work across compute units */
3575+ while( (gdims[0]/ldims[0]) * (gdims[1]/ldims[1]) * 2 <= num_cus )
3576+ {
3577+ if( ldims[0] > preferred_multiple )
3578+ ldims[0] >>= 1;
3579+ else if( ldims[1] > 1 )
3580+ ldims[1] >>= 1;
3581+ else
3582+ break;
3583+ }
3584+ /* for smaller GPUs, try not to abuse their texture cache */
3585+ if( num_cus == 6 && ldims[0] == 64 && ldims[1] == 4 )
3586+ ldims[0] = 32;
3587+}
3588+
3589+int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w )
3590+{
3591+ x264_frame_t *fenc = frames[b];
3592+ x264_frame_t *fref = frames[ref];
3593+
3594+ cl_mem ref_scaled_images[NUM_IMAGE_SCALES];
3595+ cl_mem ref_luma_hpel;
3596+ cl_int status;
3597+
3598+ if( w->weightfn )
3599+ {
3600+ size_t gdims[2];
3601+
3602+ gdims[0] = 8 * h->mb.i_mb_width;
3603+ gdims[1] = 8 * h->mb.i_mb_height;
3604+
3605+ /* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */
3606+ for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
3607+ {
3608+ cl_uint arg = 0;
3609+ OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &fref->opencl.scaled_image2Ds[i] );
3610+ OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_scaled_images[i] );
3611+ OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_offset );
3612+ OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_scale );
3613+ OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_denom );
3614+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_scaled_images_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
3615+
3616+ gdims[0] >>= 1;
3617+ gdims[1] >>= 1;
3618+ if( gdims[0] < 16 || gdims[1] < 16 )
3619+ break;
3620+ }
3621+
3622+ cl_uint arg = 0;
3623+ gdims[0] = 8 * h->mb.i_mb_width;
3624+ gdims[1] = 8 * h->mb.i_mb_height;
3625+
3626+ OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &fref->opencl.luma_hpel );
3627+ OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_luma_hpel );
3628+ OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_offset );
3629+ OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_scale );
3630+ OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_denom );
3631+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_hpel_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
3632+
3633+ /* Use weighted reference planes for motion search */
3634+ for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
3635+ ref_scaled_images[i] = h->opencl.weighted_scaled_images[i];
3636+ ref_luma_hpel = h->opencl.weighted_luma_hpel;
3637+ }
3638+ else
3639+ {
3640+ /* Use unweighted reference planes for motion search */
3641+ for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
3642+ ref_scaled_images[i] = fref->opencl.scaled_image2Ds[i];
3643+ ref_luma_hpel = fref->opencl.luma_hpel;
3644+ }
3645+
3646+ const int num_iterations[NUM_IMAGE_SCALES] = { 1, 1, 2, 3 };
3647+ int b_first_iteration = 1;
3648+ int b_reverse_references = 1;
3649+ int A = 1;
3650+
3651+
3652+ int mb_per_group = 0;
3653+ int cost_local_size = 0;
3654+ int mvc_local_size = 0;
3655+ int mb_width;
3656+
3657+ size_t gdims[2];
3658+ size_t ldims[2];
3659+
3660+ /* scale 0 is 8x8 */
3661+ for( int scale = NUM_IMAGE_SCALES-1; scale >= 0; scale-- )
3662+ {
3663+ mb_width = h->mb.i_mb_width >> scale;
3664+ gdims[0] = mb_width;
3665+ gdims[1] = h->mb.i_mb_height >> scale;
3666+ if( gdims[0] < 2 || gdims[1] < 2 )
3667+ continue;
3668+ gdims[0] <<= 2;
3669+ x264_optimal_launch_dims( gdims, ldims, h->opencl.hme_kernel, h->opencl.device );
3670+
3671+ mb_per_group = (ldims[0] >> 2) * ldims[1];
3672+ cost_local_size = 4 * mb_per_group * sizeof(int16_t);
3673+ mvc_local_size = 4 * mb_per_group * sizeof(int16_t) * 2;
3674+ int scaled_me_range = h->param.analyse.i_me_range >> scale;
3675+ int b_shift_index = 1;
3676+
3677+ cl_uint arg = 0;
3678+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[scale] );
3679+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &ref_scaled_images[scale] );
3680+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
3681+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
3682+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
3683+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), (void*)&h->opencl.mvp_buffer );
3684+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, cost_local_size, NULL );
3685+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, mvc_local_size, NULL );
3686+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &mb_width );
3687+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &lambda );
3688+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scaled_me_range );
3689+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scale );
3690+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_shift_index );
3691+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_first_iteration );
3692+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_reverse_references );
3693+
3694+ for( int iter = 0; iter < num_iterations[scale]; iter++ )
3695+ {
3696+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.hme_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
3697+
3698+ b_shift_index = 0;
3699+ b_first_iteration = 0;
3700+
3701+ /* alternate top-left vs bot-right MB references at lower scales, so
3702+ * motion field smooths more quickly.
3703+ */
3704+ if( scale > 2 )
3705+ b_reverse_references ^= 1;
3706+ else
3707+ b_reverse_references = 0;
3708+ A = !A;
3709+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 2, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
3710+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 3, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
3711+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 3, sizeof(int), &b_shift_index );
3712+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 2, sizeof(int), &b_first_iteration );
3713+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 1, sizeof(int), &b_reverse_references );
3714+ }
3715+ }
3716+
3717+ int satd_local_size = mb_per_group * sizeof(uint32_t) * 16;
3718+ cl_uint arg = 0;
3719+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
3720+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &ref_luma_hpel );
3721+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
3722+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
3723+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, cost_local_size, NULL );
3724+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, satd_local_size, NULL );
3725+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, mvc_local_size, NULL );
3726+
3727+ if( b_islist1 )
3728+ {
3729+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
3730+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
3731+ }
3732+ else
3733+ {
3734+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
3735+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
3736+ }
3737+
3738+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &mb_width );
3739+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &lambda );
3740+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b );
3741+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &ref );
3742+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b_islist1 );
3743+
3744+ if( h->opencl.b_device_AMD_SI )
3745+ {
3746+ /* workaround for AMD Southern Island, perform meaningless small copy */
3747+ OCLCHECK( clEnqueueCopyBuffer, h->opencl.queue, h->opencl.mv_buffers[A], h->opencl.mv_buffers[!A], 0, 0, 20, 0, NULL, NULL );
3748+ }
3749+
3750+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.subpel_refine_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
3751+
3752+ int mvlen = 2 * sizeof(int16_t) * h->mb.i_mb_count;
3753+
3754+ if( h->opencl.num_copies >= MAX_FINISH_COPIES - 1 )
3755+ x264_opencl_flush( h );
3756+
3757+ char *locked = x264_opencl_alloc_locked( h, mvlen );
3758+ h->opencl.copies[h->opencl.num_copies].src = locked;
3759+ h->opencl.copies[h->opencl.num_copies].bytes = mvlen;
3760+
3761+ if( b_islist1 )
3762+ {
3763+ int mvs_offset = mvlen * (ref - b - 1);
3764+ OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs1, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
3765+ h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[1][ref - b - 1];
3766+ }
3767+ else
3768+ {
3769+ int mvs_offset = mvlen * (b - ref - 1);
3770+ OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs0, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
3771+ h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[0][b - ref - 1];
3772+ }
3773+
3774+ h->opencl.num_copies++;
3775+
3776+ return 0;
3777+}
3778+
3779+int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor )
3780+{
3781+ cl_int status;
3782+ x264_frame_t *fenc = frames[b];
3783+ x264_frame_t *fref0 = frames[p0];
3784+ x264_frame_t *fref1 = frames[p1];
3785+
3786+ int bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor >> 2) : 32;
3787+
3788+ /* Tasks for this kernel:
3789+ * 1. Select least cost mode (intra, ref0, ref1)
3790+ * list_used 0, 1, 2, or 3. if B frame, do not allow intra
3791+ * 2. if B frame, try bidir predictions.
3792+ * 3. lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
3793+ */
3794+ size_t gdims[2] = { h->mb.i_mb_width, h->mb.i_mb_height };
3795+ size_t ldim_bidir[2];
3796+ size_t *ldims = NULL;
3797+ int cost_local_size = 4;
3798+ int satd_local_size = 4;
3799+ if( b < p1 )
3800+ {
3801+ /* For B frames, use 4 threads per MB for BIDIR checks */
3802+ ldims = ldim_bidir;
3803+ gdims[0] <<= 2;
3804+ x264_optimal_launch_dims( gdims, ldims, h->opencl.mode_select_kernel, h->opencl.device );
3805+ int mb_per_group = (ldims[0] >> 2) * ldims[1];
3806+ cost_local_size = 4 * mb_per_group * sizeof(int16_t);
3807+ satd_local_size = 16 * mb_per_group * sizeof(uint32_t);
3808+ }
3809+
3810+ cl_uint arg = 0;
3811+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
3812+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref0->opencl.luma_hpel );
3813+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.luma_hpel );
3814+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
3815+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
3816+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.lowres_mvs0 );
3817+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
3818+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
3819+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
3820+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
3821+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
3822+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, cost_local_size, NULL );
3823+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, satd_local_size, NULL );
3824+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
3825+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &bipred_weight );
3826+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &dist_scale_factor );
3827+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &b );
3828+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p0 );
3829+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p1 );
3830+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &lambda );
3831+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.mode_select_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
3832+
3833+ /* Sum costs across rows, atomicAdd down frame */
3834+ size_t gdim[2] = { 256, h->mb.i_mb_height };
3835+ size_t ldim[2] = { 256, 1 };
3836+
3837+ arg = 0;
3838+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
3839+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
3840+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
3841+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
3842+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
3843+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->param.i_bframe_bias );
3844+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &b );
3845+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p0 );
3846+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p1 );
3847+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_inter_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
3848+
3849+ if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
3850+ x264_opencl_flush( h );
3851+
3852+ int size = h->mb.i_mb_count * sizeof(int16_t);
3853+ char *locked = x264_opencl_alloc_locked( h, size );
3854+ h->opencl.copies[h->opencl.num_copies].src = locked;
3855+ h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[b - p0][p1 - b];
3856+ h->opencl.copies[h->opencl.num_copies].bytes = size;
3857+ OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.lowres_costs[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
3858+ h->opencl.num_copies++;
3859+
3860+ size = h->mb.i_mb_height * sizeof(int);
3861+ locked = x264_opencl_alloc_locked( h, size );
3862+ h->opencl.copies[h->opencl.num_copies].src = locked;
3863+ h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[b - p0][p1 - b];
3864+ h->opencl.copies[h->opencl.num_copies].bytes = size;
3865+ OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
3866+ h->opencl.num_copies++;
3867+
3868+ size = 4 * sizeof(int);
3869+ locked = x264_opencl_alloc_locked( h, size );
3870+ OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
3871+ h->opencl.last_buf = !h->opencl.last_buf;
3872+
3873+ h->opencl.copies[h->opencl.num_copies].src = locked;
3874+ h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[b - p0][p1 - b];
3875+ h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
3876+ h->opencl.num_copies++;
3877+ h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
3878+ h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[b - p0][p1 - b];
3879+ h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
3880+ h->opencl.num_copies++;
3881+
3882+ if( b == p1 ) // P frames only
3883+ {
3884+ h->opencl.copies[h->opencl.num_copies].src = locked + 2 * sizeof(int);
3885+ h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_intra_mbs[b - p0];
3886+ h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
3887+ h->opencl.num_copies++;
3888+ }
3889+ return 0;
3890+}
3891+
3892+int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b )
3893+{
3894+ if( (frames[b]->i_cost_est[b-p0][p1-b] >= 0) || (b == p0 && b == p1) )
3895+ return 0;
3896+ else
3897+ {
3898+ int do_search[2];
3899+ int dist_scale_factor = 128;
3900+ const x264_weight_t *w = x264_weight_none;
3901+
3902+ // avoid duplicating work
3903+ frames[b]->i_cost_est[b-p0][p1-b] = 0;
3904+
3905+ do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
3906+ do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
3907+ if( do_search[0] )
3908+ {
3909+ if( h->param.analyse.i_weighted_pred && b == p1 )
3910+ {
3911+ x264_emms();
3912+ x264_weights_analyse( h, frames[b], frames[p0], 1 );
3913+ w = frames[b]->weight[0];
3914+ }
3915+ frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
3916+ }
3917+ if( do_search[1] )
3918+ frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
3919+ if( b == p1 )
3920+ frames[b]->i_intra_mbs[b-p0] = 0;
3921+ if( p1 != p0 )
3922+ dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
3923+
3924+ frames[b]->i_cost_est[b-p0][p1-b] = 0;
3925+ frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;
3926+
3927+ x264_opencl_lowres_init( h, frames[b], lambda );
3928+
3929+ if( do_search[0] )
3930+ {
3931+ x264_opencl_lowres_init( h, frames[p0], lambda );
3932+ x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
3933+ }
3934+ if( do_search[1] )
3935+ {
3936+ x264_opencl_lowres_init( h, frames[p1], lambda );
3937+ x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, w );
3938+ }
3939+ x264_opencl_finalize_cost( h, lambda, frames, p0, p1, b, dist_scale_factor );
3940+ return 1;
3941+ }
3942+}
3943+
3944+#endif
3945diff -r bdffc2c1e85b -r e5e4a79d3f21 encoder/slicetype.c
3946--- a/encoder/slicetype.c Wed Jul 18 08:33:41 2012 -0700
3947+++ b/encoder/slicetype.c Mon Aug 20 22:33:23 2012 -0500
3948@@ -36,6 +36,19 @@
3949 x264_frame_t **frames, int p0, int p1, int b,
3950 int b_intra_penalty );
3951
3952+void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );
3953+
3954+#if HAVE_OPENCL
3955+#if _WIN32
3956+#include "windows.h"
3957+#endif
3958+int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda );
3959+int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w );
3960+int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor );
3961+int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b );
3962+void x264_opencl_flush( x264_t *h );
3963+#endif
3964+
3965 static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
3966 {
3967 a->i_qp = X264_LOOKAHEAD_QP;
3968@@ -276,7 +289,7 @@
3969 return cost;
3970 }
3971
3972-static void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
3973+void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
3974 {
3975 int i_delta_index = fenc->i_frame - ref->i_frame - 1;
3976 /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
3977@@ -793,96 +806,120 @@
3978 output_inter[0] = h->scratch_buffer2;
3979 output_intra[0] = output_inter[0] + output_buf_size;
3980
3981- if( h->param.i_lookahead_threads > 1 )
3982+#if HAVE_OPENCL
3983+ if( h->param.b_opencl )
3984 {
3985- x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX];
3986+ x264_opencl_lowres_init(h, fenc, a->i_lambda );
3987+ if( do_search[0] )
3988+ {
3989+ x264_opencl_lowres_init( h, frames[p0], a->i_lambda );
3990+ x264_opencl_motionsearch( h, frames, b, p0, 0, a->i_lambda, w );
3991+ }
3992+ if( do_search[1] )
3993+ {
3994+ x264_opencl_lowres_init( h, frames[p1], a->i_lambda );
3995+ x264_opencl_motionsearch( h, frames, b, p1, 1, a->i_lambda, w );
3996+ }
3997+ if( b != p0 )
3998+ x264_opencl_finalize_cost( h, a->i_lambda, frames, p0, p1, b, dist_scale_factor );
3999+ x264_opencl_flush( h );
4000
4001+ i_score = fenc->i_cost_est[b-p0][p1-b];
4002+ }
4003+ else
4004+#endif
4005+ {
4006+ if( h->param.i_lookahead_threads > 1 )
4007+ {
4008+ x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX];
4009+
4010+ for( int i = 0; i < h->param.i_lookahead_threads; i++ )
4011+ {
4012+ x264_t *t = h->lookahead_thread[i];
4013+
4014+ /* FIXME move this somewhere else */
4015+ t->mb.i_me_method = h->mb.i_me_method;
4016+ t->mb.i_subpel_refine = h->mb.i_subpel_refine;
4017+ t->mb.b_chroma_me = h->mb.b_chroma_me;
4018+
4019+ s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
4020+ output_inter[i], output_intra[i] };
4021+
4022+ t->i_threadslice_start = ((h->mb.i_mb_height * i + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
4023+ t->i_threadslice_end = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
4024+
4025+ int thread_height = t->i_threadslice_end - t->i_threadslice_start;
4026+ int thread_output_size = thread_height + NUM_INTS;
4027+ memset( output_inter[i], 0, thread_output_size * sizeof(int) );
4028+ memset( output_intra[i], 0, thread_output_size * sizeof(int) );
4029+ output_inter[i][NUM_ROWS] = output_intra[i][NUM_ROWS] = thread_height;
4030+
4031+ output_inter[i+1] = output_inter[i] + thread_output_size + PAD_SIZE;
4032+ output_intra[i+1] = output_intra[i] + thread_output_size + PAD_SIZE;
4033+
4034+ x264_threadpool_run( h->lookaheadpool, (void*)x264_slicetype_slice_cost, &s[i] );
4035+ }
4036+ for( int i = 0; i < h->param.i_lookahead_threads; i++ )
4037+ x264_threadpool_wait( h->lookaheadpool, &s[i] );
4038+ }
4039+ else
4040+ {
4041+ h->i_threadslice_start = 0;
4042+ h->i_threadslice_end = h->mb.i_mb_height;
4043+ memset( output_inter[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
4044+ memset( output_intra[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
4045+ output_inter[0][NUM_ROWS] = output_intra[0][NUM_ROWS] = h->mb.i_mb_height;
4046+ x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
4047+ output_inter[0], output_intra[0] };
4048+ x264_slicetype_slice_cost( &s );
4049+ }
4050+
4051+ /* Sum up accumulators */
4052+ if( b == p1 )
4053+ fenc->i_intra_mbs[b-p0] = 0;
4054+ if( !fenc->b_intra_calculated )
4055+ {
4056+ fenc->i_cost_est[0][0] = 0;
4057+ fenc->i_cost_est_aq[0][0] = 0;
4058+ }
4059+ fenc->i_cost_est[b-p0][p1-b] = 0;
4060+ fenc->i_cost_est_aq[b-p0][p1-b] = 0;
4061+
4062+ int *row_satd_inter = fenc->i_row_satds[b-p0][p1-b];
4063+ int *row_satd_intra = fenc->i_row_satds[0][0];
4064 for( int i = 0; i < h->param.i_lookahead_threads; i++ )
4065 {
4066- x264_t *t = h->lookahead_thread[i];
4067+ if( b == p1 )
4068+ fenc->i_intra_mbs[b-p0] += output_inter[i][INTRA_MBS];
4069+ if( !fenc->b_intra_calculated )
4070+ {
4071+ fenc->i_cost_est[0][0] += output_intra[i][COST_EST];
4072+ fenc->i_cost_est_aq[0][0] += output_intra[i][COST_EST_AQ];
4073+ }
4074
4075- /* FIXME move this somewhere else */
4076- t->mb.i_me_method = h->mb.i_me_method;
4077- t->mb.i_subpel_refine = h->mb.i_subpel_refine;
4078- t->mb.b_chroma_me = h->mb.b_chroma_me;
4079+ fenc->i_cost_est[b-p0][p1-b] += output_inter[i][COST_EST];
4080+ fenc->i_cost_est_aq[b-p0][p1-b] += output_inter[i][COST_EST_AQ];
4081
4082- s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
4083- output_inter[i], output_intra[i] };
4084-
4085- t->i_threadslice_start = ((h->mb.i_mb_height * i + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
4086- t->i_threadslice_end = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
4087-
4088- int thread_height = t->i_threadslice_end - t->i_threadslice_start;
4089- int thread_output_size = thread_height + NUM_INTS;
4090- memset( output_inter[i], 0, thread_output_size * sizeof(int) );
4091- memset( output_intra[i], 0, thread_output_size * sizeof(int) );
4092- output_inter[i][NUM_ROWS] = output_intra[i][NUM_ROWS] = thread_height;
4093-
4094- output_inter[i+1] = output_inter[i] + thread_output_size + PAD_SIZE;
4095- output_intra[i+1] = output_intra[i] + thread_output_size + PAD_SIZE;
4096-
4097- x264_threadpool_run( h->lookaheadpool, (void*)x264_slicetype_slice_cost, &s[i] );
4098- }
4099- for( int i = 0; i < h->param.i_lookahead_threads; i++ )
4100- x264_threadpool_wait( h->lookaheadpool, &s[i] );
4101- }
4102- else
4103- {
4104- h->i_threadslice_start = 0;
4105- h->i_threadslice_end = h->mb.i_mb_height;
4106- memset( output_inter[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
4107- memset( output_intra[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
4108- output_inter[0][NUM_ROWS] = output_intra[0][NUM_ROWS] = h->mb.i_mb_height;
4109- x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
4110- output_inter[0], output_intra[0] };
4111- x264_slicetype_slice_cost( &s );
4112- }
4113-
4114- /* Sum up accumulators */
4115- if( b == p1 )
4116- fenc->i_intra_mbs[b-p0] = 0;
4117- if( !fenc->b_intra_calculated )
4118- {
4119- fenc->i_cost_est[0][0] = 0;
4120- fenc->i_cost_est_aq[0][0] = 0;
4121- }
4122- fenc->i_cost_est[b-p0][p1-b] = 0;
4123- fenc->i_cost_est_aq[b-p0][p1-b] = 0;
4124-
4125- int *row_satd_inter = fenc->i_row_satds[b-p0][p1-b];
4126- int *row_satd_intra = fenc->i_row_satds[0][0];
4127- for( int i = 0; i < h->param.i_lookahead_threads; i++ )
4128- {
4129- if( b == p1 )
4130- fenc->i_intra_mbs[b-p0] += output_inter[i][INTRA_MBS];
4131- if( !fenc->b_intra_calculated )
4132- {
4133- fenc->i_cost_est[0][0] += output_intra[i][COST_EST];
4134- fenc->i_cost_est_aq[0][0] += output_intra[i][COST_EST_AQ];
4135+ if( h->param.rc.i_vbv_buffer_size )
4136+ {
4137+ int row_count = output_inter[i][NUM_ROWS];
4138+ memcpy( row_satd_inter, output_inter[i] + NUM_INTS, row_count * sizeof(int) );
4139+ if( !fenc->b_intra_calculated )
4140+ memcpy( row_satd_intra, output_intra[i] + NUM_INTS, row_count * sizeof(int) );
4141+ row_satd_inter += row_count;
4142+ row_satd_intra += row_count;
4143+ }
4144 }
4145
4146- fenc->i_cost_est[b-p0][p1-b] += output_inter[i][COST_EST];
4147- fenc->i_cost_est_aq[b-p0][p1-b] += output_inter[i][COST_EST_AQ];
4148+ i_score = fenc->i_cost_est[b-p0][p1-b];
4149+ if( b != p1 )
4150+ i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
4151+ else
4152+ fenc->b_intra_calculated = 1;
4153
4154- if( h->param.rc.i_vbv_buffer_size )
4155- {
4156- int row_count = output_inter[i][NUM_ROWS];
4157- memcpy( row_satd_inter, output_inter[i] + NUM_INTS, row_count * sizeof(int) );
4158- if( !fenc->b_intra_calculated )
4159- memcpy( row_satd_intra, output_intra[i] + NUM_INTS, row_count * sizeof(int) );
4160- row_satd_inter += row_count;
4161- row_satd_intra += row_count;
4162- }
4163+ fenc->i_cost_est[b-p0][p1-b] = i_score;
4164+ x264_emms();
4165 }
4166-
4167- i_score = fenc->i_cost_est[b-p0][p1-b];
4168- if( b != p1 )
4169- i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
4170- else
4171- fenc->b_intra_calculated = 1;
4172-
4173- fenc->i_cost_est[b-p0][p1-b] = i_score;
4174- x264_emms();
4175 }
4176
4177 if( b_intra_penalty )
4178@@ -1447,6 +1484,62 @@
4179 return;
4180 }
4181
4182+#if HAVE_OPENCL
4183+
4184+#if _WIN32
4185+ int my_pri = THREAD_PRIORITY_NORMAL, ocl_pri = THREAD_PRIORITY_NORMAL;
4186+
4187+ /* We define CL_QUEUE_THREAD_HANDLE_AMD here because it is not
4188+ * defined in the OpenCL headers shipped with NVIDIA drivers. We need
4189+ * to be able to compile on an NVIDIA machine and run optimally on an
4190+ * AMD GPU.
4191+ */
4192+#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E
4193+#endif
4194+
4195+ if( h->param.b_opencl )
4196+ {
4197+#if _WIN32
4198+ /* Temporarily boost priority of this lookahead thread and the OpenCL
4199+ * driver's thread until the end of this function. On AMD GPUs this
4200+ * greatly reduces the latency of enqueuing kernels and getting results
4201+ * on Windows.
4202+ */
4203+ HANDLE id = GetCurrentThread();
4204+ my_pri = GetThreadPriority( id );
4205+ SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
4206+ cl_int status = clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
4207+ if( status == CL_SUCCESS )
4208+ {
4209+ ocl_pri = GetThreadPriority( id );
4210+ SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
4211+ }
4212+#endif
4213+
4214+ int b_work_done = 0;
4215+
4216+ /* precalculate intra and I frames */
4217+ for( int i = 0; i <= num_frames; i++ )
4218+ {
4219+ b_work_done |= x264_opencl_lowres_init( h, frames[i], a.i_lambda );
4220+ }
4221+
4222+ /* Because of weightp, we cannot precalculate P frame costs or backwards
4223+ * motion searches. And because motion searches are never
4224+     * re-calculated, we can't do non-weightp searches for B frames either.
4225+     * So we essentially do all cost calculations synchronously as the
4226+ * lookahead thread determines it needs them. If some enterprising
4227+ * individual were to do weightp analysis on the GPU, we could do more
4228+ * pre-calculations asynchronously.
4229+ */
4230+ if( b_work_done )
4231+ {
4232+ /* weightp estimation requires intra costs to be on the CPU */
4233+ x264_opencl_flush( h );
4234+ }
4235+ }
4236+#endif
4237+
4238 if( h->param.i_bframe )
4239 {
4240 if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
4241@@ -1480,6 +1573,18 @@
4242 continue;
4243 }
4244
4245+#if HAVE_OPENCL
4246+ if( h->param.b_opencl )
4247+ {
4248+ int b_work_done = 0;
4249+ b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+0, i+2, i+1 );
4250+ b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+0, i+1, i+1 );
4251+ b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+1, i+2, i+2 );
4252+ if( b_work_done )
4253+ x264_opencl_flush( h );
4254+ }
4255+#endif
4256+
4257 cost1b1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+1, 0 );
4258 cost1p0 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+1, i+1, 0 );
4259 cost2p0 = x264_slicetype_frame_cost( h, &a, frames, i+1, i+2, i+2, 0 );
4260@@ -1562,6 +1667,19 @@
4261 /* Restore frametypes for all frames that haven't actually been decided yet. */
4262 for( int j = reset_start; j <= num_frames; j++ )
4263 frames[j]->i_type = X264_TYPE_AUTO;
4264+
4265+#if HAVE_OPENCL
4266+#if _WIN32
4267+ if( h->param.b_opencl )
4268+ {
4269+ HANDLE id = GetCurrentThread();
4270+ SetThreadPriority( id, my_pri );
4271+ cl_int status = clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
4272+ if( status == CL_SUCCESS )
4273+ SetThreadPriority( id, ocl_pri );
4274+ }
4275+#endif
4276+#endif
4277 }
4278
4279 void x264_slicetype_decide( x264_t *h )
4280diff -r bdffc2c1e85b -r e5e4a79d3f21 x264.c
4281--- a/x264.c Wed Jul 18 08:33:41 2012 -0700
4282+++ b/x264.c Mon Aug 20 22:33:23 2012 -0500
4283@@ -806,6 +806,8 @@
4284 " as opposed to letting them select different algorithms\n" );
4285 H2( " --asm <integer> Override CPU detection\n" );
4286 H2( " --no-asm Disable all CPU optimizations\n" );
4287+ H2( " --opencl Enable use of OpenCL\n" );
4288+ H2( " --clbin-file <string> Specify path of compiled OpenCL kernel cache\n" );
4289 H2( " --visualize Show MB types overlayed on the encoded video\n" );
4290 H2( " --dump-yuv <string> Save reconstructed frames\n" );
4291 H2( " --sps-id <integer> Set SPS and PPS id numbers [%d]\n", defaults->i_sps_id );
4292@@ -910,6 +912,8 @@
4293 { "ref", required_argument, NULL, 'r' },
4294 { "asm", required_argument, NULL, 0 },
4295 { "no-asm", no_argument, NULL, 0 },
4296+ { "opencl", no_argument, NULL, 1 },
4297+ { "clbin-file", required_argument, NULL, 0 },
4298 { "sar", required_argument, NULL, 0 },
4299 { "fps", required_argument, NULL, OPT_FPS },
4300 { "frames", required_argument, NULL, OPT_FRAMES },
4301diff -r bdffc2c1e85b -r e5e4a79d3f21 x264.h
4302--- a/x264.h Wed Jul 18 08:33:41 2012 -0700
4303+++ b/x264.h Mon Aug 20 22:33:23 2012 -0500
4304@@ -41,7 +41,7 @@
4305
4306 #include "x264_config.h"
4307
4308-#define X264_BUILD 125
4309+#define X264_BUILD 126
4310
4311 /* Application developers planning to link against a shared library version of
4312 * libx264 from a Microsoft Visual Studio or similar development environment
4313@@ -463,6 +463,9 @@
4314
4315 int b_fake_interlaced;
4316
4317+ int b_opencl; /* use OpenCL when available */
4318+ char *psz_clbin_file; /* compiled OpenCL kernel cache file */
4319+
4320 /* Slicing parameters */
4321 int i_slice_max_size; /* Max size per slice in bytes; includes estimated NAL overhead. */
4322 int i_slice_max_mbs; /* Max number of MBs per slice; overrides i_slice_count. */