# HG changeset patch
# User Steve Borho <steve@borho.org>
# Date 1345520003 18000
# Node ID e5e4a79d3f21a849f65a059ab5b42332a7b4a672
# Parent bdffc2c1e85bd6cfbef707910bd7a14d844c95c9
OpenCL Lookahead

8diff -r bdffc2c1e85b -r e5e4a79d3f21 Makefile
9--- a/Makefile Wed Jul 18 08:33:41 2012 -0700
10+++ b/Makefile Mon Aug 20 22:33:23 2012 -0500
11@@ -8,6 +8,8 @@
12 vpath %.asm $(SRCPATH)
13 vpath %.rc $(SRCPATH)
14
15+GENERATED =
16+
17 all: default
18 default:
19
20@@ -145,6 +147,37 @@
21 endif
22 endif
23
24+QUOTED_CFLAGS := $(CFLAGS)
25+
26+ifeq ($(HAVE_OPENCL),yes)
27+empty:=
28+space:=$(empty) $(empty)
29+escaped:=\ $(empty)
30+open:=(
31+escopen:=\(
32+close:=)
33+escclose:=\)
34+SAFE_INC_DIR := $(subst $(space),$(escaped),$(OPENCL_INC_DIR))
35+SAFE_INC_DIR := $(subst $(open),$(escopen),$(SAFE_INC_DIR))
36+SAFE_INC_DIR := $(subst $(close),$(escclose),$(SAFE_INC_DIR))
37+SAFE_LIB_DIR := $(subst $(space),$(escaped),$(OPENCL_LIB_DIR))
38+SAFE_LIB_DIR := $(subst $(open),$(escopen),$(SAFE_LIB_DIR))
39+SAFE_LIB_DIR := $(subst $(close),$(escclose),$(SAFE_LIB_DIR))
40+# For normal CFLAGS and LDFLAGS, we must escape spaces with a backslash to
41+# make gcc happy
42+CFLAGS += -I$(SAFE_INC_DIR) -DCL_USE_DEPRECATED_OPENCL_1_1_APIS
43+LDFLAGS += -l$(OPENCL_LIB) -L$(SAFE_LIB_DIR)
44+# For the CFLAGS used by the .depend rule, we must add quotes because
45+# the rule does an extra level of shell expansions
46+QUOTED_CFLAGS += -I"$(OPENCL_INC_DIR)" -DCL_USE_DEPRECATED_OPENCL_1_1_APIS
47+common/oclobj.h: common/opencl/x264-cl.h $(wildcard common/opencl/*.cl)
48+ echo "static const char x264_opencl_source [] = {" > $@
49+ cat $^ | xxd -i >> $@
50+ echo ",0x00 };" >> $@
51+GENERATED += common/oclobj.h
52+SRCS += common/opencl.c encoder/slicetype-cl.c
53+endif
54+
55 OBJS += $(SRCS:%.c=%.o)
56 OBJCLI += $(SRCCLI:%.c=%.o)
57 OBJSO += $(SRCSO:%.c=%.o)
58@@ -164,17 +197,22 @@
59 $(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
60
61 ifneq ($(EXE),)
62-.PHONY: x264 checkasm
63+.PHONY: x264 checkasm test-opencl
64 x264: x264$(EXE)
65 checkasm: checkasm$(EXE)
66+test-opencl: test-opencl$(EXE)
67 endif
68
69-x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
70+x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
71 $(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
72
73-checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
74+checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBX264)
75 $(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
76
77+OBJOCL = tools/test-opencl.o
78+test-opencl$(EXE): .depend ${OBJOCL} $(LIBX264)
79+ $(LD)$@ $(OBJOCL) $(LIBX264) $(LDFLAGS)
80+
81 $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend
82
83 %.o: %.asm
84@@ -193,7 +231,7 @@
85
86 .depend: config.mak
87 @rm -f .depend
88- @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
89+ @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(QUOTED_CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
90
91 config.mak:
92 ./configure
93@@ -231,7 +269,7 @@
94
95 clean:
96 rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
97- rm -f checkasm checkasm.exe $(OBJCHK)
98+ rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) x264_lookahead.clbin
99 rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
100
101 distclean: clean
102diff -r bdffc2c1e85b -r e5e4a79d3f21 common/common.c
103--- a/common/common.c Wed Jul 18 08:33:41 2012 -0700
104+++ b/common/common.c Mon Aug 20 22:33:23 2012 -0500
105@@ -171,6 +171,10 @@
106 param->b_pic_struct = 0;
107 param->b_fake_interlaced = 0;
108 param->i_frame_packing = -1;
109+#if HAVE_OPENCL
110+ param->b_opencl = 0;
111+ param->psz_clbin_file = NULL;
112+#endif
113 }
114
115 static int x264_param_apply_preset( x264_param_t *param, const char *preset )
116@@ -1029,6 +1033,10 @@
117 p->b_fake_interlaced = atobool(value);
118 OPT("frame-packing")
119 p->i_frame_packing = atoi(value);
120+ OPT("opencl")
121+ p->b_opencl = atobool( value );
122+ OPT("clbin-file")
123+ p->psz_clbin_file = strdup( value );
124 else
125 return X264_PARAM_BAD_NAME;
126 #undef OPT
127diff -r bdffc2c1e85b -r e5e4a79d3f21 common/common.h
128--- a/common/common.h Wed Jul 18 08:33:41 2012 -0700
129+++ b/common/common.h Mon Aug 20 22:33:23 2012 -0500
130@@ -93,6 +93,11 @@
131 #include <assert.h>
132 #include <limits.h>
133
134+#if HAVE_OPENCL
135+# include <CL/cl.h>
136+# include "opencl.h"
137+#endif
138+
139 #if HAVE_INTERLACED
140 # define MB_INTERLACED h->mb.b_interlaced
141 # define SLICE_MBAFF h->sh.b_mbaff
142@@ -947,8 +952,19 @@
143 struct visualize_t *visualize;
144 #endif
145 x264_lookahead_t *lookahead;
146+
147+#if HAVE_OPENCL
148+ x264_opencl_t opencl;
149+#endif
150 };
151
152+#if HAVE_OPENCL
153+int x264_opencl_init( x264_t *h );
154+int x264_opencl_init_lookahead( x264_t *h );
155+void x264_opencl_free( x264_t *h );
156+void x264_opencl_frame_delete( x264_frame_t *frame );
157+#endif
158+
159 // included at the end because it needs x264_t
160 #include "macroblock.h"
161
162diff -r bdffc2c1e85b -r e5e4a79d3f21 common/frame.c
163--- a/common/frame.c Wed Jul 18 08:33:41 2012 -0700
164+++ b/common/frame.c Mon Aug 20 22:33:23 2012 -0500
165@@ -302,6 +302,9 @@
166 x264_free( frame->ref[1] );
167 x264_pthread_mutex_destroy( &frame->mutex );
168 x264_pthread_cond_destroy( &frame->cv );
169+#if HAVE_OPENCL
170+ x264_opencl_frame_delete( frame );
171+#endif
172 }
173 x264_free( frame );
174 }
175diff -r bdffc2c1e85b -r e5e4a79d3f21 common/frame.h
176--- a/common/frame.h Wed Jul 18 08:33:41 2012 -0700
177+++ b/common/frame.h Mon Aug 20 22:33:23 2012 -0500
178@@ -170,6 +170,10 @@
179 /* user frame properties */
180 uint8_t *mb_info;
181 void (*mb_info_free)( void* );
182+
183+#if HAVE_OPENCL
184+ x264_frame_opencl_t opencl;
185+#endif
186 } x264_frame_t;
187
188 /* synchronized frame list */
189diff -r bdffc2c1e85b -r e5e4a79d3f21 common/opencl.c
190--- /dev/null Thu Jan 01 00:00:00 1970 +0000
191+++ b/common/opencl.c Mon Aug 20 22:33:23 2012 -0500
192@@ -0,0 +1,456 @@
193+/*****************************************************************************
194+ * opencl.c: OpenCL initialization and kernel compilation
195+ *****************************************************************************
196+ * Copyright (C) 2012 x264 project
197+ *
198+ * Authors: Steve Borho <sborho@multicorewareinc.com>
199+ *
200+ * This program is free software; you can redistribute it and/or modify
201+ * it under the terms of the GNU General Public License as published by
202+ * the Free Software Foundation; either version 2 of the License, or
203+ * (at your option) any later version.
204+ *
205+ * This program is distributed in the hope that it will be useful,
206+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
207+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
208+ * GNU General Public License for more details.
209+ *
210+ * You should have received a copy of the GNU General Public License
211+ * along with this program; if not, write to the Free Software
212+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
213+ *
214+ * This program is also available under a commercial proprietary license.
215+ * For more information, contact us at licensing@x264.com.
216+ *****************************************************************************/
217+
218+#include "common.h"
219+
220+#if HAVE_OPENCL
221+
222+#include "oclobj.h"
223+
224+/* Try to load the cached compiled program binary, verify the device context is
225+ * still valid before reuse */
226+static cl_program x264_opencl_cache_load( x264_t *h, char *devname, char *devvendor, char *driverversion )
227+{
228+ cl_program program = NULL;
229+ cl_int status;
230+
231+ /* try to load cached program binary */
232+ FILE *fp = fopen( h->param.psz_clbin_file, "rb" );
233+ if( !fp )
234+ return NULL;
235+
236+ fseek( fp, 0L, SEEK_END );
237+ size_t size = ftell( fp );
238+ rewind( fp );
239+ uint8_t *binary = x264_malloc( size );
240+ if( !binary )
241+ goto fail;
242+
243+ fread( binary, 1, size, fp );
244+ const uint8_t *ptr = (const uint8_t*)binary;
245+
246+#define CHECK_STRING( STR )\
247+ do {\
248+ size_t len = strlen( STR );\
249+ if( size <= len || strncmp( (char*)ptr, STR, len ) )\
250+ goto fail;\
251+ else {\
252+ size -= (len+1); ptr += (len+1);\
253+ }\
254+ } while( 0 )
255+
256+ CHECK_STRING( devname );
257+ CHECK_STRING( devvendor );
258+ CHECK_STRING( driverversion );
259+#undef CHECK_STRING
260+
261+ program = clCreateProgramWithBinary( h->opencl.context, 1, &h->opencl.device, &size, &ptr, NULL, &status );
262+ if( status != CL_SUCCESS )
263+ program = NULL;
264+
265+fail:
266+ fclose( fp );
267+ x264_free( binary );
268+ return program;
269+}
270+
271+/* Save the compiled program binary to a file for later reuse. Device context
272+ * is also saved in the cache file so we do not reuse stale binaries */
273+static void x264_opencl_cache_save( x264_t *h, cl_program program, char *devname, char *devvendor, char *driverversion )
274+{
275+ FILE *fp = fopen( h->param.psz_clbin_file, "wb" );
276+ if( !fp )
277+ return;
278+
279+ size_t size;
280+ cl_int status = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL );
281+ if( status == CL_SUCCESS )
282+ {
283+ unsigned char *binary = x264_malloc( size );
284+ status = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &binary, NULL );
285+ if( status == CL_SUCCESS )
286+ {
287+ fwrite( devname, 1, strlen( devname ), fp );
288+ fwrite( "\n", 1, 1, fp );
289+ fwrite( devvendor, 1, strlen( devvendor ), fp );
290+ fwrite( "\n", 1, 1, fp );
291+ fwrite( driverversion, 1, strlen( driverversion ), fp );
292+ fwrite( "\n", 1, 1, fp );
293+ fwrite( binary, 1, size, fp );
294+ }
295+ x264_free( binary );
296+ }
297+ fclose( fp );
298+}
299+
300+static int x264_detect_AMD_GPU( cl_device_id device, int *b_is_SI )
301+{
302+ char extensions[512];
303+ char boardname[128];
304+ char devname[64];
305+
306+ *b_is_SI = 0;
307+
308+ cl_int status = clGetDeviceInfo( device, CL_DEVICE_EXTENSIONS, sizeof(extensions), extensions, NULL );
309+ if( status != CL_SUCCESS || !strstr( extensions, "cl_amd_media_ops" ) )
310+ return 0;
311+
312+ /* Detect whether GPU is a SouthernIsland based device */
313+#define CL_DEVICE_BOARD_NAME_AMD 0x4038
314+ status = clGetDeviceInfo( device, CL_DEVICE_BOARD_NAME_AMD, sizeof(boardname), boardname, NULL );
315+ if( status == CL_SUCCESS )
316+ {
317+ boardname[15] = boardname[16] = boardname[17] = 'X';
318+ *b_is_SI = !strcmp( boardname, "AMD Radeon HD 7XXX Series" );
319+ return 1;
320+ }
321+
322+ /* Fall back to checking the device name */
323+ status = clGetDeviceInfo( device, CL_DEVICE_NAME, sizeof(devname), devname, NULL );
324+ if( status != CL_SUCCESS )
325+ return 1;
326+
327+ const char *tahiti_names[] = { "Tahiti", "Pitcairn", "Capeverde", "Bali", NULL };
328+ for( int i = 0; tahiti_names[i]; i++ )
329+ if( !strcmp( devname, tahiti_names[i] ) )
330+ {
331+ *b_is_SI = 1;
332+ return 1;
333+ }
334+ return 1;
335+}
336+
337+/* The OpenCL source under common/opencl will be merged into encoder/oclobj.h by
338+ * the Makefile. It defines a x264_opencl_source byte array which we will pass
339+ * to clCreateProgramWithSource(). We also attempt to use a cache file for the
340+ * compiled binary, stored in the current working folder. */
341+static cl_program x264_opencl_compile( x264_t *h )
342+{
343+ cl_program program;
344+ cl_int status;
345+
346+ char devname[64];
347+ char devvendor[64];
348+ char driverversion[64];
349+ status = clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME, sizeof(devname), devname, NULL );
350+ status |= clGetDeviceInfo( h->opencl.device, CL_DEVICE_VENDOR, sizeof(devvendor), devvendor, NULL );
351+ status |= clGetDeviceInfo( h->opencl.device, CL_DRIVER_VERSION, sizeof(driverversion), driverversion, NULL );
352+ if( status != CL_SUCCESS )
353+ return NULL;
354+
355+ int b_isamd = x264_detect_AMD_GPU( h->opencl.device, &h->opencl.b_device_AMD_SI );
356+
357+ x264_log( h, X264_LOG_INFO, "OpenCL acceleration enabled with %s %s %s\n", devvendor, devname, h->opencl.b_device_AMD_SI ? "(SI)" : "" );
358+
359+ program = x264_opencl_cache_load( h, devname, devvendor, driverversion );
360+ if( !program )
361+ {
362+ /* clCreateProgramWithSource() requires a pointer variable, you cannot just use &x264_opencl_source */
363+ x264_log( h, X264_LOG_INFO, "Compiling OpenCL kernels...\n" );
364+ const char *strptr = (const char*)x264_opencl_source;
365+ size_t size = sizeof(x264_opencl_source);
366+ program = clCreateProgramWithSource( h->opencl.context, 1, &strptr, &size, &status );
367+ if( status != CL_SUCCESS || !program )
368+ {
369+ x264_log( h, X264_LOG_WARNING, "OpenCL: unable to create program\n" );
370+ return NULL;
371+ }
372+ }
373+
374+ /* Build the program binary for the OpenCL device */
375+ const char *buildopts = "";
376+ if( b_isamd && !h->opencl.b_device_AMD_SI )
377+ buildopts = "-DVECTORIZE=1";
378+ status = clBuildProgram( program, 1, &h->opencl.device, buildopts, NULL, NULL );
379+ if( status == CL_SUCCESS )
380+ {
381+ x264_opencl_cache_save( h, program, devname, devvendor, driverversion );
382+ return program;
383+ }
384+
385+ /* Compile failure, should not happen with production code. */
386+
387+ size_t build_log_len = 0;
388+
389+ status = clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, NULL, &build_log_len );
390+ if( status != CL_SUCCESS )
391+ {
392+ x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to query build log\n" );
393+ return NULL;
394+ }
395+
396+ char *build_log = x264_malloc( build_log_len );
397+ if( !build_log )
398+ {
399+ x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to alloc build log\n" );
400+ return NULL;
401+ }
402+
403+ status = clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, build_log, NULL );
404+ if( status != CL_SUCCESS )
405+ {
406+ x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to get build log\n" );
407+ x264_free( build_log );
408+ return NULL;
409+ }
410+
411+ FILE *lg = fopen( "x264_kernel_build_log.txt", "w" );
412+ if( lg )
413+ {
414+ fwrite( build_log, 1, build_log_len, lg );
415+ fclose( lg );
416+ x264_log( h, X264_LOG_WARNING, "OpenCL: kernel build errors written to x264_kernel_build_log.txt\n" );
417+ }
418+
419+ x264_free( build_log );
420+ return NULL;
421+}
422+
423+static void x264_opencl_free_lookahead( x264_t *h )
424+{
425+#define RELEASE( a, f ) if( a ) f( a );
426+ RELEASE( h->opencl.intra_kernel, clReleaseKernel )
427+ RELEASE( h->opencl.rowsum_intra_kernel, clReleaseKernel )
428+ RELEASE( h->opencl.downscale_kernel1, clReleaseKernel )
429+ RELEASE( h->opencl.downscale_kernel2, clReleaseKernel )
430+ RELEASE( h->opencl.downscale_hpel_kernel, clReleaseKernel )
431+ RELEASE( h->opencl.weightp_hpel_kernel, clReleaseKernel )
432+ RELEASE( h->opencl.weightp_scaled_images_kernel, clReleaseKernel )
433+ RELEASE( h->opencl.memset_kernel, clReleaseKernel )
434+ RELEASE( h->opencl.hme_kernel, clReleaseKernel )
435+ RELEASE( h->opencl.subpel_refine_kernel, clReleaseKernel )
436+ RELEASE( h->opencl.mode_select_kernel, clReleaseKernel )
437+ RELEASE( h->opencl.rowsum_inter_kernel, clReleaseKernel )
438+ RELEASE( h->opencl.lookahead_program, clReleaseProgram )
439+ RELEASE( h->opencl.row_satds[0], clReleaseMemObject )
440+ RELEASE( h->opencl.row_satds[1], clReleaseMemObject )
441+ RELEASE( h->opencl.frame_stats[0], clReleaseMemObject )
442+ RELEASE( h->opencl.frame_stats[1], clReleaseMemObject )
443+ RELEASE( h->opencl.mv_buffers[0], clReleaseMemObject )
444+ RELEASE( h->opencl.mv_buffers[1], clReleaseMemObject )
445+ RELEASE( h->opencl.mvp_buffer, clReleaseMemObject )
446+ RELEASE( h->opencl.luma_16x16_image[0], clReleaseMemObject )
447+ RELEASE( h->opencl.luma_16x16_image[1], clReleaseMemObject )
448+ RELEASE( h->opencl.lowres_mv_costs, clReleaseMemObject )
449+ RELEASE( h->opencl.lowres_costs[0], clReleaseMemObject )
450+ RELEASE( h->opencl.lowres_costs[1], clReleaseMemObject )
451+ RELEASE( h->opencl.page_locked_buffer, clReleaseMemObject )
452+ RELEASE( h->opencl.weighted_luma_hpel, clReleaseMemObject )
453+ for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
454+ {
455+ RELEASE( h->opencl.weighted_scaled_images[i], clReleaseMemObject )
456+ }
457+#undef RELEASE
458+}
459+
460+int x264_opencl_init_lookahead( x264_t *h )
461+{
462+ if( h->param.rc.i_lookahead == 0 )
463+ return -1;
464+
465+ char *kernelnames[] = {
466+ "mb_intra_cost_satd_8x8",
467+ "sum_intra_cost",
468+ "downscale_hpel",
469+ "downscale1",
470+ "downscale2",
471+ "memset_int16",
472+ "weightp_scaled_images",
473+ "weightp_hpel",
474+ "hierarchical_motion",
475+ "subpel_refine",
476+ "mode_selection",
477+ "sum_inter_cost"
478+ };
479+ cl_kernel *kernels[] = {
480+ &h->opencl.intra_kernel,
481+ &h->opencl.rowsum_intra_kernel,
482+ &h->opencl.downscale_hpel_kernel,
483+ &h->opencl.downscale_kernel1,
484+ &h->opencl.downscale_kernel2,
485+ &h->opencl.memset_kernel,
486+ &h->opencl.weightp_scaled_images_kernel,
487+ &h->opencl.weightp_hpel_kernel,
488+ &h->opencl.hme_kernel,
489+ &h->opencl.subpel_refine_kernel,
490+ &h->opencl.mode_select_kernel,
491+ &h->opencl.rowsum_inter_kernel
492+ };
493+ cl_int status;
494+
495+ h->opencl.lookahead_program = x264_opencl_compile( h );
496+ if( !h->opencl.lookahead_program )
497+ {
498+ x264_opencl_free_lookahead( h );
499+ return -1;
500+ }
501+
502+ for( int i = 0; i < sizeof(kernelnames)/sizeof(char*); i++ )
503+ {
504+ *kernels[i] = clCreateKernel( h->opencl.lookahead_program, kernelnames[i], &status );
505+ if( status != CL_SUCCESS )
506+ {
507+ x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to compile kernel '%s' (%d)\n", kernelnames[i], status );
508+ x264_opencl_free_lookahead( h );
509+ return -1;
510+ }
511+ }
512+
513+ h->opencl.page_locked_buffer = clCreateBuffer( h->opencl.context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, PAGE_LOCKED_BUF_SIZE, NULL, &status );
514+ if( status != CL_SUCCESS )
515+ {
516+ x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to allocate page-locked buffer, error '%d'\n", status );
517+ return -1;
518+ }
519+ h->opencl.page_locked_ptr = clEnqueueMapBuffer( h->opencl.queue, h->opencl.page_locked_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
520+ 0, PAGE_LOCKED_BUF_SIZE, 0, NULL, NULL, &status );
521+ if( status != CL_SUCCESS )
522+ {
523+ x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to map page-locked buffer, error '%d'\n", status );
524+ return -1;
525+ }
526+
527+ return 0;
528+}
529+
530+static void x264_opencl_error_notify( const char *errinfo, const void *private_info, size_t cb, void *user_data )
531+{
532+ /* Any error notification can be assumed to be fatal to the OpenCL context.
533+ * We need to stop using it immediately and switch over to the CPU for
534+ * lookahead
535+ */
536+ x264_t *h = (x264_t*)user_data;
537+ h->param.b_opencl = 0;
538+ x264_log( h, X264_LOG_ERROR, "OpenCL: %s\n", errinfo );
539+}
540+
541+int x264_opencl_init( x264_t *h )
542+{
543+ cl_int status;
544+ cl_uint numPlatforms;
545+ int ret = -1;
546+
547+ status = clGetPlatformIDs( 0, NULL, &numPlatforms );
548+ if( status != CL_SUCCESS || numPlatforms == 0 )
549+ {
550+ x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n");
551+ return -1;
552+ }
553+
554+ cl_platform_id *platforms = (cl_platform_id*)x264_malloc( numPlatforms * sizeof(cl_platform_id) );
555+ status = clGetPlatformIDs( numPlatforms, platforms, NULL );
556+ if( status != CL_SUCCESS )
557+ {
558+ x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n");
559+ x264_free( platforms );
560+ return -1;
561+ }
562+
563+ /* Select the first OpenCL platform that supports a GPU device that supports our
564+ * required image (texture) formats
565+ */
566+ for( cl_uint i = 0; i < numPlatforms; ++i )
567+ {
568+ cl_uint gpu_count;
569+ status = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &gpu_count );
570+ if( status == CL_SUCCESS && gpu_count > 0 )
571+ {
572+ /* take GPU 0 */
573+ status = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 1, &h->opencl.device, NULL );
574+ if( status != CL_SUCCESS )
575+ continue;
576+
577+ h->opencl.context = clCreateContext( NULL, 1, &h->opencl.device, (void*)x264_opencl_error_notify, (void*)h, &status );
578+ if( status != CL_SUCCESS )
579+ continue;
580+
581+ cl_bool image_support;
582+ clGetDeviceInfo( h->opencl.device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL );
583+ if( !image_support )
584+ continue;
585+
586+#define MAX_IMAGE_TYPES 100
587+ cl_uint count = 0;
588+ cl_image_format imageType[MAX_IMAGE_TYPES];
589+ clGetSupportedImageFormats( h->opencl.context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, MAX_IMAGE_TYPES, imageType, &count );
590+ count = X264_MIN( count, MAX_IMAGE_TYPES );
591+
592+ int b_has_r = 0;
593+ int b_has_rgba = 0;
594+ for( cl_uint j = 0; j < count; j++ )
595+ {
596+ if( imageType[j].image_channel_order == CL_R )
597+ b_has_r = 1;
598+ else if( imageType[j].image_channel_order == CL_RGBA )
599+ b_has_rgba = 1;
600+ }
601+ if( !b_has_r || !b_has_rgba )
602+ continue;
603+
604+ h->opencl.queue = clCreateCommandQueue( h->opencl.context, h->opencl.device, 0, &status );
605+ if( status != CL_SUCCESS )
606+ continue;
607+
608+ ret = 0;
609+ break;
610+ }
611+ }
612+
613+ x264_free( platforms );
614+
615+ if( !h->param.psz_clbin_file )
616+ h->param.psz_clbin_file = "x264_lookahead.clbin";
617+
618+ if( !ret )
619+ ret = x264_opencl_init_lookahead( h );
620+
621+ return ret;
622+}
623+
624+void x264_opencl_frame_delete( x264_frame_t *frame )
625+{
626+#define RELEASEBUF(mem) if( mem ) clReleaseMemObject( mem );
627+ for( int j = 0; j < NUM_IMAGE_SCALES; j++ )
628+ RELEASEBUF( frame->opencl.scaled_image2Ds[j] );
629+ RELEASEBUF( frame->opencl.luma_hpel );
630+ RELEASEBUF( frame->opencl.inv_qscale_factor );
631+ RELEASEBUF( frame->opencl.intra_cost );
632+ RELEASEBUF( frame->opencl.lowres_mvs0 );
633+ RELEASEBUF( frame->opencl.lowres_mvs1 );
634+ RELEASEBUF( frame->opencl.lowres_mv_costs0 );
635+ RELEASEBUF( frame->opencl.lowres_mv_costs1 );
636+#undef RELEASEBUF
637+}
638+
639+void x264_opencl_free( x264_t *h )
640+{
641+ x264_opencl_free_lookahead( h );
642+
643+ if( h->opencl.queue )
644+ clReleaseCommandQueue( h->opencl.queue );
645+ if( h->opencl.context )
646+ clReleaseContext( h->opencl.context );
647+}
648+#endif /* HAVE_OPENCL */
649diff -r bdffc2c1e85b -r e5e4a79d3f21 common/opencl.h
650--- /dev/null Thu Jan 01 00:00:00 1970 +0000
651+++ b/common/opencl.h Mon Aug 20 22:33:23 2012 -0500
@@ -0,0 +1,103 @@
+/*****************************************************************************
+ * opencl.h: OpenCL structures and defines
+ *****************************************************************************
+ * Copyright (C) 2012 x264 project
+ *
+ * Authors: Steve Borho <sborho@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_OPENCL_H
+#define X264_OPENCL_H
+
+#include "common/common.h"
+
+#define NUM_IMAGE_SCALES 4
+
+#define MAX_FINISH_COPIES 1024
+#define PAGE_LOCKED_BUF_SIZE 32 * 1024 * 1024
+
+typedef struct
+{
+ cl_context context;
+ cl_device_id device;
+ cl_command_queue queue;
+
+ cl_program lookahead_program;
+ cl_int last_buf;
+
+ cl_mem page_locked_buffer;
+ char *page_locked_ptr;
+ int pl_occupancy;
+
+ struct
+ {
+ void *src;
+ void *dest;
+ int bytes;
+ } copies[MAX_FINISH_COPIES];
+ int num_copies;
+
+ int b_device_AMD_SI;
+
+ /* downscale lowres luma */
+ cl_kernel downscale_hpel_kernel;
+ cl_kernel downscale_kernel1;
+ cl_kernel downscale_kernel2;
+ cl_mem luma_16x16_image[2];
+
+ /* weightp filtering */
+ cl_kernel weightp_hpel_kernel;
+ cl_kernel weightp_scaled_images_kernel;
+ cl_mem weighted_scaled_images[NUM_IMAGE_SCALES];
+ cl_mem weighted_luma_hpel;
+
+ /* intra */
+ cl_kernel memset_kernel;
+ cl_kernel intra_kernel;
+ cl_kernel rowsum_intra_kernel;
+ cl_mem row_satds[2];
+
+ /* hierarchical motion estimation */
+ cl_kernel hme_kernel;
+ cl_kernel subpel_refine_kernel;
+ cl_mem mv_buffers[2];
+ cl_mem lowres_mv_costs;
+ cl_mem mvp_buffer;
+
+ /* bidir */
+ cl_kernel mode_select_kernel;
+ cl_kernel rowsum_inter_kernel;
+ cl_mem lowres_costs[2];
+ cl_mem frame_stats[2]; /* cost_est, cost_est_aq, intra_mbs */
+} x264_opencl_t;
+
+typedef struct
+{
+ cl_mem scaled_image2Ds[NUM_IMAGE_SCALES];
+ cl_mem luma_hpel;
+ cl_mem inv_qscale_factor;
+ cl_mem intra_cost;
+ cl_mem lowres_mvs0;
+ cl_mem lowres_mvs1;
+ cl_mem lowres_mv_costs0;
+ cl_mem lowres_mv_costs1;
+} x264_frame_opencl_t;
+
+#endif
859diff -r bdffc2c1e85b -r e5e4a79d3f21 common/opencl/bidir.cl
860--- /dev/null Thu Jan 01 00:00:00 1970 +0000
861+++ b/common/opencl/bidir.cl Mon Aug 20 22:33:23 2012 -0500
862@@ -0,0 +1,259 @@
863+
864+/* Four threads cooperatively measure 8x8 BIDIR cost with SATD */
865+int bidir_satd_8x8_ii_coop4(
866+ read_only image2d_t fenc_lowres, int2 fencpos,
867+ read_only image2d_t fref0_planes, int2 qpos0,
868+ read_only image2d_t fref1_planes, int2 qpos1,
869+ int weight,
870+ local sum2_t *tmpp,
871+ int idx )
872+{
873+ volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
874+ sum2_t b0, b1, b2, b3;
875+ sum2_t sum = 0;
876+
877+ // fencpos is full-pel position of original MB
878+ // qpos0 is qpel position within reference frame 0
879+ // qpos1 is qpel position within reference frame 1
880+
881+ int2 fref0Apos = (int2)(qpos0.x>>2, qpos0.y>>2);
882+ int hpel0A = ((qpos0.x&2)>>1) + (qpos0.y&2);
883+
884+ int2 qpos0B = (int2)qpos0 + (int2)(((qpos0.x&1)<<1), ((qpos0.y&1)<<1));
885+ int2 fref0Bpos = (int2)(qpos0B.x>>2, qpos0B.y>>2);
886+ int hpel0B = ((qpos0B.x&2)>>1) + (qpos0B.y&2);
887+
888+ int2 fref1Apos = (int2)(qpos1.x>>2, qpos1.y>>2);
889+ int hpel1A = ((qpos1.x&2)>>1) + (qpos1.y&2);
890+
891+ int2 qpos1B = (int2)qpos1 + (int2)(((qpos1.x&1)<<1), ((qpos1.y&1)<<1));
892+ int2 fref1Bpos = (int2)(qpos1B.x>>2, qpos1B.y>>2);
893+ int hpel1B = ((qpos1B.x&2)>>1) + (qpos1B.y&2);
894+
895+ uint mask_shift0A = 8 * hpel0A, mask_shift0B = 8 * hpel0B;
896+ uint mask_shift1A = 8 * hpel1A, mask_shift1B = 8 * hpel1B;
897+
898+ uint vA, vB;
899+ uint enc, ref0, ref1;
900+ uint a0, a1;
901+ const int weight2 = 64 - weight;
902+
903+#define READ_BIDIR_DIFF( OUT, X )\
904+ enc = read_imageui( fenc_lowres, sampler, fencpos + (int2)(X, idx) ).s0;\
905+ vA = (read_imageui( fref0_planes, sampler, fref0Apos + (int2)(X, idx) ).s0 >> mask_shift0A) & 0xFF;\
906+ vB = (read_imageui( fref0_planes, sampler, fref0Bpos + (int2)(X, idx) ).s0 >> mask_shift0B) & 0xFF;\
907+ ref0 = rhadd( vA, vB );\
908+ vA = (read_imageui( fref1_planes, sampler, fref1Apos + (int2)(X, idx) ).s0 >> mask_shift1A) & 0xFF;\
909+ vB = (read_imageui( fref1_planes, sampler, fref1Bpos + (int2)(X, idx) ).s0 >> mask_shift1B) & 0xFF;\
910+ ref1 = rhadd( vA, vB );\
911+ OUT = enc - ((ref0 * weight + ref1 * weight2 + (1 << 5)) >> 6);
912+
913+#define READ_DIFF_EX( OUT, a, b )\
914+ READ_BIDIR_DIFF( a0, a );\
915+ READ_BIDIR_DIFF( a1, b );\
916+ OUT = a0 + (a1<<BITS_PER_SUM);
917+
918+#define ROW_8x4_SATD( a, b, c )\
919+ fencpos.y += a;\
920+ fref0Apos.y += b;\
921+ fref0Bpos.y += b;\
922+ fref1Apos.y += c;\
923+ fref1Bpos.y += c;\
924+ READ_DIFF_EX( b0, 0, 4 );\
925+ READ_DIFF_EX( b1, 1, 5 );\
926+ READ_DIFF_EX( b2, 2, 6 );\
927+ READ_DIFF_EX( b3, 3, 7 );\
928+ HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
929+ HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
930+ sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );
931+
932+ ROW_8x4_SATD( 0, 0, 0 );
933+ ROW_8x4_SATD( 4, 4, 4 );
934+
935+#undef READ_BIDIR_DIFF
936+#undef READ_DIFF_EX
937+#undef ROW_8x4_SATD
938+
939+ return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
940+}
941+
942+/*
943+ * mode selection - pick the least cost partition type for each 8x8 macroblock.
944+ * Intra, list0 or list1. If encoding a B frame, test three bidir
945+ * possibilities.
946+ *
947+ * fenc_lowres_mvs[0|1] and fenc_lowres_mv_costs[0|1] are large buffers that
948+ * hold many frames worth of motion vectors. We must offset into the correct
949+ * location for this frame's vectors:
950+ *
951+ * CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1]
952+ * GPU equivalent: fenc_lowres_mvs0[(b - p0 - 1) * mb_count]
953+ *
954+ * global launch dimensions: [mb_width, mb_height]
955+ *
956+ * If this is a B frame, launch dims are [mb_width * 4, mb_height]
957+ */
958+kernel void mode_selection(
959+ read_only image2d_t fenc_lowres,
960+ read_only image2d_t fref0_planes,
961+ read_only image2d_t fref1_planes,
962+ const global short2 *fenc_lowres_mvs0,
963+ const global short2 *fenc_lowres_mvs1,
964+ const global short2 *fref1_lowres_mvs0,
965+ const global int16_t *fenc_lowres_mv_costs0,
966+ const global int16_t *fenc_lowres_mv_costs1,
967+ const global uint16_t *fenc_intra_cost,
968+ global uint16_t *lowres_costs,
969+ global int *frame_stats,
970+ local int16_t *cost_local,
971+ local sum2_t *satd_local,
972+ int mb_width,
973+ int bipred_weight,
974+ int dist_scale_factor,
975+ int b,
976+ int p0,
977+ int p1,
978+ int lambda )
979+{
980+ int mb_x = get_global_id( 0 );
981+ int b_bidir = b < p1;
982+ if( b_bidir )
983+ {
+ /* B frames launch 4 cooperating work-items per MB (see header comment);
+ * recover the true MB x coordinate and drop the overshoot lanes. */
984+ mb_x >>= 2;
985+ if( mb_x >= mb_width )
986+ return;
987+ }
988+ int mb_y = get_global_id( 1 );
989+ int mb_height = get_global_size( 1 );
990+ int mb_count = mb_width * mb_height;
991+ int mb_xy = mb_x + mb_y * mb_width;
992+
993+ /* Initialize int frame_stats[4] for next kernel (sum_inter_cost) */
994+ if( mb_x < 4 && mb_y == 0 )
995+ frame_stats[mb_x] = 0;
996+
+ /* bcost/list_used track the cheapest mode per MB:
+ * 0 = intra, 1 = list0 MV, 2 = list1 MV, 3 = bidir */
997+ int bcost = COST_MAX;
998+ int list_used = 0;
999+
1000+ if( !b_bidir )
1001+ {
1002+ int icost = fenc_intra_cost[mb_xy];
1003+ COPY2_IF_LT( bcost, icost, list_used, 0 );
1004+ }
1005+ if( b != p0 )
1006+ {
1007+ int mv_cost0 = fenc_lowres_mv_costs0[(b - p0 - 1) * mb_count + mb_xy];
1008+ COPY2_IF_LT( bcost, mv_cost0, list_used, 1 );
1009+ }
1010+ if( b != p1 )
1011+ {
1012+ int mv_cost1 = fenc_lowres_mv_costs1[(p1 - b - 1) * mb_count + mb_xy];
1013+ COPY2_IF_LT( bcost, mv_cost1, list_used, 2 );
1014+ }
1015+
1016+ if( b_bidir )
1017+ {
1018+ int2 coord = (int2)(mb_x << 3, mb_y << 3);
1019+ int mb_i = get_global_id( 0 ) & 3;
1020+ int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
1021+ cost_local += mb_in_group * 4;
1022+ satd_local += mb_in_group * 16;
1023+
+ /* The 4 cooperating work-items each SATD one quarter of the 8x8 via
+ * bidir_satd_8x8_ii_coop4; the partial costs meet in cost_local[0..3]. */
1024+#define TRY_BIDIR( mv0, mv1, penalty )\
1025+ {\
1026+ int2 qpos0 = (int2)((coord.x<<2) + mv0.x, (coord.y<<2) + mv0.y);\
1027+ int2 qpos1 = (int2)((coord.x<<2) + mv1.x, (coord.y<<2) + mv1.y);\
1028+ cost_local[mb_i] = bidir_satd_8x8_ii_coop4( fenc_lowres, coord, fref0_planes, qpos0, fref1_planes, qpos1, bipred_weight, satd_local, mb_i );\
1029+ int cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\
1030+ COPY2_IF_LT( bcost, penalty * lambda + cost, list_used, 3 );\
1031+ }
1032+ /* temporal prediction */
1033+ short2 dmv0, dmv1;
1034+ short2 mvr = fref1_lowres_mvs0[mb_xy];
1035+ dmv0.x = (mvr.x * dist_scale_factor + 128) >> 8;
1036+ dmv0.y = (mvr.y * dist_scale_factor + 128) >> 8;
1037+ dmv1.x = dmv0.x - mvr.x;
1038+ dmv1.y = dmv0.y - mvr.y;
1039+ TRY_BIDIR( dmv0, dmv1, 0 )
1040+
+ /* If the temporal MV pair was nonzero, also try the all-zero pair. */
1041+ if( as_uint( dmv0 ) || as_uint( dmv1 ) )
1042+ {
1043+ /* B-direct prediction */
1044+ dmv0 = 0; dmv1 = 0;
1045+ TRY_BIDIR( dmv0, dmv1, 0 );
1046+ }
1047+
1048+ /* L0+L1 prediction */
1049+ dmv0 = fenc_lowres_mvs0[(b - p0 - 1) * mb_count + mb_xy];
1050+ dmv1 = fenc_lowres_mvs1[(p1 - b - 1) * mb_count + mb_xy];
1051+ TRY_BIDIR( dmv0, dmv1, 5 );
1052+#undef TRY_BIDIR
1053+ }
1054+
+ /* Pack the winner: cost in the low bits, chosen list in the high bits. */
1055+ lowres_costs[mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
1056+}
1057+
1058+/*
1059+ * parallel sum inter costs
1060+ *
1061+ * global launch dimensions: [256, mb_height]
1062+ */
1063+kernel void sum_inter_cost(
1064+ const global uint16_t *fenc_lowres_costs,
1065+ const global uint16_t *inv_qscale_factor,
1066+ global int *fenc_row_satds,
1067+ global int *frame_stats,
1068+ int mb_width,
1069+ int bframe_bias,
1070+ int b,
1071+ int p0,
1072+ int p1 )
1073+{
+ /* One row of 256 lanes (see launch dims) strides across the MB row,
+ * accumulating per-lane partial sums that are reduced below. */
1074+ int y = get_global_id( 1 );
1075+ int mb_height = get_global_size( 1 );
1076+
1077+ int row_satds = 0;
1078+ int cost_est = 0;
1079+ int cost_est_aq = 0;
1080+ int intra_mbs = 0;
1081+
1082+ for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
1083+ {
1084+ int mb_xy = x + y * mb_width;
1085+ int cost = fenc_lowres_costs[mb_xy] & LOWRES_COST_MASK;
1086+ int list = fenc_lowres_costs[mb_xy] >> LOWRES_COST_SHIFT;
+ /* Border MBs are excluded from the frame-level estimate unless the
+ * frame is tiny (<= 2 MBs in either dimension). */
1087+ int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
1088+
+ /* list 0 == intra winner in mode_selection */
1089+ if( list == 0 && b_frame_score_mb )
1090+ intra_mbs++;
1091+
1092+ int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
1093+
1094+ row_satds += cost_aq;
1095+
1096+ if( b_frame_score_mb )
1097+ {
1098+ cost_est += cost;
1099+ cost_est_aq += cost_aq;
1100+ }
1101+ }
1102+
1103+ local int buffer[256];
1104+ int x = get_global_id( 0 );
1105+
1106+ row_satds = parallel_sum( row_satds, x, buffer );
1107+ cost_est = parallel_sum( cost_est, x, buffer );
1108+ cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
1109+ intra_mbs = parallel_sum( intra_mbs, x, buffer );
1110+
+ /* b != p1 means this is a B frame; down-weight its cost estimate by
+ * bframe_bias — presumably mirroring the CPU lookahead, confirm vs slicetype.c. */
1111+ if( b != p1 )
1112+ cost_est = (int)((float)cost_est * 100 / (120 + bframe_bias));
1113+
+ /* Lane 0 publishes the reduced sums for the whole row. */
1114+ if( get_global_id( 0 ) == 0 )
1115+ {
1116+ fenc_row_satds[y] = row_satds;
1117+ atomic_add( frame_stats + COST_EST, cost_est );
1118+ atomic_add( frame_stats + COST_EST_AQ, cost_est_aq );
1119+ atomic_add( frame_stats + INTRA_MBS, intra_mbs );
1120+ }
1121+}
1122diff -r bdffc2c1e85b -r e5e4a79d3f21 common/opencl/downscale.cl
1123--- /dev/null Thu Jan 01 00:00:00 1970 +0000
1124+++ b/common/opencl/downscale.cl Mon Aug 20 22:33:23 2012 -0500
1125@@ -0,0 +1,136 @@
1126+/*
1127+ * downscale lowres luma: full-res buffer to down scale image, and to packed hpel image
1128+ *
1129+ * --
1130+ *
1131+ * fenc_img is an output image (area of memory referenced through a texture
1132+ * cache). A read of any pixel location (x,y) returns four pixel values:
1133+ *
1134+ * val.s0 = P(x,y)
1135+ * val.s1 = P(x+1,y)
1136+ * val.s2 = P(x+2,y)
1137+ * val.s3 = P(x+3,y)
1138+ *
1139+ * This is a 4x replication of the lowres pixels, a trade-off between memory
1140+ * size and read latency.
1141+ *
1142+ * --
1143+ *
1144+ * hpel_planes is an output image that contains the four HPEL planes used for
1145+ * subpel refinement. A read of any pixel location (x,y) returns a UInt32 with
1146+ * the four planar values C | V | H | F
1147+ *
1148+ * launch dimensions: [lowres-width, lowres-height]
1149+ */
1150+kernel void downscale_hpel(
1151+ const global pixel *fenc,
1152+ write_only image2d_t fenc_img,
1153+ write_only image2d_t hpel_planes,
1154+ int stride )
1155+{
1156+ int x = get_global_id( 0 );
1157+ int y = get_global_id( 1 );
1158+ uint4 values;
1159+
+ /* Three consecutive full-res rows; src2 clamps at the bottom edge. */
1160+ fenc += y * stride * 2;
1161+ const global pixel *src1 = fenc + stride;
1162+ const global pixel *src2 = (y == get_global_size( 1 )-1) ? src1 : src1 + stride;
1163+ int2 pos = (int2)(x, y);
1164+ pixel right, left;
1165+
+ /* Each plane pixel is a 2x2 box filter (two rhadd stages) at the four
+ * half-pel phases F/H/V/C.
+ * NOTE(review): 'right' is loaded from the left column and 'left' from the
+ * right column — the names are swapped, but the averaged result is the same. */
1166+ right = rhadd( fenc[x*2], src1[x*2] );
1167+ left = rhadd( fenc[x*2+1], src1[x*2+1] );
1168+ values.s0 = rhadd( right, left ); // F
1169+
1170+ right = rhadd( fenc[2*x+1], src1[2*x+1] );
1171+ left = rhadd( fenc[2*x+2], src1[2*x+2] );
1172+ values.s1 = rhadd( right, left ); // H
1173+
1174+ right = rhadd( src1[2*x], src2[2*x] );
1175+ left = rhadd( src1[2*x+1], src2[2*x+1] );
1176+ values.s2 = rhadd( right, left ); // V
1177+
1178+ right = rhadd( src1[2*x+1], src2[2*x+1] );
1179+ left = rhadd( src1[2*x+2], src2[2*x+2] );
1180+ values.s3 = rhadd( right, left ); // C
1181+
+ /* Pack the four planes into one uint as C|V|H|F (see header comment). */
1182+ uint4 val = (uint4) ((values.s3 & 0xff) << 24) | ((values.s2 & 0xff) << 16) | ((values.s1 & 0xff) << 8) | (values.s0 & 0xff);
1183+ write_imageui( hpel_planes, pos, val );
1184+
+ /* Fill s1..s3 with the next three lowres (F) pixels, clamped at the right
+ * edge, producing the 4x-replicated fenc image described above. */
1185+ x = x+1 < get_global_size( 0 ) ? x+1 : x;
1186+ right = rhadd( fenc[x*2], src1[x*2] );
1187+ left = rhadd( fenc[x*2+1], src1[x*2+1] );
1188+ values.s1 = rhadd( right, left );
1189+
1190+ x = x+1 < get_global_size( 0 ) ? x+1 : x;
1191+ right = rhadd( fenc[x*2], src1[x*2] );
1192+ left = rhadd( fenc[x*2+1], src1[x*2+1] );
1193+ values.s2 = rhadd( right, left );
1194+
1195+ x = x+1 < get_global_size( 0 ) ? x+1 : x;
1196+ right = rhadd( fenc[x*2], src1[x*2] );
1197+ left = rhadd( fenc[x*2+1], src1[x*2+1] );
1198+ values.s3 = rhadd( right, left );
1199+
1200+ write_imageui( fenc_img, pos, values );
1201+}
1202+
1203+/*
1204+ * downscale lowres hierarchical motion search image, copy from one image to
1205+ * another decimated image. This kernel is called iteratively to generate all
1206+ * of the downscales.
1207+ *
1208+ * launch dimensions: [lower_res width, lower_res height]
1209+ */
1210+kernel void downscale1( read_only image2d_t higher_res, write_only image2d_t lower_res )
1211+{
1212+ int x = get_global_id( 0 );
1213+ int y = get_global_id( 1 );
1214+ int2 pos = (int2)(x, y);
1215+ int gs = get_global_size( 0 );
1216+ uint4 top, bot, values;
+ /* Each output component is a 2x2 box filter (nested rhadd) of the
+ * higher-res image; one image read yields 4 adjacent pixels. */
1217+ top = read_imageui( higher_res, sampler, (int2)( x*2, 2*y ) );
1218+ bot = read_imageui( higher_res, sampler, (int2)( x*2, 2*y+1 ) );
1219+ values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
1220+
1221+ /* these select statements appear redundant, and they should be, but tests break when
1222+ * they are not here. I believe this was caused by a driver bug
1223+ */
1224+ values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
1225+ top = read_imageui( higher_res, sampler, (int2)( x*2+4, 2*y ) );
1226+ bot = read_imageui( higher_res, sampler, (int2)( x*2+4, 2*y+1 ) );
1227+ values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
1228+ values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
1229+ write_imageui( lower_res, pos, (uint4)(values) );
1230+}
1231+
1232+/*
1233+ * Second copy of downscale kernel, no differences. This is a (no perf loss)
1234+ * workaround for a scheduling bug in current Tahiti drivers. This bug has
1235+ * theoretically been fixed in the July 2012 driver release from AMD.
1236+ */
+/* Byte-for-byte duplicate of downscale1; kept separate only as the driver
+ * scheduling workaround described in the preceding comment. Keep the two
+ * bodies in sync if either is ever changed. */
1237+kernel void downscale2( read_only image2d_t higher_res, write_only image2d_t lower_res )
1238+{
1239+ int x = get_global_id( 0 );
1240+ int y = get_global_id( 1 );
1241+ int2 pos = (int2)(x, y);
1242+ int gs = get_global_size( 0 );
1243+ uint4 top, bot, values;
1244+ top = read_imageui( higher_res, sampler, (int2)( x*2, 2*y ) );
1245+ bot = read_imageui( higher_res, sampler, (int2)( x*2, 2*y+1 ) );
1246+ values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
1247+
1248+ // see comment in above function copy
1249+ values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
1250+ top = read_imageui( higher_res, sampler, (int2)( x*2+4, 2*y ) );
1251+ bot = read_imageui( higher_res, sampler, (int2)( x*2+4, 2*y+1 ) );
1252+ values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
1253+ values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
1254+ write_imageui( lower_res, pos, (uint4)(values) );
1255+}
1256+
1257+/* OpenCL 1.2 finally added a memset command, but we're not targeting 1.2 */
+/* One work-item stores one int16_t element of buf. */
1258+kernel void memset_int16( global int16_t *buf, int16_t value )
1259+{
1260+ buf[get_global_id( 0 )] = value;
1261+}
1262diff -r bdffc2c1e85b -r e5e4a79d3f21 common/opencl/intra.cl
1263--- /dev/null Thu Jan 01 00:00:00 1970 +0000
1264+++ b/common/opencl/intra.cl Mon Aug 20 22:33:23 2012 -0500
1265@@ -0,0 +1,1073 @@
1266+/* Lookahead lowres intra analysis
1267+ *
1268+ * Each intra analysis function has been implemented twice, once for scalar GPUs
1269+ * (NV) and once for vectorized GPUs (AMD pre-Southern Islands). x264 detects
1270+ * the GPU type and sets the -DVECTORIZE compile flag accordingly.
1271+ *
1272+ * All the intra analysis functions were based on their C versions.
1273+ */
1274+
1275+#if VECTORIZE
+/* SATD of an 8x4 block of 'data' against four 8-wide predicted rows
+ * (pr0..pr3), fully in registers: per-row 4-point Hadamard on the residual,
+ * then a vertical pass over the four rows, sum of absolute values, halved
+ * (the >>1 at the end) per the usual SATD normalization. */
1276+int satd_8x4_intra_lr( const local pixel *data, int data_stride, int8 pr0, int8 pr1, int8 pr2, int8 pr3 )
1277+{
1278+ int8 a_v, d_v;
1279+ int2 tmp00, tmp01, tmp02, tmp03, tmp10, tmp11, tmp12, tmp13;
1280+ int2 tmp20, tmp21, tmp22, tmp23, tmp30, tmp31, tmp32, tmp33;
1281+
+ /* .s04152637 interleaves even/odd residual columns before the horizontal
+ * Hadamard — assumed to pair columns for the two 4-wide transforms; TODO confirm
+ * against HADAMARD4V's lane convention. */
1282+ d_v = convert_int8( vload8( 0, data ) );
1283+ a_v.s01234567 = (d_v - pr0).s04152637;
1284+ HADAMARD4V( tmp00, tmp01, tmp02, tmp03, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
1285+
1286+ data += data_stride;
1287+ d_v = convert_int8( vload8( 0, data ) );
1288+ a_v.s01234567 = (d_v - pr1).s04152637;
1289+ HADAMARD4V( tmp10, tmp11, tmp12, tmp13, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
1290+
1291+ data += data_stride;
1292+ d_v = convert_int8( vload8( 0, data ) );
1293+ a_v.s01234567 = (d_v - pr2).s04152637;
1294+ HADAMARD4V( tmp20, tmp21, tmp22, tmp23, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
1295+
1296+ data += data_stride;
1297+ d_v = convert_int8( vload8( 0, data ) );
1298+ a_v.s01234567 = (d_v - pr3).s04152637;
1299+ HADAMARD4V( tmp30, tmp31, tmp32, tmp33, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
1300+
1301+ uint8 sum_v;
1302+
+ /* Vertical Hadamard across the four rows, accumulating |coefficients|. */
1303+ HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp00, tmp10, tmp20, tmp30 );
1304+ sum_v = abs( a_v );
1305+
1306+ HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp01, tmp11, tmp21, tmp31 );
1307+ sum_v += abs( a_v );
1308+
1309+ HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp02, tmp12, tmp22, tmp32 );
1310+ sum_v += abs( a_v );
1311+
1312+ HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp03, tmp13, tmp23, tmp33 );
1313+ sum_v += abs( a_v );
1314+
+ /* Horizontal reduction of the 8 lanes to one scalar. */
1315+ uint4 sum2 = sum_v.hi + sum_v.lo;
1316+ uint2 sum3 = sum2.hi + sum2.lo;
1317+ return ( sum3.hi + sum3.lo ) >> 1;
1318+}
1319+#else
+/* NOTE(review): the scalar branch generates a function named satd_8x4_lp
+ * while the vector branch defines satd_8x4_intra_lr — presumably the
+ * non-VECTORIZE predictors (not visible here) call satd_8x4_lp; confirm. */
1320+SATD_C_8x4_Q( satd_8x4_lp, const local, private )
1321+#endif
1322+
1323+/****************************************************************************
1324+ * 8x8 prediction for intra luma block
1325+ ****************************************************************************/
1326+
+/* F1 = rounding average of two neighbors; F2 = (a + 2b + c + 2) >> 2, the
+ * standard 3-tap intra filter.
+ * NOTE(review): F2's expansion is not fully parenthesized — safe in the
+ * 'x = F2(...)' uses in this file, but would mis-bind inside a larger
+ * expression. */
1327+#define F1 rhadd
1328+#define F2( a, b, c ) ( a+2*b+c+2 )>>2
1329+
1330+#if VECTORIZE
+/* SATD cost of the 8x8 DDL (diagonal down-left) intra mode: the predicted
+ * rows are built in registers from the 'top' neighbors with the (1,2,1)/4
+ * filter and SATD'd against the source, one 8x4 half at a time — the
+ * prediction is never written to memory. */
1331+int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
1332+{
1333+ int8 pr0, pr1, pr2, pr3;
1334+
1335+ // Upper half of pred[]
1336+ pr0.s0 = ( 2 + top[0] + 2*top[1] + top[2] ) >> 2;
1337+ pr0.s1 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
1338+ pr0.s2 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
1339+ pr0.s3 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
1340+ pr0.s4 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
1341+ pr0.s5 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
1342+ pr0.s6 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
1343+ pr0.s7 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
1344+
1345+ pr1.s0 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
1346+ pr1.s1 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
1347+ pr1.s2 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
1348+ pr1.s3 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
1349+ pr1.s4 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
1350+ pr1.s5 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
1351+ pr1.s6 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
1352+ pr1.s7 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
1353+
1354+ pr2.s0 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
1355+ pr2.s1 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
1356+ pr2.s2 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
1357+ pr2.s3 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
1358+ pr2.s4 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
1359+ pr2.s5 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
1360+ pr2.s6 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
1361+ pr2.s7 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
1362+
1363+ pr3.s0 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
1364+ pr3.s1 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
1365+ pr3.s2 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
1366+ pr3.s3 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
1367+ pr3.s4 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
1368+ pr3.s5 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
1369+ pr3.s6 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
1370+ pr3.s7 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
1371+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
1372+
1373+ // Lower half of pred[]
1374+ pr0.s0 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
1375+ pr0.s1 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
1376+ pr0.s2 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
1377+ pr0.s3 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
1378+ pr0.s4 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
1379+ pr0.s5 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
1380+ pr0.s6 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
1381+ pr0.s7 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
1382+
1383+ pr1.s0 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
1384+ pr1.s1 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
1385+ pr1.s2 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
1386+ pr1.s3 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
1387+ pr1.s4 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
1388+ pr1.s5 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
1389+ pr1.s6 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
1390+ pr1.s7 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
1391+
1392+ pr2.s0 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
1393+ pr2.s1 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
1394+ pr2.s2 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
1395+ pr2.s3 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
1396+ pr2.s4 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
1397+ pr2.s5 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
1398+ pr2.s6 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
1399+ pr2.s7 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
1400+
1401+ pr3.s0 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
1402+ pr3.s1 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
1403+ pr3.s2 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
1404+ pr3.s3 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
1405+ pr3.s4 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
1406+ pr3.s5 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
1407+ pr3.s6 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
+ /* Corner: top[15] is repeated (a + 3b form) at the block's far edge. */
1408+ pr3.s7 = ( 2 + top[14] + 3*top[15] ) >> 2;
1409+
1410+ return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
1411+}
1412+
+/* SATD cost of the 8x8 DDR (diagonal down-right) intra mode; prediction is
+ * built from 'left', 'left_top' and 'top' neighbors with F2, and evaluated
+ * in two 8x4 halves. */
1413+int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
1414+{
1415+ int8 pr0, pr1, pr2, pr3;
1416+
1417+ // Upper half of pred[]
1418+ pr3.s0 = F2( left[1], left[2], left[3] );
1419+ pr2.s0 = pr3.s1 = F2( left[0], left[1], left[2] );
1420+ pr1.s0 = pr2.s1 = pr3.s2 = F2( left[1], left[0], left_top );
1421+ pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
1422+ pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
1423+ pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
1424+ pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
1425+ pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
1426+ pr0.s5 = pr1.s6 = pr2.s7 = F2( top[3], top[4], top[5] );
1427+ pr0.s6 = pr1.s7 = F2( top[4], top[5], top[6] );
1428+ pr0.s7 = F2( top[5], top[6], top[7] );
1429+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
1430+
1431+ // Lower half of pred[]
1432+ pr3.s0 = F2( left[5], left[6], left[7] );
1433+ pr2.s0 = pr3.s1 = F2( left[4], left[5], left[6] );
1434+ pr1.s0 = pr2.s1 = pr3.s2 = F2( left[3], left[4], left[5] );
1435+ pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[2], left[3], left[4] );
1436+ pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left[1], left[2], left[3] );
1437+ pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( left[0], left[1], left[2] );
1438+ pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( left[1], left[0], left_top );
1439+ pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( left[0], left_top, top[0] );
1440+ pr0.s5 = pr1.s6 = pr2.s7 = F2( left_top, top[0], top[1] );
1441+ pr0.s6 = pr1.s7 = F2( top[0], top[1], top[2] );
1442+ pr0.s7 = F2( top[1], top[2], top[3] );
1443+ return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
1444+}
1445+
+/* SATD cost of the 8x8 VR (vertical-right) intra mode, two 8x4 halves;
+ * even rows use F1 (2-tap), odd rows F2 (3-tap), along the diagonal. */
1446+int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
1447+{
1448+ int8 pr0, pr1, pr2, pr3;
1449+
1450+ // Upper half of pred[]
1451+ pr2.s0 = F2( left[1], left[0], left_top );
1452+ pr3.s0 = F2( left[2], left[1], left[0] );
1453+ pr1.s0 = pr3.s1 = F2( left[0], left_top, top[0] );
1454+ pr0.s0 = pr2.s1 = F1( left_top, top[0] );
1455+ pr1.s1 = pr3.s2 = F2( left_top, top[0], top[1] );
1456+ pr0.s1 = pr2.s2 = F1( top[0], top[1] );
1457+ pr1.s2 = pr3.s3 = F2( top[0], top[1], top[2] );
1458+ pr0.s2 = pr2.s3 = F1( top[1], top[2] );
1459+ pr1.s3 = pr3.s4 = F2( top[1], top[2], top[3] );
1460+ pr0.s3 = pr2.s4 = F1( top[2], top[3] );
1461+ pr1.s4 = pr3.s5 = F2( top[2], top[3], top[4] );
1462+ pr0.s4 = pr2.s5 = F1( top[3], top[4] );
1463+ pr1.s5 = pr3.s6 = F2( top[3], top[4], top[5] );
1464+ pr0.s5 = pr2.s6 = F1( top[4], top[5] );
1465+ pr1.s6 = pr3.s7 = F2( top[4], top[5], top[6] );
1466+ pr0.s6 = pr2.s7 = F1( top[5], top[6] );
1467+ pr1.s7 = F2( top[5], top[6], top[7] );
1468+ pr0.s7 = F1( top[6], top[7] );
1469+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
1470+
1471+ // Lower half of pred[]
1472+ pr2.s0 = F2( left[5], left[4], left[3] );
1473+ pr3.s0 = F2( left[6], left[5], left[4] );
1474+ pr0.s0 = pr2.s1 = F2( left[3], left[2], left[1] );
1475+ pr1.s0 = pr3.s1 = F2( left[4], left[3], left[2] );
1476+ pr0.s1 = pr2.s2 = F2( left[1], left[0], left_top );
1477+ pr1.s1 = pr3.s2 = F2( left[2], left[1], left[0] );
1478+ pr1.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
1479+ pr0.s2 = pr2.s3 = F1( left_top, top[0] );
1480+ pr1.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
1481+ pr0.s3 = pr2.s4 = F1( top[0], top[1] );
1482+ pr1.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
1483+ pr0.s4 = pr2.s5 = F1( top[1], top[2] );
1484+ pr1.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
1485+ pr0.s5 = pr2.s6 = F1( top[2], top[3] );
1486+ pr1.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
1487+ pr0.s6 = pr2.s7 = F1( top[3], top[4] );
1488+ pr1.s7 = F2( top[3], top[4], top[5] );
1489+ pr0.s7 = F1( top[4], top[5] );
1490+ return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
+/* NOTE(review): no matching '#define PRED' exists in this function — this
+ * #undef is a leftover and is harmless, but could be removed. */
1491+#undef PRED
1492+}
1493+
+/* SATD cost of the 8x8 HD (horizontal-down) intra mode, two 8x4 halves. */
1494+int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
1495+{
1496+ int8 pr0, pr1, pr2, pr3;
1497+
1498+ // Upper half of pred[]
1499+ pr0.s0 = F1( left_top, left[0] ); pr0.s1 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
1500+ pr0.s2 = F2( top[1], top[0], left_top ); pr0.s3 = F2( top[2], top[1], top[0] );
1501+ pr0.s4 = F2( top[3], top[2], top[1] ); pr0.s5 = F2( top[4], top[3], top[2] );
1502+ pr0.s6 = F2( top[5], top[4], top[3] ); pr0.s7 = F2( top[6], top[5], top[4] );
1503+
1504+ pr1.s0 = F1( left[0], left[1] ); pr1.s1 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
1505+ pr1.s2 = F1( left_top, left[0] ); pr1.s3 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
1506+ pr1.s4 = F2( top[1], top[0], left_top ); pr1.s5 = F2( top[2], top[1], top[0] );
1507+ pr1.s6 = F2( top[3], top[2], top[1] ); pr1.s7 = F2( top[4], top[3], top[2] );
1508+
1509+ pr2.s0 = F1( left[1], left[2] ); pr2.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
1510+ pr2.s2 = F1( left[0], left[1] ); pr2.s3 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
1511+ pr2.s4 = F1( left_top, left[0] ); pr2.s5 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
1512+ pr2.s6 = F2( top[1], top[0], left_top ); pr2.s7 = F2( top[2], top[1], top[0] );
1513+
1514+ pr3.s0 = F1( left[2], left[3] ); pr3.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
1515+ pr3.s2 = F1( left[1], left[2] ); pr3.s3 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
1516+ pr3.s4 = F1( left[0], left[1] ); pr3.s5 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
1517+ pr3.s6 = F1( left_top, left[0] ); pr3.s7 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
1518+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
1519+
1520+ // Lower half of pred[]
1521+ pr0.s0 = F1( left[3], left[4] ); pr0.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
1522+ pr0.s2 = F1( left[2], left[3] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
1523+ pr0.s4 = F1( left[1], left[2] ); pr0.s5 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
1524+ pr0.s6 = F1( left[0], left[1] ); pr0.s7 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
1525+
1526+ pr1.s0 = F1( left[4], left[5] ); pr1.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
1527+ pr1.s2 = F1( left[3], left[4] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
1528+ pr1.s4 = F1( left[2], left[3] ); pr1.s5 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
1529+ pr1.s6 = F1( left[1], left[2] ); pr1.s7 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
1530+
1531+ pr2.s0 = F1( left[5], left[6] ); pr2.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
1532+ pr2.s2 = F1( left[4], left[5] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
1533+ pr2.s4 = F1( left[3], left[4] ); pr2.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
1534+ pr2.s6 = F1( left[2], left[3] ); pr2.s7 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
1535+
1536+ pr3.s0 = F1( left[6], left[7] ); pr3.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
1537+ pr3.s2 = F1( left[5], left[6] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
1538+ pr3.s4 = F1( left[4], left[5] ); pr3.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
1539+ pr3.s6 = F1( left[3], left[4] ); pr3.s7 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
1540+ return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
1541+}
1542+
+/* SATD cost of the 8x8 VL (vertical-left) intra mode, built entirely from
+ * 'top' neighbors (F1 on even rows, F2 on odd rows), two 8x4 halves. */
1543+int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
1544+{
1545+ int8 pr0, pr1, pr2, pr3;
1546+
1547+ // Upper half of pred[]
1548+ pr0.s0 = F1( top[0], top[1] );
1549+ pr1.s0 = F2( top[0], top[1], top[2] );
1550+ pr2.s0 = pr0.s1 = F1( top[1], top[2] );
1551+ pr3.s0 = pr1.s1 = F2( top[1], top[2], top[3] );
1552+ pr2.s1 = pr0.s2 = F1( top[2], top[3] );
1553+ pr3.s1 = pr1.s2 = F2( top[2], top[3], top[4] );
1554+ pr2.s2 = pr0.s3 = F1( top[3], top[4] );
1555+ pr3.s2 = pr1.s3 = F2( top[3], top[4], top[5] );
1556+ pr2.s3 = pr0.s4 = F1( top[4], top[5] );
1557+ pr3.s3 = pr1.s4 = F2( top[4], top[5], top[6] );
1558+ pr2.s4 = pr0.s5 = F1( top[5], top[6] );
1559+ pr3.s4 = pr1.s5 = F2( top[5], top[6], top[7] );
1560+ pr2.s5 = pr0.s6 = F1( top[6], top[7] );
1561+ pr3.s5 = pr1.s6 = F2( top[6], top[7], top[8] );
1562+ pr2.s6 = pr0.s7 = F1( top[7], top[8] );
1563+ pr3.s6 = pr1.s7 = F2( top[7], top[8], top[9] );
1564+ pr2.s7 = F1( top[8], top[9] );
1565+ pr3.s7 = F2( top[8], top[9], top[10] );
1566+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
1567+
1568+ // Lower half of pred[]
1569+ pr0.s0 = F1( top[2], top[3] );
1570+ pr1.s0 = F2( top[2], top[3], top[4] );
1571+ pr2.s0 = pr0.s1 = F1( top[3], top[4] );
1572+ pr3.s0 = pr1.s1 = F2( top[3], top[4], top[5] );
1573+ pr2.s1 = pr0.s2 = F1( top[4], top[5] );
1574+ pr3.s1 = pr1.s2 = F2( top[4], top[5], top[6] );
1575+ pr2.s2 = pr0.s3 = F1( top[5], top[6] );
1576+ pr3.s2 = pr1.s3 = F2( top[5], top[6], top[7] );
1577+ pr2.s3 = pr0.s4 = F1( top[6], top[7] );
1578+ pr3.s3 = pr1.s4 = F2( top[6], top[7], top[8] );
1579+ pr2.s4 = pr0.s5 = F1( top[7], top[8] );
1580+ pr3.s4 = pr1.s5 = F2( top[7], top[8], top[9] );
1581+ pr2.s5 = pr0.s6 = F1( top[8], top[9] );
1582+ pr3.s5 = pr1.s6 = F2( top[8], top[9], top[10] );
1583+ pr2.s6 = pr0.s7 = F1( top[9], top[10] );
1584+ pr3.s6 = pr1.s7 = F2( top[9], top[10], top[11] );
1585+ pr2.s7 = F1( top[10], top[11] );
1586+ pr3.s7 = F2( top[10], top[11], top[12] );
1587+ return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
1588+}
1589+
+/* SATD cost of the 8x8 HU (horizontal-up) intra mode; prediction positions
+ * past the last 'left' neighbor saturate to left[7], two 8x4 halves. */
1590+int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
1591+{
1592+ int8 pr0, pr1, pr2, pr3;
1593+
1594+ // Upper half of pred[]
1595+ pr0.s0 = F1( left[0], left[1] ); pr0.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
1596+ pr0.s2 = F1( left[1], left[2] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
1597+ pr0.s4 = F1( left[2], left[3] ); pr0.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
1598+ pr0.s6 = F1( left[3], left[4] ); pr0.s7 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
1599+
1600+ pr1.s0 = F1( left[1], left[2] ); pr1.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
1601+ pr1.s2 = F1( left[2], left[3] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
1602+ pr1.s4 = F1( left[3], left[4] ); pr1.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
1603+ pr1.s6 = F1( left[4], left[5] ); pr1.s7 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
1604+
1605+ pr2.s0 = F1( left[2], left[3] ); pr2.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
1606+ pr2.s2 = F1( left[3], left[4] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
1607+ pr2.s4 = F1( left[4], left[5] ); pr2.s5 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
1608+ pr2.s6 = F1( left[5], left[6] ); pr2.s7 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
1609+
1610+ pr3.s0 = F1( left[3], left[4] ); pr3.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
1611+ pr3.s2 = F1( left[4], left[5] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
1612+ pr3.s4 = F1( left[5], left[6] ); pr3.s5 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
1613+ pr3.s6 = F1( left[6], left[7] ); pr3.s7 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
1614+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
1615+
1616+ // Lower half of pred[]
1617+ pr0.s0 = F1( left[4], left[5] ); pr0.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
1618+ pr0.s2 = F1( left[5], left[6] ); pr0.s3 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
1619+ pr0.s4 = F1( left[6], left[7] ); pr0.s5 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
+ /* From here on the diagonal has run out of neighbors: replicate left[7]. */
1620+ pr0.s6 = left[7]; pr0.s7 = left[7];
1621+
1622+ pr1.s0 = F1( left[5], left[6] ); pr1.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
1623+ pr1.s2 = F1( left[6], left[7] ); pr1.s3 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
1624+ pr1.s4 = left[7]; pr1.s5 = left[7];
1625+ pr1.s6 = left[7]; pr1.s7 = left[7];
1626+
1627+ pr2.s0 = F1( left[6], left[7] ); pr2.s1 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
1628+ pr2.s2 = left[7]; pr2.s3 = left[7];
1629+ pr2.s4 = left[7]; pr2.s5 = left[7];
1630+ pr2.s6 = left[7]; pr2.s7 = left[7];
1631+
1632+ pr3 = (int8)left[7];
1633+
1634+ return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
1635+}
1636+
+/* SATD cost of 8x8 horizontal intra prediction: each predicted row is the
+ * pixel immediately to its left (src[-1]) broadcast across the row. */
1637+int x264_predict_8x8c_h( const local pixel *src, int src_stride )
1638+{
1639+ const local pixel *src_l = src;
1640+ int8 pr0, pr1, pr2, pr3;
1641+
1642+ // Upper half of pred[]
1643+ pr0 = (int8)src[-1]; src += src_stride;
1644+ pr1 = (int8)src[-1]; src += src_stride;
1645+ pr2 = (int8)src[-1]; src += src_stride;
1646+ pr3 = (int8)src[-1]; src += src_stride;
1647+ int satd = satd_8x4_intra_lr( src_l, src_stride, pr0, pr1, pr2, pr3 );
1648+
1649+ //Lower half of pred[]
1650+ pr0 = (int8)src[-1]; src += src_stride;
1651+ pr1 = (int8)src[-1]; src += src_stride;
1652+ pr2 = (int8)src[-1]; src += src_stride;
1653+ pr3 = (int8)src[-1];
1654+ return satd + satd_8x4_intra_lr( src_l + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
1655+}
1656+
+/* SATD cost of 8x8 vertical intra prediction: all eight rows replicate the
+ * row of pixels directly above the block (src[-src_stride]). */
1657+int x264_predict_8x8c_v( const local pixel *src, int src_stride )
1658+{
1659+ int8 pred = convert_int8( vload8( 0, &src[-src_stride] ));
1660+ return satd_8x4_intra_lr( src, src_stride, pred, pred, pred, pred ) +
1661+ satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pred, pred, pred, pred );
1662+}
1663+
1664+int x264_predict_8x8c_p( const local pixel *src, int src_stride )
1665+{
1666+ int H = 0, V = 0;
1667+ for( int i = 0; i < 4; i++ )
1668+ {
1669+ H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
1670+ V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
1671+ }
1672+
1673+ int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
1674+ int b = (17 * H + 16) >> 5;
1675+ int c = (17 * V + 16) >> 5;
1676+ int i00 = a - 3 * b - 3 * c + 16;
1677+
1678+ // Upper half of pred[]
1679+ int pix = i00;
1680+ int8 pr0, pr1, pr2, pr3;
1681+ pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
1682+ pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
1683+ pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
1684+ pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
1685+ pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
1686+ pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
1687+ pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
1688+ pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
1689+
1690+ pix = i00;
1691+ pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
1692+ pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
1693+ pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
1694+ pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
1695+ pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
1696+ pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
1697+ pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
1698+ pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
1699+
1700+ pix = i00;
1701+ pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
1702+ pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
1703+ pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
1704+ pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
1705+ pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
1706+ pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
1707+ pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
1708+ pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
1709+
1710+ pix = i00;
1711+ pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
1712+ pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
1713+ pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
1714+ pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
1715+ pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
1716+ pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
1717+ pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
1718+ pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
1719+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
1720+
1721+ //Lower half of pred[]
1722+ pix = i00;
1723+ pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
1724+ pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
1725+ pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
1726+ pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
1727+ pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
1728+ pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
1729+ pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
1730+ pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
1731+
1732+ pix = i00;
1733+ pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
1734+ pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
1735+ pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
1736+ pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
1737+ pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
1738+ pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
1739+ pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
1740+ pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
1741+
1742+ pix = i00;
1743+ pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
1744+ pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
1745+ pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
1746+ pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
1747+ pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
1748+ pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
1749+ pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
1750+ pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
1751+
1752+ pix = i00;
1753+ pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
1754+ pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
1755+ pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
1756+ pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
1757+ pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
1758+ pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
1759+ pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
1760+ pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
1761+ return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
1762+}
1763+
1764+int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
1765+{
1766+ int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
1767+ for( int i = 0; i < 4; i++ )
1768+ {
1769+ s0 += src[i - src_stride];
1770+ s1 += src[i + 4 - src_stride];
1771+ s2 += src[-1 + i * src_stride];
1772+ s3 += src[-1 + (i+4)*src_stride];
1773+ }
1774+
1775+ // Upper half of pred[]
1776+ int8 dc0;
1777+ dc0.lo = (int4)( (s0 + s2 + 4) >> 3 );
1778+ dc0.hi = (int4)( (s1 + 2) >> 2 );
1779+ int satd = satd_8x4_intra_lr( src, src_stride, dc0, dc0, dc0, dc0 );
1780+
1781+ // Lower half of pred[]
1782+ dc0.lo = (int4)( (s3 + 2) >> 2 );
1783+ dc0.hi = (int4)( (s1 + s3 + 4) >> 3 );
1784+ return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, dc0, dc0, dc0, dc0 );
1785+}
1786+
1787+#else /* not vectorized: private is cheap, registers are scarce */
1788+
1789+int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
1790+{
1791+ private pixel pred[32];
1792+ int x_plus_y;
1793+
1794+ // Upper half of pred[]
1795+ for( int y = 0; y < 4; y++ )
1796+ {
1797+ for( int x = 0; x < 8; x++ )
1798+ {
1799+ x_plus_y = clamp( x + y, 0, 13 );
1800+ pred[x + y*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
1801+ }
1802+ }
1803+ int satd = satd_8x4_lp( src, src_stride, pred, 8 );
1804+ //Lower half of pred[]
1805+ for( int y = 4; y < 8; y++ )
1806+ {
1807+ for( int x = 0; x < 8; x++ )
1808+ {
1809+ x_plus_y = clamp( x + y, 0, 13 );
1810+ pred[x + ( y - 4 )*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
1811+ }
1812+ }
1813+ pred[31] = ( 2 + top[14] + 3*top[15] ) >> 2;
1814+ satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
1815+ return satd;
1816+}
1817+
1818+int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
1819+{
1820+ private pixel pred[32];
1821+#define PRED( x, y ) pred[(x) + (y)*8]
1822+ // Upper half of pred[]
1823+ PRED( 0, 3 ) = F2( left[1], left[2], left[3] );
1824+ PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[0], left[1], left[2] );
1825+ PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[1], left[0], left_top );
1826+ PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
1827+ PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
1828+ PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
1829+ PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
1830+ PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
1831+ PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( top[3], top[4], top[5] );
1832+ PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[4], top[5], top[6] );
1833+ PRED( 7, 0 ) = F2( top[5], top[6], top[7] );
1834+ int satd = satd_8x4_lp( src, src_stride, pred, 8 );
1835+
1836+ // Lower half of pred[]
1837+ PRED( 0, 3 ) = F2( left[5], left[6], left[7] );
1838+ PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[4], left[5], left[6] );
1839+ PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[3], left[4], left[5] );
1840+ PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[2], left[3], left[4] );
1841+ PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left[1], left[2], left[3] );
1842+ PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( left[0], left[1], left[2] );
1843+ PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( left[1], left[0], left_top );
1844+ PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( left[0], left_top, top[0] );
1845+ PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( left_top, top[0], top[1] );
1846+ PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[0], top[1], top[2] );
1847+ PRED( 7, 0 ) = F2( top[1], top[2], top[3] );
1848+ satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
1849+ return satd;
1850+#undef PRED
1851+}
1852+
1853+int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
1854+{
1855+ private pixel pred[32];
1856+#define PRED( x, y ) pred[(x) + (y)*8]
1857+ // Upper half of pred[]
1858+ PRED( 0, 2 ) = F2( left[1], left[0], left_top );
1859+ PRED( 0, 3 ) = F2( left[2], left[1], left[0] );
1860+ PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[0], left_top, top[0] );
1861+ PRED( 0, 0 ) = PRED( 1, 2 ) = F1( left_top, top[0] );
1862+ PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left_top, top[0], top[1] );
1863+ PRED( 1, 0 ) = PRED( 2, 2 ) = F1( top[0], top[1] );
1864+ PRED( 2, 1 ) = PRED( 3, 3 ) = F2( top[0], top[1], top[2] );
1865+ PRED( 2, 0 ) = PRED( 3, 2 ) = F1( top[1], top[2] );
1866+ PRED( 3, 1 ) = PRED( 4, 3 ) = F2( top[1], top[2], top[3] );
1867+ PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[2], top[3] );
1868+ PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[2], top[3], top[4] );
1869+ PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[3], top[4] );
1870+ PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[3], top[4], top[5] );
1871+ PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[4], top[5] );
1872+ PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[4], top[5], top[6] );
1873+ PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[5], top[6] );
1874+ PRED( 7, 1 ) = F2( top[5], top[6], top[7] );
1875+ PRED( 7, 0 ) = F1( top[6], top[7] );
1876+ int satd = satd_8x4_lp( src, src_stride, pred, 8 );
1877+
1878+ //Lower half of pred[]
1879+ PRED( 0, 2 ) = F2( left[5], left[4], left[3] );
1880+ PRED( 0, 3 ) = F2( left[6], left[5], left[4] );
1881+ PRED( 0, 0 ) = PRED( 1, 2 ) = F2( left[3], left[2], left[1] );
1882+ PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[4], left[3], left[2] );
1883+ PRED( 1, 0 ) = PRED( 2, 2 ) = F2( left[1], left[0], left_top );
1884+ PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left[2], left[1], left[0] );
1885+ PRED( 2, 1 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
1886+ PRED( 2, 0 ) = PRED( 3, 2 ) = F1( left_top, top[0] );
1887+ PRED( 3, 1 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
1888+ PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[0], top[1] );
1889+ PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
1890+ PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[1], top[2] );
1891+ PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
1892+ PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[2], top[3] );
1893+ PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
1894+ PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[3], top[4] );
1895+ PRED( 7, 1 ) = F2( top[3], top[4], top[5] );
1896+ PRED( 7, 0 ) = F1( top[4], top[5] );
1897+ satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
1898+ return satd;
1899+#undef PRED
1900+}
1901+
1902+inline uint32_t pack16to32( uint32_t a, uint32_t b )
1903+{
1904+ return a + (b << 16);
1905+}
1906+
1907+inline uint32_t pack8to16( uint32_t a, uint32_t b )
1908+{
1909+ return a + (b << 8);
1910+}
1911+
1912+int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
1913+{
1914+ private pixel pred[32];
1915+ int satd;
1916+ int p1 = pack8to16( (F1( left[6], left[7] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
1917+ int p2 = pack8to16( (F1( left[5], left[6] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
1918+ int p3 = pack8to16( (F1( left[4], left[5] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
1919+ int p4 = pack8to16( (F1( left[3], left[4] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
1920+ int p5 = pack8to16( (F1( left[2], left[3] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
1921+ int p6 = pack8to16( (F1( left[1], left[2] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
1922+ int p7 = pack8to16( (F1( left[0], left[1] )), ((left_top + 2 * left[0] + left[1] + 2) >> 2) );
1923+ int p8 = pack8to16( (F1( left_top, left[0] )), ((left[0] + 2 * left_top + top[0] + 2) >> 2) );
1924+ int p9 = pack8to16( (F2( top[1], top[0], left_top )), (F2( top[2], top[1], top[0] )) );
1925+ int p10 = pack8to16( (F2( top[3], top[2], top[1] )), (F2( top[4], top[3], top[2] )) );
1926+ int p11 = pack8to16( (F2( top[5], top[4], top[3] )), (F2( top[6], top[5], top[4] )) );
1927+ // Upper half of pred[]
1928+ vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[0 + 0 * 8] );
1929+ vstore4( as_uchar4( pack16to32( p10, p11 ) ), 0, &pred[4 + 0 * 8] );
1930+ vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[0 + 1 * 8] );
1931+ vstore4( as_uchar4( pack16to32( p9, p10 ) ), 0, &pred[4 + 1 * 8] );
1932+ vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[0 + 2 * 8] );
1933+ vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[4 + 2 * 8] );
1934+ vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[0 + 3 * 8] );
1935+ vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[4 + 3 * 8] );
1936+ satd = satd_8x4_lp( src, src_stride, pred, 8 );
1937+ // Lower half of pred[]
1938+ vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[0 + 0 * 8] );
1939+ vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[4 + 0 * 8] );
1940+ vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[0 + 1 * 8] );
1941+ vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[4 + 1 * 8] );
1942+ vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[0 + 2 * 8] );
1943+ vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[4 + 2 * 8] );
1944+ vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[0 + 3 * 8] );
1945+ vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[4 + 3 * 8] );
1946+ satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
1947+ return satd;
1948+}
1949+
1950+int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
1951+{
1952+ private pixel pred[32];
1953+ int satd;
1954+#define PRED( x, y ) pred[(x) + (y)*8]
1955+ // Upper half of pred[]
1956+ PRED( 0, 0 ) = F1( top[0], top[1] );
1957+ PRED( 0, 1 ) = F2( top[0], top[1], top[2] );
1958+ PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[1], top[2] );
1959+ PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[1], top[2], top[3] );
1960+ PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[2], top[3] );
1961+ PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[2], top[3], top[4] );
1962+ PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[3], top[4] );
1963+ PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[3], top[4], top[5] );
1964+ PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[4], top[5] );
1965+ PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[4], top[5], top[6] );
1966+ PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[5], top[6] );
1967+ PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[5], top[6], top[7] );
1968+ PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[6], top[7] );
1969+ PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[6], top[7], top[8] );
1970+ PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[7], top[8] );
1971+ PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[7], top[8], top[9] );
1972+ PRED( 7, 2 ) = F1( top[8], top[9] );
1973+ PRED( 7, 3 ) = F2( top[8], top[9], top[10] );
1974+ satd = satd_8x4_lp( src, src_stride, pred, 8 );
1975+ // Lower half of pred[]
1976+ PRED( 0, 0 ) = F1( top[2], top[3] );
1977+ PRED( 0, 1 ) = F2( top[2], top[3], top[4] );
1978+ PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[3], top[4] );
1979+ PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[3], top[4], top[5] );
1980+ PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[4], top[5] );
1981+ PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[4], top[5], top[6] );
1982+ PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[5], top[6] );
1983+ PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[5], top[6], top[7] );
1984+ PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[6], top[7] );
1985+ PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[6], top[7], top[8] );
1986+ PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[7], top[8] );
1987+ PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[7], top[8], top[9] );
1988+ PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[8], top[9] );
1989+ PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[8], top[9], top[10] );
1990+ PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[9], top[10] );
1991+ PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[9], top[10], top[11] );
1992+ PRED( 7, 2 ) = F1( top[10], top[11] );
1993+ PRED( 7, 3 ) = F2( top[10], top[11], top[12] );
1994+ satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
1995+ return satd;
1996+#undef PRED
1997+}
1998+
1999+int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
2000+{
2001+ private pixel pred[32];
2002+ int satd;
2003+ int p1 = pack8to16( (F1( left[0], left[1] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
2004+ int p2 = pack8to16( (F1( left[1], left[2] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
2005+ int p3 = pack8to16( (F1( left[2], left[3] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
2006+ int p4 = pack8to16( (F1( left[3], left[4] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
2007+ int p5 = pack8to16( (F1( left[4], left[5] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
2008+ int p6 = pack8to16( (F1( left[5], left[6] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
2009+ int p7 = pack8to16( (F1( left[6], left[7] )), ((left[6] + 2 * left[7] + left[7] + 2) >> 2) );
2010+ int p8 = pack8to16( left[7], left[7] );
2011+ // Upper half of pred[]
2012+ vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
2013+ vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
2014+ vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
2015+ vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
2016+ vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
2017+ vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
2018+ vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
2019+ vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
2020+ satd = satd_8x4_lp( src, src_stride, pred, 8 );
2021+ // Lower half of pred[]
2022+ vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
2023+ vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
2024+ vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
2025+ vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
2026+ vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
2027+ vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
2028+ vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
2029+ vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
2030+ satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
2031+ return satd;
2032+}
2033+
2034+int x264_predict_8x8c_h( const local pixel *src, int src_stride )
2035+{
2036+ private pixel pred[32];
2037+ const local pixel *src_l = src;
2038+
2039+ // Upper half of pred[]
2040+ vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
2041+ vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
2042+ vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
2043+ vstore8( (uchar8)(src[-1]), 3, pred ); src += src_stride;
2044+ int satd = satd_8x4_lp( src_l, src_stride, pred, 8 );
2045+
2046+ // Lower half of pred[]
2047+ vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
2048+ vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
2049+ vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
2050+ vstore8( (uchar8)(src[-1]), 3, pred );
2051+ return satd + satd_8x4_lp( src_l + ( src_stride << 2 ), src_stride, pred, 8 );
2052+}
2053+
2054+int x264_predict_8x8c_v( const local pixel *src, int src_stride )
2055+{
2056+ private pixel pred[32];
2057+ uchar16 v16;
2058+ v16.lo = vload8( 0, &src[-src_stride] );
2059+ v16.hi = vload8( 0, &src[-src_stride] );
2060+
2061+ vstore16( v16, 0, pred );
2062+ vstore16( v16, 1, pred );
2063+
2064+ return satd_8x4_lp( src, src_stride, pred, 8 ) +
2065+ satd_8x4_lp( src + (src_stride << 2), src_stride, pred, 8 );
2066+}
2067+
2068+int x264_predict_8x8c_p( const local pixel *src, int src_stride )
2069+{
2070+ int H = 0, V = 0;
2071+ private pixel pred[32];
2072+ int satd;
2073+
2074+ for( int i = 0; i < 4; i++ )
2075+ {
2076+ H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
2077+ V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
2078+ }
2079+
2080+ int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
2081+ int b = (17 * H + 16) >> 5;
2082+ int c = (17 * V + 16) >> 5;
2083+ int i00 = a - 3 * b - 3 * c + 16;
2084+
2085+ // Upper half of pred[]
2086+ for( int y = 0; y < 4; y++ )
2087+ {
2088+ int pix = i00;
2089+ for( int x = 0; x < 8; x++ )
2090+ {
2091+ pred[x + y*8] = x264_clip_pixel( pix >> 5 );
2092+ pix += b;
2093+ }
2094+ i00 += c;
2095+ }
2096+ satd = satd_8x4_lp( src, src_stride, pred, 8 );
2097+ // Lower half of pred[]
2098+ for( int y = 0; y < 4; y++ )
2099+ {
2100+ int pix = i00;
2101+ for( int x = 0; x < 8; x++ )
2102+ {
2103+ pred[x + y*8] = x264_clip_pixel( pix >> 5 );
2104+ pix += b;
2105+ }
2106+ i00 += c;
2107+ }
2108+ satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
2109+ return satd;
2110+}
2111+
2112+int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
2113+{
2114+ private pixel pred[32];
2115+ int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
2116+ for( int i = 0; i < 4; i++ )
2117+ {
2118+ s0 += src[i - src_stride];
2119+ s1 += src[i + 4 - src_stride];
2120+ s2 += src[-1 + i * src_stride];
2121+ s3 += src[-1 + (i+4)*src_stride];
2122+ }
2123+
2124+ // Upper half of pred[]
2125+ uchar8 dc0;
2126+ dc0.lo = (uchar4)( (s0 + s2 + 4) >> 3 );
2127+ dc0.hi = (uchar4)( (s1 + 2) >> 2 );
2128+ vstore8( dc0, 0, pred );
2129+ vstore8( dc0, 1, pred );
2130+ vstore8( dc0, 2, pred );
2131+ vstore8( dc0, 3, pred );
2132+ int satd = satd_8x4_lp( src, src_stride, pred, 8 );
2133+
2134+ // Lower half of pred[]
2135+ dc0.lo = (uchar4)( (s3 + 2) >> 2 );
2136+ dc0.hi = (uchar4)( (s1 + s3 + 4) >> 3 );
2137+ vstore8( dc0, 0, pred );
2138+ vstore8( dc0, 1, pred );
2139+ vstore8( dc0, 2, pred );
2140+ vstore8( dc0, 3, pred );
2141+ return satd + satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
2142+}
2143+#endif
2144+
2145+/* Find the least cost intra mode for 32 8x8 macroblocks per workgroup
2146+ *
2147+ * Loads 33 macroblocks plus the pixels directly above them into local memory,
2148+ * padding where necessary with edge pixels. It then cooperatively calculates
2149+ * smoothed top and left pixels for use in some of the analysis.
2150+ *
2151+ * Then groups of 32 threads each calculate a single intra mode for each 8x8
2152+ * block. Since consecutive threads are calculating the same intra mode there
2153+ * is no code-path divergence. 8 intra costs are calculated simultaneously. If
2154+ * the "slow" argument is not zero, the final two (least likely) intra modes are
2155+ * tested in a second pass. The slow mode is only enabled for presets slow,
2156+ * slower, and placebo.
2157+ *
2158+ * Local dimension: [ 32 * 8, 8 ]
2159+ * Global dimensions: [ paddedWidth, height ]
2160+ */
2161+
2162+kernel void mb_intra_cost_satd_8x8(
2163+ read_only image2d_t fenc,
2164+ global uint16_t *fenc_intra_cost,
2165+ global int *frame_stats,
2166+ int lambda,
2167+ int mb_width,
2168+ int slow )
2169+{
2170+#define CACHE_STRIDE (265)
2171+#define BLOCK_OFFSET (266)
2172+ local pixel cache[2385];
2173+ local int cost_buf[32];
2174+ local pixel top[32 * 16];
2175+ local pixel left[32 * 8];
2176+ local pixel left_top[32];
2177+
2178+ int lx = get_local_id( 0 );
2179+ int ly = get_local_id( 1 );
2180+ int gx = get_global_id( 0 );
2181+ int gy = get_global_id( 1 );
2182+ int gidx = get_group_id( 0 );
2183+ int gidy = get_group_id( 1 );
2184+ int linear_id = ly * get_local_size( 0 ) + lx;
2185+ int satd = COST_MAX;
2186+ int basex = gidx << 8;
2187+ int basey = (gidy << 3) - 1;
2188+
2189+ for( int y = 0; y < 9; y++ ) // TODO: maybe unroll this loop later
2190+ {
2191+ for( int x = 4*linear_id; x < (33 * 8); x += 1024 )
2192+ {
2193+ uint4 data = read_imageui( fenc, sampler, (int2)(x + basex, y + basey) );
2194+ cache[y * CACHE_STRIDE + 1 + x] = data.s0;
2195+ cache[y * CACHE_STRIDE + 1 + x + 1] = data.s1;
2196+ cache[y * CACHE_STRIDE + 1 + x + 2] = data.s2;
2197+ cache[y * CACHE_STRIDE + 1 + x + 3] = data.s3;
2198+ }
2199+ }
2200+ if( linear_id < 9 )
2201+ cache[linear_id * CACHE_STRIDE] = read_imageui( fenc, sampler, (int2)( basex - 1, linear_id + basey) ).s0;
2202+
2203+ barrier( CLK_LOCAL_MEM_FENCE );
2204+
2205+ // Cooperatively build the top edge for the macroblock using smoothing function
2206+ int j = ly;
2207+ top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp( j - 1, -1, 15 )] +
2208+ 2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp( j, 0, 15 )] +
2209+ cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp( j + 1, 0, 15 )] + 2 ) >> 2;
2210+ j += 8;
2211+ top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp( j - 1, -1, 15 )] +
2212+ 2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp( j, 0, 15 )] +
2213+ cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp( j + 1, 0, 15 )] + 2 ) >> 2;
2214+ // Cooperatively build the left edge for the macroblock using smoothing function
2215+ left[lx*8 + ly] = ( cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*(ly - 1)] +
2216+ 2*cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*ly] +
2217+ cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*clamp((ly + 1), 0, 7 )] + 2 ) >> 2;
2218+ // One left_top per macroblock
2219+ if( 0 == ly )
2220+ {
2221+ left_top[lx] = ( cache[BLOCK_OFFSET + 8*lx - 1] + 2*cache[BLOCK_OFFSET + 8*lx - 1 - CACHE_STRIDE] +
2222+ cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE] + 2 ) >> 2;
2223+ cost_buf[lx] = COST_MAX;
2224+ }
2225+ barrier( CLK_LOCAL_MEM_FENCE );
2226+
2227+ // each warp/wavefront generates a different prediction type; no divergence
2228+ switch( ly )
2229+ {
2230+ case 0:
2231+ satd = x264_predict_8x8c_h( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
2232+ break;
2233+ case 1:
2234+ satd = x264_predict_8x8c_v( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
2235+ break;
2236+ case 2:
2237+ satd = x264_predict_8x8c_dc( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
2238+ break;
2239+ case 3:
2240+ satd = x264_predict_8x8c_p( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
2241+ break;
2242+ case 4:
2243+ satd = x264_predict_8x8_ddr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
2244+ break;
2245+ case 5:
2246+ satd = x264_predict_8x8_vr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
2247+ break;
2248+ case 6:
2249+ satd = x264_predict_8x8_hd( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
2250+ break;
2251+ case 7:
2252+ satd = x264_predict_8x8_hu( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &left[8*lx] );
2253+ break;
2254+ default:
2255+ break;
2256+ }
2257+ atom_min( &cost_buf[lx], satd );
2258+ if( slow )
2259+ {
2260+ // Do the remaining two (least likely) prediction modes
2261+ switch( ly )
2262+ {
2263+ case 0: // DDL
2264+ satd = x264_predict_8x8_ddl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
2265+ atom_min( &cost_buf[lx], satd );
2266+ break;
2267+ case 1: // VL
2268+ satd = x264_predict_8x8_vl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
2269+ atom_min( &cost_buf[lx], satd );
2270+ break;
2271+ default:
2272+ break;
2273+ }
2274+ }
2275+ barrier( CLK_LOCAL_MEM_FENCE );
2276+
2277+ if( (0 == ly) && (gx < mb_width) )
2278+ {
2279+ fenc_intra_cost[gidy * mb_width + gx] = cost_buf[lx]+ 5*lambda;
2280+ }
2281+
2282+ // initialize the frame_stats[2] buffer for kernel sum_intra_cost().
2283+ if( gx < 2 && gy == 0 )
2284+ frame_stats[gx] = 0;
2285+#undef CACHE_STRIDE
2286+#undef BLOCK_OFFSET
2287+}
2288+
2289+
2290+
2291+/*
2292+ * parallel sum intra costs
2293+ *
2294+ * global launch dimensions: [256, mb_height]
2295+ */
2296+kernel void sum_intra_cost(
2297+ const global uint16_t *fenc_intra_cost,
2298+ const global uint16_t *inv_qscale_factor,
2299+ global int *fenc_row_satds,
2300+ global int *frame_stats,
2301+ int mb_width )
2302+{
2303+ int y = get_global_id( 1 );
2304+ int mb_height = get_global_size( 1 );
2305+
2306+ int row_satds = 0;
2307+ int cost_est = 0;
2308+ int cost_est_aq = 0;
2309+
2310+ for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
2311+ {
2312+ int mb_xy = x + y * mb_width;
2313+ int cost = fenc_intra_cost[mb_xy];
2314+ int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
2315+ int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
2316+
2317+ row_satds += cost_aq;
2318+ if( b_frame_score_mb )
2319+ {
2320+ cost_est += cost;
2321+ cost_est_aq += cost_aq;
2322+ }
2323+ }
2324+
2325+ local int buffer[256];
2326+ int x = get_global_id( 0 );
2327+
2328+ row_satds = parallel_sum( row_satds, x, buffer );
2329+ cost_est = parallel_sum( cost_est, x, buffer );
2330+ cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
2331+
2332+ if( get_global_id( 0 ) == 0 )
2333+ {
2334+ fenc_row_satds[y] = row_satds;
2335+ atomic_add( frame_stats + COST_EST, cost_est );
2336+ atomic_add( frame_stats + COST_EST_AQ, cost_est_aq );
2337+ }
2338+}
2339diff -r bdffc2c1e85b -r e5e4a79d3f21 common/opencl/motionsearch.cl
2340--- /dev/null Thu Jan 01 00:00:00 1970 +0000
2341+++ b/common/opencl/motionsearch.cl Mon Aug 20 22:33:23 2012 -0500
2342@@ -0,0 +1,275 @@
2343+/* Hierarchical (iterative) OpenCL lowres motion search */
2344+
2345+inline int find_downscale_mb_xy( int x, int y, int mb_width, int mb_height )
2346+{
2347+ /* edge macroblocks might not have a direct descendant, use nearest */
2348+ x = (x == mb_width-1) ? (x - (mb_width&1)) >> 1 : x >> 1;
2349+ y = (y == mb_height-1) ? (y - (mb_height&1)) >> 1 : y >> 1;
2350+ return (mb_width>>1) * y + x;
2351+}
2352+
2353+/* Four threads calculate an 8x8 SAD. Each does two rows */
2354+int sad_8x8_ii_coop4( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos, int idx, local int16_t *costs )
2355+{
2356+ frefpos.y += idx << 1;
2357+ fencpos.y += idx << 1;
2358+ int cost = 0;
2359+ if( frefpos.x < 0 )
2360+ {
2361+ /* slow path when MV goes past left edge */
2362+ for( int y = 0; y < 2; y++ )
2363+ {
2364+ for( int x = 0; x < 8; x++ )
2365+ {
2366+ pixel enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
2367+ pixel ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
2368+ cost += abs_diff( enc, ref );
2369+ }
2370+ }
2371+ }
2372+ else
2373+ {
2374+ uint4 enc, ref, costs = 0;
2375+ enc = read_imageui( fenc, sampler, fencpos );
2376+ ref = read_imageui( fref, sampler, frefpos );
2377+ costs += abs_diff( enc, ref );
2378+ enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 0) );
2379+ ref = read_imageui( fref, sampler, frefpos + (int2)(4, 0) );
2380+ costs += abs_diff( enc, ref );
2381+ enc = read_imageui( fenc, sampler, fencpos + (int2)(0, 1) );
2382+ ref = read_imageui( fref, sampler, frefpos + (int2)(0, 1) );
2383+ costs += abs_diff( enc, ref );
2384+ enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 1) );
2385+ ref = read_imageui( fref, sampler, frefpos + (int2)(4, 1) );
2386+ costs += abs_diff( enc, ref );
2387+ cost = costs.s0 + costs.s1 + costs.s2 + costs.s3;
2388+ }
2389+ costs[idx] = cost;
2390+ return costs[0] + costs[1] + costs[2] + costs[3];
2391+}
2392+
2393+/* One thread performs 8x8 SAD */
2394+int sad_8x8_ii( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos )
2395+{
2396+ if( frefpos.x < 0 )
2397+ {
2398+ /* slow path when MV goes past left edge */
2399+ int cost = 0;
2400+ for( int y = 0; y < 8; y++ )
2401+ {
2402+ for( int x = 0; x < 8; x++ )
2403+ {
2404+ uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
2405+ uint ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
2406+ cost += abs_diff( enc, ref );
2407+ }
2408+ }
2409+ return cost;
2410+ }
2411+ else
2412+ {
2413+ uint4 enc, ref, cost = 0;
2414+ for( int y = 0; y < 8; y++ )
2415+ {
2416+ for( int x = 0; x < 8; x += 4 )
2417+ {
2418+ enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) );
2419+ ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) );
2420+ cost += abs_diff( enc, ref );
2421+ }
2422+ }
2423+ return cost.s0 + cost.s1 + cost.s2 + cost.s3;
2424+ }
2425+}
2426+
2427+
2428+/*
2429+ * hierarchical motion estimation
2430+ *
2431+ * Each kernel launch is a single iteration
2432+ *
2433+ * MB per work group is determined by lclx / 4 * lcly
2434+ *
2435+ * global launch dimensions: [mb_width * 4, mb_height]
2436+ */
2437+kernel void hierarchical_motion(
2438+ read_only image2d_t fenc,
2439+ read_only image2d_t fref,
2440+ const global short2 *in_mvs,
2441+ global short2 *out_mvs,
2442+ global int16_t *out_mv_costs,
2443+ global short2 *mvp_buffer,
2444+ local int16_t *cost_local,
2445+ local short2 *mvc_local,
2446+ int mb_width,
2447+ int lambda,
2448+ int me_range,
2449+ int scale,
2450+ int b_shift_index,
2451+ int b_first_iteration,
2452+ int b_reverse_references )
2453+{
2454+ int mb_x = get_global_id( 0 ) >> 2;
2455+ if( mb_x >= mb_width )
2456+ return;
2457+ int mb_height = get_global_size( 1 );
2458+ int mb_i = get_global_id( 0 ) & 3;
2459+ int mb_y = get_global_id( 1 );
2460+ int mb_xy = mb_y * mb_width + mb_x;
2461+ const int mb_size = 8;
2462+ int2 coord;
2463+ coord.x = mb_x * mb_size;
2464+ coord.y = mb_y * mb_size;
2465+
2466+ const int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
2467+ cost_local += 4 * mb_in_group;
2468+
2469+ int i_mvc = 0;
2470+ mvc_local += 4 * mb_in_group;
2471+ mvc_local[mb_i] = 0;
2472+ short2 mvp;
2473+
2474+ if( b_first_iteration )
2475+ {
2476+ mvp.x = 0;
2477+ mvp.y = 0;
2478+ }
2479+ else
2480+ {
2481+#define MVC( DX, DY )\
2482+ {\
2483+ int px = mb_x + DX;\
2484+ int py = mb_y + DY;\
2485+ if( b_shift_index )\
2486+ mvc_local[i_mvc] = in_mvs[find_downscale_mb_xy( px, py, mb_width, mb_height )];\
2487+ else\
2488+ mvc_local[i_mvc] = in_mvs[mb_width * py + px];\
2489+ mvc_local[i_mvc].x >>= scale;\
2490+ mvc_local[i_mvc].y >>= scale;\
2491+ i_mvc++;\
2492+ }
2493+ /* Find MVP from median of MVCs */
2494+ if( b_reverse_references )
2495+ {
2496+ /* odd iterations: derive MVP from down and right */
2497+ if( mb_x < mb_width - 1 )
2498+ MVC( 1, 0 );
2499+ if( mb_y < mb_height - 1 )
2500+ {
2501+ MVC( 0, 1 );
2502+ if( mb_x > b_shift_index )
2503+ MVC( -1, 1 );
2504+ if( mb_x < mb_width - 1 )
2505+ MVC( 1, 1 );
2506+ }
2507+ }
2508+ else
2509+ {
2510+ /* even iterations: derive MVP from up and left */
2511+ if( mb_x > 0 )
2512+ MVC( -1, 0 );
2513+ if( mb_y > 0 )
2514+ {
2515+ MVC( 0, -1 );
2516+ if( mb_x < mb_width - 1 )
2517+ MVC( 1, -1 );
2518+ if( mb_x > b_shift_index )
2519+ MVC( -1, -1 );
2520+ }
2521+ }
2522+ if( i_mvc <= 1 )
2523+ {
2524+ mvp = mvc_local[0];
2525+ }
2526+ else
2527+ mvp = x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
2528+#undef MVC
2529+ }
2530+ //new mvp == old mvp, copy the input mv to the output mv and exit.
2531+ if( (!b_shift_index) && (mvp.x == mvp_buffer[mb_xy].x) && (mvp.y == mvp_buffer[mb_xy].y) )
2532+ {
2533+ out_mvs[mb_xy] = in_mvs[mb_xy];
2534+ return;
2535+ }
2536+ mvp_buffer[mb_xy] = mvp;
2537+ short2 mv_min;
2538+ short2 mv_max;
2539+ mv_min.x = -mb_size * mb_x - 4;
2540+ mv_max.x = mb_size * (mb_width - mb_x - 1) + 4;
2541+ mv_min.y = -mb_size * mb_y - 4;
2542+ mv_max.y = mb_size * (mb_height - mb_y - 1) + 4;
2543+
2544+ short2 bestmv;
2545+ bestmv.x = x264_clip3( mvp.x, mv_min.x, mv_max.x );
2546+ bestmv.y = x264_clip3( mvp.y, mv_min.y, mv_max.y );
2547+
2548+ int2 refcrd;
2549+ refcrd.x = coord.x + bestmv.x;
2550+ refcrd.y = coord.y + bestmv.y;
2551+ /* measure cost at bestmv */
2552+ int bcost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +
2553+ lambda * calc_mv_cost( abs_diff( bestmv.x, mvp.x ) << (2 + scale), abs_diff( bestmv.y, mvp.y ) << (2 + scale) );
2554+
2555+ do
2556+ {
2557+ /* measure costs at offsets from bestmv */
2558+ refcrd.x = coord.x + bestmv.x + dia_offs[mb_i].x;
2559+ refcrd.y = coord.y + bestmv.y + dia_offs[mb_i].y;
2560+ short2 trymv = bestmv + dia_offs[mb_i];
2561+ int cost = sad_8x8_ii( fenc, coord, fref, refcrd ) +
2562+ lambda * calc_mv_cost( abs_diff( trymv.x, mvp.x ) << (2 + scale), abs_diff( trymv.y, mvp.y ) << (2 + scale) );
2563+
2564+ cost_local[mb_i] = (cost<<2) | mb_i;
2565+ cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );
2566+
2567+ if( (cost >> 2) >= bcost )
2568+ break;
2569+
2570+ bestmv += dia_offs[cost&3];
2571+ bcost = cost>>2;
2572+
2573+ if( bestmv.x >= mv_max.x || bestmv.x <= mv_min.x || bestmv.y >= mv_max.y || bestmv.y <= mv_min.y )
2574+ break;
2575+ }
2576+ while( --me_range > 0 );
2577+
2578+ short2 trymv;
2579+
2580+#define COST_MV_NO_PAD( X, Y, L )\
2581+ trymv.x = x264_clip3( X, mv_min.x, mv_max.x );\
2582+ trymv.y = x264_clip3( Y, mv_min.y, mv_max.y );\
2583+ if( abs_diff( mvp.x, trymv.x ) > 1 || abs_diff( mvp.y, trymv.y ) > 1 ) {\
2584+ int2 refcrd = coord; refcrd.x += trymv.x; refcrd.y += trymv.y;\
2585+ int cost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +\
2586+ L * calc_mv_cost( abs_diff( trymv.x, mvp.x ) << (2 + scale), abs_diff( trymv.y, mvp.y ) << (2 + scale) );\
2587+ if( cost < bcost ) { bcost = cost; bestmv = trymv; } }
2588+
2589+ COST_MV_NO_PAD( 0, 0, 0 );
2590+
2591+ if( !b_first_iteration )
2592+ {
2593+ /* try cost at previous iteration's MV, if MVP was too far away */
2594+ short2 prevmv;
2595+ if( b_shift_index )
2596+ prevmv = in_mvs[find_downscale_mb_xy( mb_x, mb_y, mb_width, mb_height )];
2597+ else
2598+ prevmv = in_mvs[mb_xy];
2599+ prevmv.x >>= scale;
2600+ prevmv.y >>= scale;
2601+ COST_MV_NO_PAD( prevmv.x, prevmv.y, lambda );
2602+ }
2603+
2604+ for( int i = 0; i < i_mvc; i++ )
2605+ {
2606+ /* try cost at each candidate MV, if MVP was too far away */
2607+ COST_MV_NO_PAD( mvc_local[i].x, mvc_local[i].y, lambda );
2608+ }
2609+
2610+ if( mb_i == 0 )
2611+ {
2612+ bestmv.x <<= scale;
2613+ bestmv.y <<= scale;
2614+ out_mvs[mb_xy] = bestmv;
2615+ out_mv_costs[mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK );
2616+ }
2617+}
2618diff -r bdffc2c1e85b -r e5e4a79d3f21 common/opencl/subpel.cl
2619--- /dev/null Thu Jan 01 00:00:00 1970 +0000
2620+++ b/common/opencl/subpel.cl Mon Aug 20 22:33:23 2012 -0500
2621@@ -0,0 +1,288 @@
2622+/* OpenCL lowres subpel Refine */
2623+
2624+/* One thread performs 8x8 SAD */
2625+int sad_8x8_ii_hpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
2626+{
2627+ int2 frefpos = (int2)(qpos.x >> 2, qpos.y >> 2);
2628+ int hpel_idx = ((qpos.x & 2) >> 1) + (qpos.y & 2);
2629+ uint mask_shift = 8 * hpel_idx;
2630+
2631+ uint4 cost4 = 0;
2632+
2633+ for( int y = 0; y < 8; y++ )
2634+ {
2635+ uint4 enc, val4;
2636+ enc = read_imageui( fenc, sampler, fencpos + (int2)(0, y));
2637+ val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(0, y)).s0 >> mask_shift) & 0xFF;
2638+ val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(1, y)).s0 >> mask_shift) & 0xFF;
2639+ val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(2, y)).s0 >> mask_shift) & 0xFF;
2640+ val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(3, y)).s0 >> mask_shift) & 0xFF;
2641+ cost4 += abs_diff( enc, val4 );
2642+
2643+ enc = read_imageui( fenc, sampler, fencpos + (int2)(4, y));
2644+ val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(4, y)).s0 >> mask_shift) & 0xFF;
2645+ val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(5, y)).s0 >> mask_shift) & 0xFF;
2646+ val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(6, y)).s0 >> mask_shift) & 0xFF;
2647+ val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(7, y)).s0 >> mask_shift) & 0xFF;
2648+ cost4 += abs_diff( enc, val4 );
2649+ }
2650+
2651+ return cost4.s0 + cost4.s1 + cost4.s2 + cost4.s3;
2652+}
2653+
2654+/* One thread measures 8x8 SAD cost at a QPEL offset into an HPEL plane */
2655+int sad_8x8_ii_qpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
2656+{
2657+ int2 frefApos = qpos >> 2;
2658+ int hpelA = ((qpos.x & 2) >> 1) + (qpos.y & 2);
2659+
2660+ int2 qposB = qpos + ((qpos & 1) << 1);
2661+ int2 frefBpos = qposB >> 2;
2662+ int hpelB = ((qposB.x & 2) >> 1) + (qposB.y & 2);
2663+
2664+ uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB;
2665+
2666+ int cost = 0;
2667+
2668+ for( int y = 0; y < 8; y++ )
2669+ {
2670+ for( int x = 0; x < 8; x++ )
2671+ {
2672+ uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y)).s0;
2673+ uint vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(x, y)).s0 >> mask_shift0) & 0xFF;
2674+ uint vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(x, y)).s0 >> mask_shift1) & 0xFF;
2675+ uint ref = rhadd( vA, vB );
2676+ cost += abs_diff( enc, ref );
2677+ }
2678+ }
2679+
2680+ return cost;
2681+}
2682+
2683+/* Four threads measure 8x8 SATD cost at a QPEL offset into an HPEL plane
2684+ *
2685+ * Each thread collects 1/4 of the rows of diffs and processes one quarter of
2686+ * the transforms
2687+ */
2688+int satd_8x8_ii_qpel_coop4(
2689+ read_only image2d_t fenc, int2 fencpos,
2690+ read_only image2d_t fref_planes, int2 qpos,
2691+ local sum2_t *tmpp,
2692+ int idx )
2693+{
2694+ volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
2695+ sum2_t b0, b1, b2, b3;
2696+
2697+ // fencpos is full-pel position of original MB
2698+ // qpos is qpel position within reference frame
2699+ int2 frefApos = (int2)(qpos.x>>2, qpos.y>>2);
2700+ int hpelA = ((qpos.x&2)>>1) + (qpos.y&2);
2701+
2702+ int2 qposB = (int2)qpos + (int2)(((qpos.x&1)<<1), ((qpos.y&1)<<1));
2703+ int2 frefBpos = (int2)(qposB.x>>2, qposB.y>>2);
2704+ int hpelB = ((qposB.x&2)>>1) + (qposB.y&2);
2705+
2706+ uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB;
2707+
2708+ uint vA, vB;
2709+ uint a0, a1;
2710+ uint enc;
2711+ sum2_t sum = 0;
2712+
2713+
2714+#define READ_DIFF( OUT, X )\
2715+ enc = read_imageui( fenc, sampler, fencpos + (int2)(X, idx) ).s0;\
2716+ vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(X, idx) ).s0 >> mask_shift0) & 0xFF;\
2717+ vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(X, idx) ).s0 >> mask_shift1) & 0xFF;\
2718+ OUT = enc - rhadd( vA, vB );
2719+
2720+#define READ_DIFF_EX( OUT, a, b )\
2721+ {\
2722+ READ_DIFF( a0, a );\
2723+ READ_DIFF( a1, b );\
2724+ OUT = a0 + (a1<<BITS_PER_SUM);\
2725+ }
2726+#define ROW_8x4_SATD( a, b )\
2727+ {\
2728+ fencpos.y += a;\
2729+ frefApos.y += b;\
2730+ frefBpos.y += b;\
2731+ READ_DIFF_EX( b0, 0, 4 );\
2732+ READ_DIFF_EX( b1, 1, 5 );\
2733+ READ_DIFF_EX( b2, 2, 6 );\
2734+ READ_DIFF_EX( b3, 3, 7 );\
2735+ HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
2736+ HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
2737+ sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );\
2738+ }
2739+ ROW_8x4_SATD( 0, 0 );
2740+ ROW_8x4_SATD( 4, 4 );
2741+
2742+#undef READ_DIFF
2743+#undef READ_DIFF_EX
2744+#undef ROW_8x4_SATD
2745+ return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
2746+}
2747+
2748+constant short2 hpoffs[4] =
2749+{
2750+ {0, -2}, {-2, 0}, {2, 0}, {0, 2}
2751+};
2752+
2753+/*
2754+ * sub pixel refinement of motion vectors, output MVs and costs are moved from
2755+ * temporary buffers into final per-frame buffer
2756+ *
2757+ * global launch dimensions: [mb_width * 4, mb_height]
2758+ *
2759+ * With X being the source 16x16 pixels, F is the lowres pixel used by the
2760+ * motion search. We will now utilize the H V and C pixels (stored in separate
2761+ * planes) to search at half-pel increments.
2762+ *
2763+ * X X X X X X
2764+ * F H F H F
2765+ * X X X X X X
2766+ * V C V C V
2767+ * X X X X X X
2768+ * F H F H F
2769+ * X X X X X X
2770+ *
2771+ * The YX HPEL bits of the motion vector selects the plane we search in. The
2772+ * four planes are packed in the fref_planes 2D image buffer. Each sample
2773+ * returns: s0 = F, s1 = H, s2 = V, s3 = C
2774+ */
2775+kernel void subpel_refine(
2776+ read_only image2d_t fenc,
2777+ read_only image2d_t fref_planes,
2778+ const global short2 *in_mvs,
2779+ const global int16_t *in_sad_mv_costs,
2780+ local int16_t *cost_local,
2781+ local sum2_t *satd_local,
2782+ local short2 *mvc_local,
2783+ global short2 *fenc_lowres_mv,
2784+ global int16_t *fenc_lowres_mv_costs,
2785+ int mb_width,
2786+ int lambda,
2787+ int b,
2788+ int ref,
2789+ int b_islist1 )
2790+{
2791+ int mb_x = get_global_id( 0 ) >> 2;
2792+ if( mb_x >= mb_width )
2793+ return;
2794+ int mb_height = get_global_size( 1 );
2795+
2796+ int mb_i = get_global_id( 0 ) & 3;
2797+ int mb_y = get_global_id( 1 );
2798+ int mb_xy = mb_y * mb_width + mb_x;
2799+
2800+ /* fenc_lowres_mv and fenc_lowres_mv_costs are large buffers that
2801+ * hold many frames worth of motion vectors. We must offset into the correct
2802+ * location for this frame's vectors. The kernel will be passed the correct
2803+ * directional buffer for the direction of the search: list1 or list0
2804+ *
2805+ * CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1]
2806+ * GPU equivalent: fenc_lowres_mvs[(b - p0 - 1) * mb_count]
2807+ */
2808+ fenc_lowres_mv += ( b_islist1 ? ( ref - b - 1 ) : ( b - ref - 1 ) ) * mb_width * mb_height;
2809+ fenc_lowres_mv_costs += ( b_islist1 ? ( ref - b - 1 ) : ( b - ref - 1 ) ) * mb_width * mb_height;
2810+
2811+ int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
2812+ cost_local += mb_in_group * 4;
2813+ satd_local += mb_in_group * 16;
2814+ mvc_local += mb_in_group * 4;
2815+
2816+ int i_mvc = 0;
2817+ mvc_local[0] = 0;
2818+ mvc_local[1] = 0;
2819+ mvc_local[2] = 0;
2820+ mvc_local[3] = 0;
2821+ short2 mvp;
2822+#define MVC( DX, DY ) mvc_local[i_mvc++] = in_mvs[mb_width * (mb_y + DY) + (mb_x + DX)];
2823+ if( mb_x > 0 )
2824+ MVC( -1, 0 );
2825+ if( mb_y > 0 )
2826+ {
2827+ MVC( 0, -1 );
2828+ if( mb_x < mb_width - 1 )
2829+ MVC( 1, -1 );
2830+ if( mb_x > 0 )
2831+ MVC( -1, -1 );
2832+ }
2833+ if( i_mvc <= 1 )
2834+ mvp = mvc_local[0];
2835+ else
2836+ mvp = x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
2837+#undef MVC
2838+
2839+ int bcost = in_sad_mv_costs[mb_xy];
2840+ short2 bmv = in_mvs[mb_xy];
2841+
2842+ /* Make mvp a QPEL MV */
2843+ mvp.x <<= 2;
2844+ mvp.y <<= 2;
2845+
2846+ /* Make bmv a QPEL MV */
2847+ bmv.x <<= 2;
2848+ bmv.y <<= 2;
2849+
2850+ int2 coord;
2851+ coord.x = mb_x << 3;
2852+ coord.y = mb_y << 3;
2853+ short2 trymv;
2854+ int2 qpos;
2855+ int cost = 0;
2856+
2857+#define HPEL_QPEL( ARR, FUNC )\
2858+ {\
2859+ trymv = bmv + ARR[mb_i];\
2860+ qpos.x = (coord.x << 2) + trymv.x;\
2861+ qpos.y = (coord.y << 2) + trymv.y;\
2862+ cost = FUNC( fenc, coord, fref_planes, qpos ) + lambda * calc_mv_cost( abs_diff( trymv.x, mvp.x ), abs_diff( trymv.y, mvp.y ) );\
2863+ cost_local[mb_i] = (cost<<2) + mb_i;\
2864+ cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );\
2865+ if( (cost>>2) < bcost )\
2866+ {\
2867+ bmv += ARR[cost&3];\
2868+ bcost = cost>>2;\
2869+ }\
2870+ }
2871+
2872+ HPEL_QPEL( hpoffs, sad_8x8_ii_hpel );
2873+
2874+#define CHEAP_QPEL 1
2875+
2876+#if CHEAP_QPEL
2877+ HPEL_QPEL( dia_offs, sad_8x8_ii_qpel );
2878+#endif
2879+
2880+ /* remeasure with SATD */
2881+ qpos.x = (coord.x << 2) + bmv.x;
2882+ qpos.y = (coord.y << 2) + bmv.y;
2883+ cost_local[mb_i] = satd_8x8_ii_qpel_coop4( fenc, coord, fref_planes, qpos, satd_local, mb_i );
2884+ bcost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];
2885+ bcost += lambda * calc_mv_cost( abs_diff( bmv.x, mvp.x ), abs_diff( bmv.y, mvp.y ) );
2886+ short2 bestmv = 0;
2887+
2888+#if !CHEAP_QPEL
2889+#define QPEL_SATD( ARR, FUNC )\
2890+ {\
2891+ trymv = bmv + ARR;\
2892+ qpos.x = (coord.x << 2) + trymv.x;\
2893+ qpos.y = (coord.y << 2) + trymv.y;\
2894+ cost_local[mb_i] = FUNC( fenc, coord, fref_planes, qpos, satd_local, mb_i );\
2895+ cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\
2896+ cost += lambda * calc_mv_cost( abs_diff( trymv.x, mvp.x ), abs_diff( trymv.y, mvp.y ) );\
2897+ if( cost < bcost )\
2898+ {\
2899+ bestmv = ARR;\
2900+ bcost = cost;\
2901+ }\
2902+ }
2903+ for( int i = 0; i<4; i++ )
2904+ QPEL_SATD( dia_offs[i], satd_8x8_ii_qpel_coop4 );
2905+#endif
2906+
2907+ fenc_lowres_mv[mb_xy] = bmv+bestmv;
2908+ fenc_lowres_mv_costs[mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK );
2909+}
2910diff -r bdffc2c1e85b -r e5e4a79d3f21 common/opencl/weightp.cl
2911--- /dev/null Thu Jan 01 00:00:00 1970 +0000
2912+++ b/common/opencl/weightp.cl Mon Aug 20 22:33:23 2012 -0500
2913@@ -0,0 +1,48 @@
2914+/* Weightp filter a downscaled image into a temporary output buffer.
2915+ * This kernel is launched once for each scale.
2916+ *
2917+ * Launch dimensions: width x height (in pixels)
2918+ */
2919+kernel void weightp_scaled_images( read_only image2d_t in_plane,
2920+ write_only image2d_t out_plane,
2921+ uint offset,
2922+ uint scale,
2923+ uint denom )
2924+{
2925+ int gx = get_global_id( 0 );
2926+ int gy = get_global_id( 1 );
2927+ uint4 input_val;
2928+ uint4 output_val;
2929+
2930+ input_val = read_imageui( in_plane, sampler, (int2)(gx, gy));
2931+ output_val = (uint4)(offset) + ( ( ((uint4)(scale)) * input_val ) >> ((uint4)(denom)) );
2932+ write_imageui( out_plane, (int2)(gx, gy), output_val );
2933+}
2934+
2935+/* Weightp filter for the half-pel interpolated image
2936+ *
2937+ * Launch dimensions: width x height (in pixels)
2938+ */
2939+kernel void weightp_hpel( read_only image2d_t in_plane,
2940+ write_only image2d_t out_plane,
2941+ uint offset,
2942+ uint scale,
2943+ uint denom )
2944+{
2945+ int gx = get_global_id( 0 );
2946+ int gy = get_global_id( 1 );
2947+ uint input_val;
2948+ uint output_val;
2949+
2950+ input_val = read_imageui( in_plane, sampler, (int2)(gx, gy)).s0;
2951+ //Unpack
2952+ uint4 temp;
2953+ temp.s0 = input_val & 0x00ff; temp.s1 = (input_val >> 8) & 0x00ff;
2954+ temp.s2 = (input_val >> 16) & 0x00ff; temp.s3 = (input_val >> 24) & 0x00ff;
2955+
2956+ temp = (uint4)(offset) + ( ( ((uint4)(scale)) * temp ) >> ((uint4)(denom)) );
2957+
2958+ //Pack
2959+ output_val = temp.s0 | (temp.s1 << 8) | (temp.s2 << 16) | (temp.s3 << 24);
2960+ write_imageui( out_plane, (int2)(gx, gy), output_val );
2961+}
2962diff -r bdffc2c1e85b -r e5e4a79d3f21 common/opencl/x264-cl.h
2963--- /dev/null Thu Jan 01 00:00:00 1970 +0000
2964+++ b/common/opencl/x264-cl.h Mon Aug 20 22:33:23 2012 -0500
2965@@ -0,0 +1,161 @@
2966+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
2967+
2968+constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
2969+
2970+/* 7.18.1.1 Exact-width integer types */
2971+typedef signed char int8_t;
2972+typedef unsigned char uint8_t;
2973+typedef short int16_t;
2974+typedef unsigned short uint16_t;
2975+typedef int int32_t;
2976+typedef unsigned uint32_t;
2977+typedef long int64_t;
2978+typedef unsigned long uint64_t;
2979+
2980+typedef union
2981+{
2982+ uint32_t i;
2983+ uint16_t b[2];
2984+ uint8_t c[4];
2985+} x264_union32_t;
2986+
2987+typedef uint8_t pixel;
2988+typedef uint16_t sum_t;
2989+typedef uint32_t sum2_t;
2990+
2991+#define LOWRES_COST_MASK ((1<<14)-1)
2992+#define LOWRES_COST_SHIFT 14
2993+#define COST_MAX (1<<28)
2994+
2995+#define X264_MIN( a, b ) ((a)<(b) ? (a) : (b))
2996+
2997+#define PIXEL_MAX 255
2998+#define BITS_PER_SUM (8 * sizeof(sum_t))
2999+
3000+/* Constants for offsets into frame statistics buffer */
3001+#define COST_EST 0
3002+#define COST_EST_AQ 1
3003+#define INTRA_MBS 2
3004+
3005+#define COPY2_IF_LT( x, y, a, b )\
3006+ if((y)<(x))\
3007+ {\
3008+ (x) = (y);\
3009+ (a) = (b);\
3010+ }
3011+constant short2 dia_offs[4] =
3012+{
3013+ {0, -1}, {-1, 0}, {1, 0}, {0, 1},
3014+};
3015+
3016+inline pixel x264_clip_pixel( int x )
3017+{
3018+ return ((x & ~PIXEL_MAX) ? (-x) >> 31 & PIXEL_MAX : x);
3019+}
3020+
3021+inline int x264_clip3( int v, int i_min, int i_max )
3022+{
3023+ return ((v < i_min) ? i_min : (v > i_max) ? i_max : v);
3024+}
3025+
3026+inline int x264_median( int a, int b, int c )
3027+{
3028+ int t = (a - b) & ((a - b) >> 31);
3029+ a -= t;
3030+ b += t;
3031+ b -= (b - c) & ((b - c) >> 31);
3032+ b += (a - b) & ((a - b) >> 31);
3033+ return b;
3034+}
3035+
3036+inline short2 x264_median_mv( short2 a, short2 b, short2 c )
3037+{
3038+ short2 dst;
3039+ dst.x = x264_median( a.x, b.x, c.x );
3040+ dst.y = x264_median( a.y, b.y, c.y );
3041+ return dst;
3042+}
3043+
3044+inline sum2_t abs2( sum2_t a )
3045+{
3046+ sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);
3047+ return (a + s) ^ s;
3048+}
3049+
3050+#define HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
3051+ sum2_t t0 = s0 + s1;\
3052+ sum2_t t1 = s0 - s1;\
3053+ sum2_t t2 = s2 + s3;\
3054+ sum2_t t3 = s2 - s3;\
3055+ d0 = t0 + t2;\
3056+ d2 = t0 - t2;\
3057+ d1 = t1 + t3;\
3058+ d3 = t1 - t3;\
3059+}
3060+
3061+#define HADAMARD4V( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
3062+ int2 t0 = s0 + s1;\
3063+ int2 t1 = s0 - s1;\
3064+ int2 t2 = s2 + s3;\
3065+ int2 t3 = s2 - s3;\
3066+ d0 = t0 + t2;\
3067+ d2 = t0 - t2;\
3068+ d1 = t1 + t3;\
3069+ d3 = t1 - t3;\
3070+}
3071+
3072+#define SATD_C_8x4_Q( name, q1, q2 )\
3073+ int name( q1 pixel *pix1, int i_pix1, q2 pixel *pix2, int i_pix2 )\
3074+ {\
3075+ sum2_t tmp[4][4];\
3076+ sum2_t a0, a1, a2, a3;\
3077+ sum2_t sum = 0;\
3078+ for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )\
3079+ {\
3080+ a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);\
3081+ a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);\
3082+ a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);\
3083+ a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);\
3084+ HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3 );\
3085+ }\
3086+ for( int i = 0; i < 4; i++ )\
3087+ {\
3088+ HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );\
3089+ sum += abs2( a0 ) + abs2( a1 ) + abs2( a2 ) + abs2( a3 );\
3090+ }\
3091+ return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;\
3092+ }
3093+
3094+/*
3095+ * Utility function to perform a parallel sum reduction of an array of integers
3096+ */
3097+int parallel_sum( int value, int x, volatile local int *array )
3098+{
3099+ array[x] = value;
3100+ barrier( CLK_LOCAL_MEM_FENCE );
3101+
3102+ int dim = get_local_size( 0 );
3103+
3104+ while( dim > 1 )
3105+ {
3106+ dim >>= 1;
3107+
3108+ if( x < dim )
3109+ {
3110+ array[x] += array[x + dim];
3111+ }
3112+
3113+ if( dim > 32 )
3114+ {
3115+ barrier( CLK_LOCAL_MEM_FENCE );
3116+ }
3117+ }
3118+
3119+ return array[0];
3120+}
3121+
3122+int calc_mv_cost( int dx, int dy )
3123+{
3124+ return round( (log2( (float)(dx + 1) ) * 2.0f + 0.718f + !!dx) ) +
3125+ round( (log2( (float)(dy + 1) ) * 2.0f + 0.718f + !!dy) );
3126+}
3127diff -r bdffc2c1e85b -r e5e4a79d3f21 configure
3128--- a/configure Wed Jul 18 08:33:41 2012 -0700
3129+++ b/configure Mon Aug 20 22:33:23 2012 -0500
3130@@ -25,6 +25,7 @@
3131 --system-libx264 use system libx264 instead of internal
3132 --enable-shared build shared library
3133 --enable-static build static library
3134+ --disable-opencl disable OpenCL features
3135 --disable-gpl disable GPL-only features
3136 --disable-thread disable multithreaded encoding
3137 --enable-win32thread use win32threads (windows only)
3138@@ -273,6 +274,7 @@
3139 bit_depth="8"
3140 chroma_format="all"
3141 compiler="GNU"
3142+opencl="yes"
3143
3144 CFLAGS="$CFLAGS -Wall -I. -I\$(SRCPATH)"
3145 LDFLAGS="$LDFLAGS"
3146@@ -381,6 +383,9 @@
3147 --host=*)
3148 host="$optarg"
3149 ;;
3150+ --disable-opencl)
3151+ opencl="no"
3152+ ;;
3153 --cross-prefix=*)
3154 cross_prefix="$optarg"
3155 ;;
3156@@ -1082,6 +1087,37 @@
3157 PROF_USE_LD=$PROF_USE_LD
3158 EOF
3159
3160+if [ "$opencl" = "yes" ]; then
3161+ log_check "looking for xxd"
3162+ if ! $(xxd -v 2>/dev/null); then
3163+ echo 'OpenCL support requires xxd to compile.'
3164+ echo 'use --disable-opencl to compile without OpenCL'
3165+ exit 1
3166+ elif [ "$CUDA_PATH" != "" ]; then
3167+ echo 'HAVE_OPENCL=yes' >> config.mak
3168+ echo 'OPENCL_LIB=OpenCL' >> config.mak
3169+ echo 'OPENCL_INC_DIR=$(CUDA_PATH)include' >> config.mak
3170+ if [ "$ARCH" = "X86" ]; then
3171+ echo 'OPENCL_LIB_DIR=$(CUDA_PATH)lib/Win32' >> config.mak
3172+ else
3173+ echo 'OPENCL_LIB_DIR=$(CUDA_PATH)lib/x64' >> config.mak
3174+ fi
3175+ define HAVE_OPENCL
3176+ elif [ "$AMDAPPSDKROOT" != "" ]; then
3177+ echo 'HAVE_OPENCL=yes' >> config.mak
3178+ echo 'OPENCL_LIB=OpenCL' >> config.mak
3179+ echo 'OPENCL_INC_DIR=$(AMDAPPSDKROOT)/include' >> config.mak
3180+ if [ "$ARCH" = "X86" ]; then
3181+ echo 'OPENCL_LIB_DIR=$(AMDAPPSDKROOT)/lib/x86' >> config.mak
3182+ else
3183+ echo 'OPENCL_LIB_DIR=$(AMDAPPSDKROOT)/lib/x86_64' >> config.mak
3184+ fi
3185+ define HAVE_OPENCL
3186+ else
3187+ opencl="no"
3188+ fi
3189+fi
3190+
3191 if [ $compiler = ICL ]; then
3192 echo '%.o: %.c' >> config.mak
3193 echo ' $(CC) $(CFLAGS) -c -Fo$@ $<' >> config.mak
3194@@ -1192,6 +1228,7 @@
3195 visualize: $vis
3196 bit depth: $bit_depth
3197 chroma format: $chroma_format
3198+opencl: $opencl
3199 EOF
3200
3201 echo >> config.log
3202diff -r bdffc2c1e85b -r e5e4a79d3f21 encoder/encoder.c
3203--- a/encoder/encoder.c Wed Jul 18 08:33:41 2012 -0700
3204+++ b/encoder/encoder.c Mon Aug 20 22:33:23 2012 -0500
3205@@ -37,6 +37,10 @@
3206 #include "common/visualize.h"
3207 #endif
3208
3209+#if HAVE_OPENCL
3210+#include "common/opencl.h"
3211+#endif
3212+
3213 //#define DEBUG_MB_TYPE
3214
3215 #define bs_write_ue bs_write_ue_big
3216@@ -528,6 +532,20 @@
3217 if( h->i_thread_frames > 1 )
3218 h->param.nalu_process = NULL;
3219
3220+#if !HAVE_OPENCL
3221+ if( h->param.b_opencl )
3222+ {
3223+ x264_log( h, X264_LOG_WARNING, "not compiled with OpenCL support\n" );
3224+ h->param.b_opencl = 0;
3225+ }
3226+#elif BIT_DEPTH > 8
3227+ if( h->param.b_opencl )
3228+ {
3229+ x264_log( h, X264_LOG_WARNING, "OpenCL lookahead does not support high bit depth, disabling opencl\n" );
3230+ h->param.b_opencl = 0;
3231+ }
3232+#endif
3233+
3234 h->param.i_keyint_max = x264_clip3( h->param.i_keyint_max, 1, X264_KEYINT_MAX_INFINITE );
3235 if( h->param.i_keyint_max == 1 )
3236 {
3237@@ -1338,6 +1356,11 @@
3238 goto fail;
3239 }
3240
3241+#if HAVE_OPENCL
3242+ if( h->param.b_opencl && x264_opencl_init( h ) < 0 )
3243+ h->param.b_opencl = 0;
3244+#endif
3245+
3246 if( x264_lookahead_init( h, i_slicetype_length ) )
3247 goto fail;
3248
3249@@ -3483,6 +3506,10 @@
3250 || h->stat.i_mb_count[SLICE_TYPE_P][I_PCM]
3251 || h->stat.i_mb_count[SLICE_TYPE_B][I_PCM];
3252
3253+#if HAVE_OPENCL
3254+ x264_opencl_free( h );
3255+#endif
3256+
3257 x264_lookahead_delete( h );
3258
3259 if( h->param.b_sliced_threads )
3260diff -r bdffc2c1e85b -r e5e4a79d3f21 encoder/slicetype-cl.c
3261--- /dev/null Thu Jan 01 00:00:00 1970 +0000
3262+++ b/encoder/slicetype-cl.c Mon Aug 20 22:33:23 2012 -0500
3263@@ -0,0 +1,681 @@
3264+/*****************************************************************************
3265+ * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead)
3266+ *****************************************************************************
3267+ * Copyright (C) 2012 x264 project
3268+ *
3269+ * Authors: Steve Borho <sborho@multicorewareinc.com>
3270+ *
3271+ * This program is free software; you can redistribute it and/or modify
3272+ * it under the terms of the GNU General Public License as published by
3273+ * the Free Software Foundation; either version 2 of the License, or
3274+ * (at your option) any later version.
3275+ *
3276+ * This program is distributed in the hope that it will be useful,
3277+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
3278+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3279+ * GNU General Public License for more details.
3280+ *
3281+ * You should have received a copy of the GNU General Public License
3282+ * along with this program; if not, write to the Free Software
3283+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
3284+ *
3285+ * This program is also available under a commercial proprietary license.
3286+ * For more information, contact us at licensing@x264.com.
3287+ *****************************************************************************/
3288+
3289+#include "common/common.h"
3290+#include "macroblock.h"
3291+#include "me.h"
3292+
3293+#if HAVE_OPENCL
3294+
3295+void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );
3296+
3297+#define OCLCHECK( method, ... )\
3298+ status = method( __VA_ARGS__ ); if( status != CL_SUCCESS ) { h->param.b_opencl = 0;\
3299+ x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status ); return status; }
3300+
3301+void x264_opencl_flush( x264_t *h )
3302+{
3303+ clFinish( h->opencl.queue );
3304+
3305+ /* Finish copies from the GPU by copying from the page-locked buffer to
3306+ * their final destination
3307+ */
3308+ for( int i = 0; i < h->opencl.num_copies; i++ )
3309+ memcpy( h->opencl.copies[i].dest, h->opencl.copies[i].src, h->opencl.copies[i].bytes );
3310+ h->opencl.num_copies = 0;
3311+ h->opencl.pl_occupancy = 0;
3312+}
3313+
3314+static void *x264_opencl_alloc_locked( x264_t *h, int bytes )
3315+{
3316+ if( h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE )
3317+ x264_opencl_flush( h );
3318+ assert( bytes < PAGE_LOCKED_BUF_SIZE );
3319+ char *ptr = h->opencl.page_locked_ptr + h->opencl.pl_occupancy;
3320+ h->opencl.pl_occupancy += bytes;
3321+ return ptr;
3322+}
3323+
3324+int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda )
3325+{
3326+ if( fenc->b_intra_calculated )
3327+ return 0;
3328+ fenc->b_intra_calculated = 1;
3329+
3330+ int luma_length = fenc->i_stride[0] * fenc->i_lines[0];
3331+
3332+#define CREATEBUF( out, flags, size )\
3333+ out = clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\
3334+ if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; }
3335+#define CREATEIMAGE( out, flags, pf, width, height )\
3336+ out = clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\
3337+ if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; }
3338+
3339+ int mb_count = h->mb.i_mb_count;
3340+ cl_int status;
3341+
3342+ if( !h->opencl.lowres_mv_costs )
3343+ {
3344+ /* Allocate shared memory buffers */
3345+ int width = h->mb.i_mb_width * 8 * sizeof(pixel);
3346+ int height = h->mb.i_mb_height * 8 * sizeof(pixel);
3347+
3348+ cl_image_format pixel_format;
3349+ pixel_format.image_channel_order = CL_R;
3350+ pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
3351+ CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );
3352+
3353+ for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
3354+ {
3355+ pixel_format.image_channel_order = CL_RGBA;
3356+ pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
3357+ CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height );
3358+ width >>= 1;
3359+ height >>= 1;
3360+ }
3361+
3362+ CREATEBUF( h->opencl.lowres_mv_costs, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
3363+ CREATEBUF( h->opencl.lowres_costs[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
3364+ CREATEBUF( h->opencl.lowres_costs[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
3365+ CREATEBUF( h->opencl.mv_buffers[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
3366+ CREATEBUF( h->opencl.mv_buffers[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
3367+ CREATEBUF( h->opencl.mvp_buffer, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
3368+ CREATEBUF( h->opencl.frame_stats[0], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
3369+ CREATEBUF( h->opencl.frame_stats[1], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
3370+ CREATEBUF( h->opencl.row_satds[0], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
3371+ CREATEBUF( h->opencl.row_satds[1], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
3372+ CREATEBUF( h->opencl.luma_16x16_image[0], CL_MEM_READ_ONLY, luma_length );
3373+ CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY, luma_length );
3374+ }
3375+
3376+ if( !fenc->opencl.intra_cost )
3377+ {
3378+ /* Allocate per-frame buffers */
3379+ int width = h->mb.i_mb_width * 8 * sizeof(pixel);
3380+ int height = h->mb.i_mb_height * 8 * sizeof(pixel);
3381+
3382+ cl_image_format pixel_format;
3383+ pixel_format.image_channel_order = CL_R;
3384+ pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
3385+ CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );
3386+
3387+ for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
3388+ {
3389+ pixel_format.image_channel_order = CL_RGBA;
3390+ pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
3391+ CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height );
3392+ width >>= 1;
3393+ height >>= 1;
3394+ }
3395+ CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY, mb_count * sizeof(int16_t) );
3396+ CREATEBUF( fenc->opencl.intra_cost, CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) );
3397+ CREATEBUF( fenc->opencl.lowres_mvs0, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
3398+ CREATEBUF( fenc->opencl.lowres_mvs1, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
3399+ CREATEBUF( fenc->opencl.lowres_mv_costs0, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
3400+ CREATEBUF( fenc->opencl.lowres_mv_costs1, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
3401+ }
3402+#undef CREATEBUF
3403+
3404+ /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */
3405+
3406+ char *locked = x264_opencl_alloc_locked( h, luma_length );
3407+ memcpy( locked, fenc->plane[0], luma_length );
3408+ OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL );
3409+
3410+ size_t gdim[2];
3411+ if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor )
3412+ {
3413+ int size = h->mb.i_mb_count * sizeof(int16_t);
3414+ locked = x264_opencl_alloc_locked( h, size );
3415+ memcpy( locked, fenc->i_inv_qscale_factor, size );
3416+ OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL );
3417+ }
3418+ else
3419+ {
3420+ /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */
3421+ cl_uint arg = 0;
3422+ int16_t value = 256;
3423+ OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
3424+ OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value );
3425+ gdim[0] = h->mb.i_mb_count;
3426+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL );
3427+ }
3428+
3429+ int stride = fenc->i_stride[0];
3430+ cl_uint arg = 0;
3431+ OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] );
3432+ OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
3433+ OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel );
3434+ OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride );
3435+ gdim[0] = 8 * h->mb.i_mb_width;
3436+ gdim[1] = 8 * h->mb.i_mb_height;
3437+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL );
3438+
3439+ for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ )
3440+ {
3441+ /* Workaround for AMD Southern Island:
3442+ *
3443+ * Alternate kernel instances. No perf impact to this, so we do it for
3444+ * all GPUs. It prevents the same kernel from being enqueued
3445+ * back-to-back, avoiding a dependency calculation bug in the driver.
3446+ */
3447+ cl_kernel kern = i & 1 ? h->opencl.downscale_kernel1 : h->opencl.downscale_kernel2;
3448+
3449+ arg = 0;
3450+ OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i] );
3451+ OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i+1] );
3452+ gdim[0] >>= 1;
3453+ gdim[1] >>= 1;
3454+ if( gdim[0] < 16 || gdim[1] < 16 )
3455+ break;
3456+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, kern, 2, NULL, gdim, NULL, 0, NULL, NULL );
3457+ }
3458+
3459+ size_t ldim[2];
3460+ gdim[0] = ((h->mb.i_mb_width + 31)>>5)<<5;
3461+ gdim[1] = 8*h->mb.i_mb_height;
3462+ ldim[0] = 32;
3463+ ldim[1] = 8;
3464+ arg = 0;
3465+
3466+ /* For presets slow, slower, and placebo, check all 10 intra modes that the
3467+ * C lookahead supports. For faster presets, only check the most frequent 8
3468+ * modes.
3469+ */
3470+ int slow = h->param.analyse.i_subpel_refine > 7;
3471+ OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
3472+ OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
3473+ OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
3474+ OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &lambda );
3475+ OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
3476+ OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &slow );
3477+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
3478+
3479+ gdim[0] = 256;
3480+ gdim[1] = h->mb.i_mb_height;
3481+ ldim[0] = 256;
3482+ ldim[1] = 1;
3483+ arg = 0;
3484+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
3485+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
3486+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
3487+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
3488+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
3489+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
3490+
3491+ if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
3492+ x264_opencl_flush( h );
3493+
3494+ int size = h->mb.i_mb_count * sizeof(int16_t);
3495+ locked = x264_opencl_alloc_locked( h, size );
3496+ OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.intra_cost, CL_FALSE, 0, size, locked, 0, NULL, NULL );
3497+ h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[0][0];
3498+ h->opencl.copies[h->opencl.num_copies].src = locked;
3499+ h->opencl.copies[h->opencl.num_copies].bytes = size;
3500+ h->opencl.num_copies++;
3501+
3502+ size = h->mb.i_mb_height * sizeof(int);
3503+ locked = x264_opencl_alloc_locked( h, size );
3504+ OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
3505+ h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[0][0];
3506+ h->opencl.copies[h->opencl.num_copies].src = locked;
3507+ h->opencl.copies[h->opencl.num_copies].bytes = size;
3508+ h->opencl.num_copies++;
3509+
3510+ size = sizeof(int) * 4;
3511+ locked = x264_opencl_alloc_locked( h, size );
3512+ OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
3513+ h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[0][0];
3514+ h->opencl.copies[h->opencl.num_copies].src = locked;
3515+ h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
3516+ h->opencl.num_copies++;
3517+ h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[0][0];
3518+ h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
3519+ h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
3520+ h->opencl.num_copies++;
3521+
3522+ h->opencl.last_buf = !h->opencl.last_buf;
3523+ return 0;
3524+}
3525+
3526+/* This function was tested empirically on a number of AMD and NV GPUs. Making a
3527+ * function which returns perfect launch dimensions is impossible; some
3528+ * applications will have self-tuning code to try many possible variables and
3529+ * measure the runtime. Here we simply make an educated guess based on what we
3530+ * know GPUs typically prefer.
3531+ */
3532+static void x264_optimal_launch_dims( size_t *gdims, size_t *ldims, const cl_kernel kernel, const cl_device_id device )
3533+{
3534+ size_t max_work_group = 256; /* reasonable defaults for OpenCL 1.0 devices, the APIs below may fail */
3535+ size_t preferred_multiple = 64;
3536+ cl_uint num_cus = 6;
3537+
3538+ clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group, NULL );
3539+ clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_multiple, NULL );
3540+ clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &num_cus, NULL );
3541+
3542+ ldims[0] = preferred_multiple;
3543+ ldims[1] = 8;
3544+
3545+ /* make ldims[1] an even divisor of gdims[1] */
3546+ while( gdims[1] & (ldims[1] - 1) )
3547+ {
3548+ ldims[0] <<= 1;
3549+ ldims[1] >>= 1;
3550+ }
3551+ /* make total ldims fit under the max work-group dimensions for the device */
3552+ while( ldims[0] * ldims[1] > max_work_group )
3553+ {
3554+ if( (ldims[0] <= preferred_multiple) && (ldims[1] > 1) )
3555+ ldims[1] >>= 1;
3556+ else
3557+ ldims[0] >>= 1;
3558+ }
3559+
3560+ if( ldims[0] > gdims[0] )
3561+ {
3562+ /* remove preferred multiples until we're close to gdims[0] */
3563+ while( gdims[0] + preferred_multiple < ldims[0] )
3564+ ldims[0] -= preferred_multiple;
3565+ gdims[0] = ldims[0];
3566+ }
3567+ else
3568+ {
3569+ /* make gdims an even multiple of ldims */
3570+ gdims[0] = (gdims[0]+ldims[0]-1)/ldims[0];
3571+ gdims[0] *= ldims[0];
3572+ }
3573+
3574+ /* make ldims smaller to spread work across compute units */
3575+ while( (gdims[0]/ldims[0]) * (gdims[1]/ldims[1]) * 2 <= num_cus )
3576+ {
3577+ if( ldims[0] > preferred_multiple )
3578+ ldims[0] >>= 1;
3579+ else if( ldims[1] > 1 )
3580+ ldims[1] >>= 1;
3581+ else
3582+ break;
3583+ }
3584+ /* for smaller GPUs, try not to abuse their texture cache */
3585+ if( num_cus == 6 && ldims[0] == 64 && ldims[1] == 4 )
3586+ ldims[0] = 32;
3587+}
3588+
3589+int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w )
3590+{
3591+ x264_frame_t *fenc = frames[b];
3592+ x264_frame_t *fref = frames[ref];
3593+
3594+ cl_mem ref_scaled_images[NUM_IMAGE_SCALES];
3595+ cl_mem ref_luma_hpel;
3596+ cl_int status;
3597+
3598+ if( w->weightfn )
3599+ {
3600+ size_t gdims[2];
3601+
3602+ gdims[0] = 8 * h->mb.i_mb_width;
3603+ gdims[1] = 8 * h->mb.i_mb_height;
3604+
3605+ /* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */
3606+ for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
3607+ {
3608+ cl_uint arg = 0;
3609+ OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &fref->opencl.scaled_image2Ds[i] );
3610+ OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_scaled_images[i] );
3611+ OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_offset );
3612+ OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_scale );
3613+ OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_denom );
3614+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_scaled_images_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
3615+
3616+ gdims[0] >>= 1;
3617+ gdims[1] >>= 1;
3618+ if( gdims[0] < 16 || gdims[1] < 16 )
3619+ break;
3620+ }
3621+
3622+ cl_uint arg = 0;
3623+ gdims[0] = 8 * h->mb.i_mb_width;
3624+ gdims[1] = 8 * h->mb.i_mb_height;
3625+
3626+ OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &fref->opencl.luma_hpel );
3627+ OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_luma_hpel );
3628+ OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_offset );
3629+ OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_scale );
3630+ OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_denom );
3631+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_hpel_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
3632+
3633+ /* Use weighted reference planes for motion search */
3634+ for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
3635+ ref_scaled_images[i] = h->opencl.weighted_scaled_images[i];
3636+ ref_luma_hpel = h->opencl.weighted_luma_hpel;
3637+ }
3638+ else
3639+ {
3640+ /* Use unweighted reference planes for motion search */
3641+ for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
3642+ ref_scaled_images[i] = fref->opencl.scaled_image2Ds[i];
3643+ ref_luma_hpel = fref->opencl.luma_hpel;
3644+ }
3645+
3646+ const int num_iterations[NUM_IMAGE_SCALES] = { 1, 1, 2, 3 };
3647+ int b_first_iteration = 1;
3648+ int b_reverse_references = 1;
3649+ int A = 1;
3650+
3651+
3652+ int mb_per_group = 0;
3653+ int cost_local_size = 0;
3654+ int mvc_local_size = 0;
3655+ int mb_width;
3656+
3657+ size_t gdims[2];
3658+ size_t ldims[2];
3659+
3660+ /* scale 0 is 8x8 */
3661+ for( int scale = NUM_IMAGE_SCALES-1; scale >= 0; scale-- )
3662+ {
3663+ mb_width = h->mb.i_mb_width >> scale;
3664+ gdims[0] = mb_width;
3665+ gdims[1] = h->mb.i_mb_height >> scale;
3666+ if( gdims[0] < 2 || gdims[1] < 2 )
3667+ continue;
3668+ gdims[0] <<= 2;
3669+ x264_optimal_launch_dims( gdims, ldims, h->opencl.hme_kernel, h->opencl.device );
3670+
3671+ mb_per_group = (ldims[0] >> 2) * ldims[1];
3672+ cost_local_size = 4 * mb_per_group * sizeof(int16_t);
3673+ mvc_local_size = 4 * mb_per_group * sizeof(int16_t) * 2;
3674+ int scaled_me_range = h->param.analyse.i_me_range >> scale;
3675+ int b_shift_index = 1;
3676+
3677+ cl_uint arg = 0;
3678+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[scale] );
3679+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &ref_scaled_images[scale] );
3680+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
3681+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
3682+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
3683+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), (void*)&h->opencl.mvp_buffer );
3684+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, cost_local_size, NULL );
3685+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, mvc_local_size, NULL );
3686+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &mb_width );
3687+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &lambda );
3688+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scaled_me_range );
3689+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scale );
3690+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_shift_index );
3691+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_first_iteration );
3692+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_reverse_references );
3693+
3694+ for( int iter = 0; iter < num_iterations[scale]; iter++ )
3695+ {
3696+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.hme_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
3697+
3698+ b_shift_index = 0;
3699+ b_first_iteration = 0;
3700+
3701+ /* alternate top-left vs bot-right MB references at lower scales, so
3702+ * motion field smooths more quickly.
3703+ */
3704+ if( scale > 2 )
3705+ b_reverse_references ^= 1;
3706+ else
3707+ b_reverse_references = 0;
3708+ A = !A;
3709+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 2, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
3710+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 3, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
3711+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 3, sizeof(int), &b_shift_index );
3712+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 2, sizeof(int), &b_first_iteration );
3713+ OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 1, sizeof(int), &b_reverse_references );
3714+ }
3715+ }
3716+
3717+ int satd_local_size = mb_per_group * sizeof(uint32_t) * 16;
3718+ cl_uint arg = 0;
3719+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
3720+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &ref_luma_hpel );
3721+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
3722+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
3723+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, cost_local_size, NULL );
3724+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, satd_local_size, NULL );
3725+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, mvc_local_size, NULL );
3726+
3727+ if( b_islist1 )
3728+ {
3729+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
3730+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
3731+ }
3732+ else
3733+ {
3734+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
3735+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
3736+ }
3737+
3738+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &mb_width );
3739+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &lambda );
3740+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b );
3741+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &ref );
3742+ OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b_islist1 );
3743+
3744+ if( h->opencl.b_device_AMD_SI )
3745+ {
3746+ /* workaround for AMD Southern Island, perform meaningless small copy */
3747+ OCLCHECK( clEnqueueCopyBuffer, h->opencl.queue, h->opencl.mv_buffers[A], h->opencl.mv_buffers[!A], 0, 0, 20, 0, NULL, NULL );
3748+ }
3749+
3750+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.subpel_refine_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
3751+
3752+ int mvlen = 2 * sizeof(int16_t) * h->mb.i_mb_count;
3753+
3754+ if( h->opencl.num_copies >= MAX_FINISH_COPIES - 1 )
3755+ x264_opencl_flush( h );
3756+
3757+ char *locked = x264_opencl_alloc_locked( h, mvlen );
3758+ h->opencl.copies[h->opencl.num_copies].src = locked;
3759+ h->opencl.copies[h->opencl.num_copies].bytes = mvlen;
3760+
3761+ if( b_islist1 )
3762+ {
3763+ int mvs_offset = mvlen * (ref - b - 1);
3764+ OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs1, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
3765+ h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[1][ref - b - 1];
3766+ }
3767+ else
3768+ {
3769+ int mvs_offset = mvlen * (b - ref - 1);
3770+ OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs0, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
3771+ h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[0][b - ref - 1];
3772+ }
3773+
3774+ h->opencl.num_copies++;
3775+
3776+ return 0;
3777+}
3778+
3779+int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor )
3780+{
3781+ cl_int status;
3782+ x264_frame_t *fenc = frames[b];
3783+ x264_frame_t *fref0 = frames[p0];
3784+ x264_frame_t *fref1 = frames[p1];
3785+
3786+ int bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor >> 2) : 32;
3787+
3788+ /* Tasks for this kernel:
3789+ * 1. Select least cost mode (intra, ref0, ref1)
3790+ * list_used 0, 1, 2, or 3. if B frame, do not allow intra
3791+ * 2. if B frame, try bidir predictions.
3792+ * 3. lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
3793+ */
3794+ size_t gdims[2] = { h->mb.i_mb_width, h->mb.i_mb_height };
3795+ size_t ldim_bidir[2];
3796+ size_t *ldims = NULL;
3797+ int cost_local_size = 4;
3798+ int satd_local_size = 4;
3799+ if( b < p1 )
3800+ {
3801+ /* For B frames, use 4 threads per MB for BIDIR checks */
3802+ ldims = ldim_bidir;
3803+ gdims[0] <<= 2;
3804+ x264_optimal_launch_dims( gdims, ldims, h->opencl.mode_select_kernel, h->opencl.device );
3805+ int mb_per_group = (ldims[0] >> 2) * ldims[1];
3806+ cost_local_size = 4 * mb_per_group * sizeof(int16_t);
3807+ satd_local_size = 16 * mb_per_group * sizeof(uint32_t);
3808+ }
3809+
3810+ cl_uint arg = 0;
3811+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
3812+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref0->opencl.luma_hpel );
3813+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.luma_hpel );
3814+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
3815+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
3816+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.lowres_mvs0 );
3817+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
3818+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
3819+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
3820+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
3821+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
3822+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, cost_local_size, NULL );
3823+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, satd_local_size, NULL );
3824+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
3825+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &bipred_weight );
3826+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &dist_scale_factor );
3827+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &b );
3828+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p0 );
3829+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p1 );
3830+ OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &lambda );
3831+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.mode_select_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
3832+
3833+ /* Sum costs across rows, atomicAdd down frame */
3834+ size_t gdim[2] = { 256, h->mb.i_mb_height };
3835+ size_t ldim[2] = { 256, 1 };
3836+
3837+ arg = 0;
3838+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
3839+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
3840+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
3841+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
3842+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
3843+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->param.i_bframe_bias );
3844+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &b );
3845+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p0 );
3846+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p1 );
3847+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_inter_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
3848+
3849+ if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
3850+ x264_opencl_flush( h );
3851+
3852+ int size = h->mb.i_mb_count * sizeof(int16_t);
3853+ char *locked = x264_opencl_alloc_locked( h, size );
3854+ h->opencl.copies[h->opencl.num_copies].src = locked;
3855+ h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[b - p0][p1 - b];
3856+ h->opencl.copies[h->opencl.num_copies].bytes = size;
3857+ OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.lowres_costs[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
3858+ h->opencl.num_copies++;
3859+
3860+ size = h->mb.i_mb_height * sizeof(int);
3861+ locked = x264_opencl_alloc_locked( h, size );
3862+ h->opencl.copies[h->opencl.num_copies].src = locked;
3863+ h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[b - p0][p1 - b];
3864+ h->opencl.copies[h->opencl.num_copies].bytes = size;
3865+ OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
3866+ h->opencl.num_copies++;
3867+
3868+ size = 4 * sizeof(int);
3869+ locked = x264_opencl_alloc_locked( h, size );
3870+ OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
3871+ h->opencl.last_buf = !h->opencl.last_buf;
3872+
3873+ h->opencl.copies[h->opencl.num_copies].src = locked;
3874+ h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[b - p0][p1 - b];
3875+ h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
3876+ h->opencl.num_copies++;
3877+ h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
3878+ h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[b - p0][p1 - b];
3879+ h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
3880+ h->opencl.num_copies++;
3881+
3882+ if( b == p1 ) // P frames only
3883+ {
3884+ h->opencl.copies[h->opencl.num_copies].src = locked + 2 * sizeof(int);
3885+ h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_intra_mbs[b - p0];
3886+ h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
3887+ h->opencl.num_copies++;
3888+ }
3889+ return 0;
3890+}
3891+
3892+int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b )
3893+{
3894+ if( (frames[b]->i_cost_est[b-p0][p1-b] >= 0) || (b == p0 && b == p1) )
3895+ return 0;
3896+ else
3897+ {
3898+ int do_search[2];
3899+ int dist_scale_factor = 128;
3900+ const x264_weight_t *w = x264_weight_none;
3901+
3902+ // avoid duplicating work
3903+ frames[b]->i_cost_est[b-p0][p1-b] = 0;
3904+
3905+ do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
3906+ do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
3907+ if( do_search[0] )
3908+ {
3909+ if( h->param.analyse.i_weighted_pred && b == p1 )
3910+ {
3911+ x264_emms();
3912+ x264_weights_analyse( h, frames[b], frames[p0], 1 );
3913+ w = frames[b]->weight[0];
3914+ }
3915+ frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
3916+ }
3917+ if( do_search[1] )
3918+ frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
3919+ if( b == p1 )
3920+ frames[b]->i_intra_mbs[b-p0] = 0;
3921+ if( p1 != p0 )
3922+ dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
3923+
3924+ frames[b]->i_cost_est[b-p0][p1-b] = 0;
3925+ frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;
3926+
3927+ x264_opencl_lowres_init( h, frames[b], lambda );
3928+
3929+ if( do_search[0] )
3930+ {
3931+ x264_opencl_lowres_init( h, frames[p0], lambda );
3932+ x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
3933+ }
3934+ if( do_search[1] )
3935+ {
3936+ x264_opencl_lowres_init( h, frames[p1], lambda );
3937+ x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, w );
3938+ }
3939+ x264_opencl_finalize_cost( h, lambda, frames, p0, p1, b, dist_scale_factor );
3940+ return 1;
3941+ }
3942+}
3943+
3944+#endif
3945diff -r bdffc2c1e85b -r e5e4a79d3f21 encoder/slicetype.c
3946--- a/encoder/slicetype.c Wed Jul 18 08:33:41 2012 -0700
3947+++ b/encoder/slicetype.c Mon Aug 20 22:33:23 2012 -0500
3948@@ -36,6 +36,19 @@
3949 x264_frame_t **frames, int p0, int p1, int b,
3950 int b_intra_penalty );
3951
3952+void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );
3953+
3954+#if HAVE_OPENCL
3955+#if _WIN32
3956+#include "windows.h"
3957+#endif
3958+int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda );
3959+int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w );
3960+int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor );
3961+int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b );
3962+void x264_opencl_flush( x264_t *h );
3963+#endif
3964+
3965 static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
3966 {
3967 a->i_qp = X264_LOOKAHEAD_QP;
3968@@ -276,7 +289,7 @@
3969 return cost;
3970 }
3971
3972-static void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
3973+void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
3974 {
3975 int i_delta_index = fenc->i_frame - ref->i_frame - 1;
3976 /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
3977@@ -793,96 +806,120 @@
3978 output_inter[0] = h->scratch_buffer2;
3979 output_intra[0] = output_inter[0] + output_buf_size;
3980
3981- if( h->param.i_lookahead_threads > 1 )
3982+#if HAVE_OPENCL
3983+ if( h->param.b_opencl )
3984 {
3985- x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX];
3986+ x264_opencl_lowres_init(h, fenc, a->i_lambda );
3987+ if( do_search[0] )
3988+ {
3989+ x264_opencl_lowres_init( h, frames[p0], a->i_lambda );
3990+ x264_opencl_motionsearch( h, frames, b, p0, 0, a->i_lambda, w );
3991+ }
3992+ if( do_search[1] )
3993+ {
3994+ x264_opencl_lowres_init( h, frames[p1], a->i_lambda );
3995+ x264_opencl_motionsearch( h, frames, b, p1, 1, a->i_lambda, w );
3996+ }
3997+ if( b != p0 )
3998+ x264_opencl_finalize_cost( h, a->i_lambda, frames, p0, p1, b, dist_scale_factor );
3999+ x264_opencl_flush( h );
4000
4001+ i_score = fenc->i_cost_est[b-p0][p1-b];
4002+ }
4003+ else
4004+#endif
4005+ {
4006+ if( h->param.i_lookahead_threads > 1 )
4007+ {
4008+ x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX];
4009+
4010+ for( int i = 0; i < h->param.i_lookahead_threads; i++ )
4011+ {
4012+ x264_t *t = h->lookahead_thread[i];
4013+
4014+ /* FIXME move this somewhere else */
4015+ t->mb.i_me_method = h->mb.i_me_method;
4016+ t->mb.i_subpel_refine = h->mb.i_subpel_refine;
4017+ t->mb.b_chroma_me = h->mb.b_chroma_me;
4018+
4019+ s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
4020+ output_inter[i], output_intra[i] };
4021+
4022+ t->i_threadslice_start = ((h->mb.i_mb_height * i + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
4023+ t->i_threadslice_end = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
4024+
4025+ int thread_height = t->i_threadslice_end - t->i_threadslice_start;
4026+ int thread_output_size = thread_height + NUM_INTS;
4027+ memset( output_inter[i], 0, thread_output_size * sizeof(int) );
4028+ memset( output_intra[i], 0, thread_output_size * sizeof(int) );
4029+ output_inter[i][NUM_ROWS] = output_intra[i][NUM_ROWS] = thread_height;
4030+
4031+ output_inter[i+1] = output_inter[i] + thread_output_size + PAD_SIZE;
4032+ output_intra[i+1] = output_intra[i] + thread_output_size + PAD_SIZE;
4033+
4034+ x264_threadpool_run( h->lookaheadpool, (void*)x264_slicetype_slice_cost, &s[i] );
4035+ }
4036+ for( int i = 0; i < h->param.i_lookahead_threads; i++ )
4037+ x264_threadpool_wait( h->lookaheadpool, &s[i] );
4038+ }
4039+ else
4040+ {
4041+ h->i_threadslice_start = 0;
4042+ h->i_threadslice_end = h->mb.i_mb_height;
4043+ memset( output_inter[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
4044+ memset( output_intra[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
4045+ output_inter[0][NUM_ROWS] = output_intra[0][NUM_ROWS] = h->mb.i_mb_height;
4046+ x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
4047+ output_inter[0], output_intra[0] };
4048+ x264_slicetype_slice_cost( &s );
4049+ }
4050+
4051+ /* Sum up accumulators */
4052+ if( b == p1 )
4053+ fenc->i_intra_mbs[b-p0] = 0;
4054+ if( !fenc->b_intra_calculated )
4055+ {
4056+ fenc->i_cost_est[0][0] = 0;
4057+ fenc->i_cost_est_aq[0][0] = 0;
4058+ }
4059+ fenc->i_cost_est[b-p0][p1-b] = 0;
4060+ fenc->i_cost_est_aq[b-p0][p1-b] = 0;
4061+
4062+ int *row_satd_inter = fenc->i_row_satds[b-p0][p1-b];
4063+ int *row_satd_intra = fenc->i_row_satds[0][0];
4064 for( int i = 0; i < h->param.i_lookahead_threads; i++ )
4065 {
4066- x264_t *t = h->lookahead_thread[i];
4067+ if( b == p1 )
4068+ fenc->i_intra_mbs[b-p0] += output_inter[i][INTRA_MBS];
4069+ if( !fenc->b_intra_calculated )
4070+ {
4071+ fenc->i_cost_est[0][0] += output_intra[i][COST_EST];
4072+ fenc->i_cost_est_aq[0][0] += output_intra[i][COST_EST_AQ];
4073+ }
4074
4075- /* FIXME move this somewhere else */
4076- t->mb.i_me_method = h->mb.i_me_method;
4077- t->mb.i_subpel_refine = h->mb.i_subpel_refine;
4078- t->mb.b_chroma_me = h->mb.b_chroma_me;
4079+ fenc->i_cost_est[b-p0][p1-b] += output_inter[i][COST_EST];
4080+ fenc->i_cost_est_aq[b-p0][p1-b] += output_inter[i][COST_EST_AQ];
4081
4082- s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
4083- output_inter[i], output_intra[i] };
4084-
4085- t->i_threadslice_start = ((h->mb.i_mb_height * i + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
4086- t->i_threadslice_end = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
4087-
4088- int thread_height = t->i_threadslice_end - t->i_threadslice_start;
4089- int thread_output_size = thread_height + NUM_INTS;
4090- memset( output_inter[i], 0, thread_output_size * sizeof(int) );
4091- memset( output_intra[i], 0, thread_output_size * sizeof(int) );
4092- output_inter[i][NUM_ROWS] = output_intra[i][NUM_ROWS] = thread_height;
4093-
4094- output_inter[i+1] = output_inter[i] + thread_output_size + PAD_SIZE;
4095- output_intra[i+1] = output_intra[i] + thread_output_size + PAD_SIZE;
4096-
4097- x264_threadpool_run( h->lookaheadpool, (void*)x264_slicetype_slice_cost, &s[i] );
4098- }
4099- for( int i = 0; i < h->param.i_lookahead_threads; i++ )
4100- x264_threadpool_wait( h->lookaheadpool, &s[i] );
4101- }
4102- else
4103- {
4104- h->i_threadslice_start = 0;
4105- h->i_threadslice_end = h->mb.i_mb_height;
4106- memset( output_inter[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
4107- memset( output_intra[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
4108- output_inter[0][NUM_ROWS] = output_intra[0][NUM_ROWS] = h->mb.i_mb_height;
4109- x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
4110- output_inter[0], output_intra[0] };
4111- x264_slicetype_slice_cost( &s );
4112- }
4113-
4114- /* Sum up accumulators */
4115- if( b == p1 )
4116- fenc->i_intra_mbs[b-p0] = 0;
4117- if( !fenc->b_intra_calculated )
4118- {
4119- fenc->i_cost_est[0][0] = 0;
4120- fenc->i_cost_est_aq[0][0] = 0;
4121- }
4122- fenc->i_cost_est[b-p0][p1-b] = 0;
4123- fenc->i_cost_est_aq[b-p0][p1-b] = 0;
4124-
4125- int *row_satd_inter = fenc->i_row_satds[b-p0][p1-b];
4126- int *row_satd_intra = fenc->i_row_satds[0][0];
4127- for( int i = 0; i < h->param.i_lookahead_threads; i++ )
4128- {
4129- if( b == p1 )
4130- fenc->i_intra_mbs[b-p0] += output_inter[i][INTRA_MBS];
4131- if( !fenc->b_intra_calculated )
4132- {
4133- fenc->i_cost_est[0][0] += output_intra[i][COST_EST];
4134- fenc->i_cost_est_aq[0][0] += output_intra[i][COST_EST_AQ];
4135+ if( h->param.rc.i_vbv_buffer_size )
4136+ {
4137+ int row_count = output_inter[i][NUM_ROWS];
4138+ memcpy( row_satd_inter, output_inter[i] + NUM_INTS, row_count * sizeof(int) );
4139+ if( !fenc->b_intra_calculated )
4140+ memcpy( row_satd_intra, output_intra[i] + NUM_INTS, row_count * sizeof(int) );
4141+ row_satd_inter += row_count;
4142+ row_satd_intra += row_count;
4143+ }
4144 }
4145
4146- fenc->i_cost_est[b-p0][p1-b] += output_inter[i][COST_EST];
4147- fenc->i_cost_est_aq[b-p0][p1-b] += output_inter[i][COST_EST_AQ];
4148+ i_score = fenc->i_cost_est[b-p0][p1-b];
4149+ if( b != p1 )
4150+ i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
4151+ else
4152+ fenc->b_intra_calculated = 1;
4153
4154- if( h->param.rc.i_vbv_buffer_size )
4155- {
4156- int row_count = output_inter[i][NUM_ROWS];
4157- memcpy( row_satd_inter, output_inter[i] + NUM_INTS, row_count * sizeof(int) );
4158- if( !fenc->b_intra_calculated )
4159- memcpy( row_satd_intra, output_intra[i] + NUM_INTS, row_count * sizeof(int) );
4160- row_satd_inter += row_count;
4161- row_satd_intra += row_count;
4162- }
4163+ fenc->i_cost_est[b-p0][p1-b] = i_score;
4164+ x264_emms();
4165 }
4166-
4167- i_score = fenc->i_cost_est[b-p0][p1-b];
4168- if( b != p1 )
4169- i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
4170- else
4171- fenc->b_intra_calculated = 1;
4172-
4173- fenc->i_cost_est[b-p0][p1-b] = i_score;
4174- x264_emms();
4175 }
4176
4177 if( b_intra_penalty )
4178@@ -1447,6 +1484,62 @@
4179 return;
4180 }
4181
4182+#if HAVE_OPENCL
4183+
4184+#if _WIN32
4185+ int my_pri = THREAD_PRIORITY_NORMAL, ocl_pri = THREAD_PRIORITY_NORMAL;
4186+
4187+ /* We define CL_QUEUE_THREAD_HANDLE_AMD here because it is not
4188+ * defined in the OpenCL headers shipped with NVIDIA drivers. We need
4189+ * to be able to compile on an NVIDIA machine and run optimally on an
4190+ * AMD GPU.
4191+ */
4192+#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E
4193+#endif
4194+
4195+ if( h->param.b_opencl )
4196+ {
4197+#if _WIN32
4198+ /* Temporarily boost priority of this lookahead thread and the OpenCL
4199+ * driver's thread until the end of this function. On AMD GPUs this
4200+ * greatly reduces the latency of enqueuing kernels and getting results
4201+ * on Windows.
4202+ */
4203+ HANDLE id = GetCurrentThread();
4204+ my_pri = GetThreadPriority( id );
4205+ SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
4206+ cl_int status = clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
4207+ if( status == CL_SUCCESS )
4208+ {
4209+ ocl_pri = GetThreadPriority( id );
4210+ SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
4211+ }
4212+#endif
4213+
4214+ int b_work_done = 0;
4215+
4216+ /* precalculate intra and I frames */
4217+ for( int i = 0; i <= num_frames; i++ )
4218+ {
4219+ b_work_done |= x264_opencl_lowres_init( h, frames[i], a.i_lambda );
4220+ }
4221+
4222+ /* Because of weightp, we cannot precalculate P frame costs or backwards
4223+ * motion searches. And because motion searches are never
4224+     * re-calculated, we can't do non-weightp searches for B frames either.
4225+     * So we essentially do all cost calculations synchronously as the
4226+ * lookahead thread determines it needs them. If some enterprising
4227+ * individual were to do weightp analysis on the GPU, we could do more
4228+ * pre-calculations asynchronously.
4229+ */
4230+ if( b_work_done )
4231+ {
4232+ /* weightp estimation requires intra costs to be on the CPU */
4233+ x264_opencl_flush( h );
4234+ }
4235+ }
4236+#endif
4237+
4238 if( h->param.i_bframe )
4239 {
4240 if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
4241@@ -1480,6 +1573,18 @@
4242 continue;
4243 }
4244
4245+#if HAVE_OPENCL
4246+ if( h->param.b_opencl )
4247+ {
4248+ int b_work_done = 0;
4249+ b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+0, i+2, i+1 );
4250+ b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+0, i+1, i+1 );
4251+ b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+1, i+2, i+2 );
4252+ if( b_work_done )
4253+ x264_opencl_flush( h );
4254+ }
4255+#endif
4256+
4257 cost1b1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+1, 0 );
4258 cost1p0 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+1, i+1, 0 );
4259 cost2p0 = x264_slicetype_frame_cost( h, &a, frames, i+1, i+2, i+2, 0 );
4260@@ -1562,6 +1667,19 @@
4261 /* Restore frametypes for all frames that haven't actually been decided yet. */
4262 for( int j = reset_start; j <= num_frames; j++ )
4263 frames[j]->i_type = X264_TYPE_AUTO;
4264+
4265+#if HAVE_OPENCL
4266+#if _WIN32
4267+ if( h->param.b_opencl )
4268+ {
4269+ HANDLE id = GetCurrentThread();
4270+ SetThreadPriority( id, my_pri );
4271+ cl_int status = clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
4272+ if( status == CL_SUCCESS )
4273+ SetThreadPriority( id, ocl_pri );
4274+ }
4275+#endif
4276+#endif
4277 }
4278
4279 void x264_slicetype_decide( x264_t *h )
4280diff -r bdffc2c1e85b -r e5e4a79d3f21 x264.c
4281--- a/x264.c Wed Jul 18 08:33:41 2012 -0700
4282+++ b/x264.c Mon Aug 20 22:33:23 2012 -0500
4283@@ -806,6 +806,8 @@
4284 " as opposed to letting them select different algorithms\n" );
4285 H2( " --asm <integer> Override CPU detection\n" );
4286 H2( " --no-asm Disable all CPU optimizations\n" );
4287+ H2( " --opencl Enable use of OpenCL\n" );
4288+ H2( " --clbin-file <string> Specify path of compiled OpenCL kernel cache\n" );
4289 H2( " --visualize Show MB types overlayed on the encoded video\n" );
4290 H2( " --dump-yuv <string> Save reconstructed frames\n" );
4291 H2( " --sps-id <integer> Set SPS and PPS id numbers [%d]\n", defaults->i_sps_id );
4292@@ -910,6 +912,8 @@
4293 { "ref", required_argument, NULL, 'r' },
4294 { "asm", required_argument, NULL, 0 },
4295 { "no-asm", no_argument, NULL, 0 },
4296+ { "opencl", no_argument, NULL, 1 },
4297+ { "clbin-file", required_argument, NULL, 0 },
4298 { "sar", required_argument, NULL, 0 },
4299 { "fps", required_argument, NULL, OPT_FPS },
4300 { "frames", required_argument, NULL, OPT_FRAMES },
4301diff -r bdffc2c1e85b -r e5e4a79d3f21 x264.h
4302--- a/x264.h Wed Jul 18 08:33:41 2012 -0700
4303+++ b/x264.h Mon Aug 20 22:33:23 2012 -0500
4304@@ -41,7 +41,7 @@
4305
4306 #include "x264_config.h"
4307
4308-#define X264_BUILD 125
4309+#define X264_BUILD 126
4310
4311 /* Application developers planning to link against a shared library version of
4312 * libx264 from a Microsoft Visual Studio or similar development environment
4313@@ -463,6 +463,9 @@
4314
4315 int b_fake_interlaced;
4316
4317+ int b_opencl; /* use OpenCL when available */
4318+ char *psz_clbin_file; /* compiled OpenCL kernel cache file */
4319+
4320 /* Slicing parameters */
4321 int i_slice_max_size; /* Max size per slice in bytes; includes estimated NAL overhead. */
4322 int i_slice_max_mbs; /* Max number of MBs per slice; overrides i_slice_count. */