# DeepFaceLive/xlib/avecl/_internal/HKernel.py

import numpy as np


class HKernel:
    """
    Helper functions for Kernels
    """
    _np_dtype_to_cl = { np.bool_   : 'bool',
                        np.int8    : 'char',
                        np.uint8   : 'uchar',
                        np.int16   : 'short',
                        np.uint16  : 'ushort',
                        np.int32   : 'int',
                        np.uint32  : 'uint',
                        np.int64   : 'long',
                        np.uint64  : 'ulong',
                        np.float16 : 'half',
                        np.float32 : 'float'
                      }

    @staticmethod
    def np_dtype_to_cl(dtype : np.dtype):
        """
        returns string opencl type from numpy dtype

        example np.float32 -> 'float'
                np.uint8   -> 'uchar'
        """
        return HKernel._np_dtype_to_cl[np.dtype(dtype).type]

    @staticmethod
    def define_scalar_func_arg(name, dtype : np.dtype):
        """
        returns an OpenCL kernel argument declaration for a scalar value

        example ('alpha', np.float32) -> 'float alpha'
        """
        return f'{HKernel._np_dtype_to_cl[np.dtype(dtype).type]} {name}'

    @staticmethod
    def define_tensor_type(name, dtype : np.dtype):
        """
        Returns definitions for operations with a tensor of the given dtype

        example for 'O', np.float16:

        #define O_PTR_NAME p_O
        #define O_PTR_TYPE   half
        #define O_PTR_TYPE2  half2
        #define O_PTR_TYPE3  half3
        #define O_PTR_TYPE4  half4
        #define O_PTR_TYPE8  half8
        #define O_PTR_TYPE16 half16
        #define O_TYPE   float
        #define O_TYPE2  float2
        #define O_TYPE3  float3
        #define O_TYPE4  float4
        #define O_TYPE8  float8
        #define O_TYPE16 float16
        #define O_GLOBAL_LOAD(offset)   vload_half  (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_LOAD2(offset)  vload_half2 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_LOAD3(offset)  vload_half3 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_LOAD4(offset)  vload_half4 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_LOAD8(offset)  vload_half8 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_LOAD16(offset) vload_half16(0, (const __global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_STORE(offset,value)   vstore_half  ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_STORE2(offset,value)  vstore_half2 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_STORE3(offset,value)  vstore_half3 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_STORE4(offset,value)  vstore_half4 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_STORE8(offset,value)  vstore_half8 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_STORE16(offset,value) vstore_half16( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
        #define O_TO_FLOATX(x) ((float)x)
        """
        name_upper = name.upper()
        dtype = np.dtype(dtype).type

        out = [f'#define {name_upper}_PTR_NAME p_{name_upper}']

        if dtype == np.float16:
            out += [f'#define {name_upper}_PTR_TYPE   half']
            out += [f'#define {name_upper}_PTR_TYPE2  half2']
            out += [f'#define {name_upper}_PTR_TYPE3  half3']
            out += [f'#define {name_upper}_PTR_TYPE4  half4']
            out += [f'#define {name_upper}_PTR_TYPE8  half8']
            out += [f'#define {name_upper}_PTR_TYPE16 half16']
            out += [f'#define {name_upper}_TYPE   {HKernel.np_dtype_to_cl(np.float32)}']
            out += [f'#define {name_upper}_TYPE2  {HKernel.np_dtype_to_cl(np.float32)}2']
            out += [f'#define {name_upper}_TYPE3  {HKernel.np_dtype_to_cl(np.float32)}3']
            out += [f'#define {name_upper}_TYPE4  {HKernel.np_dtype_to_cl(np.float32)}4']
            out += [f'#define {name_upper}_TYPE8  {HKernel.np_dtype_to_cl(np.float32)}8']
            out += [f'#define {name_upper}_TYPE16 {HKernel.np_dtype_to_cl(np.float32)}16']
            out += [f'#define {name_upper}_GLOBAL_LOAD(offset)   vload_half  (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
            out += [f'#define {name_upper}_GLOBAL_LOAD2(offset)  vload_half2 (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
            out += [f'#define {name_upper}_GLOBAL_LOAD3(offset)  vload_half3 (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
            out += [f'#define {name_upper}_GLOBAL_LOAD4(offset)  vload_half4 (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
            out += [f'#define {name_upper}_GLOBAL_LOAD8(offset)  vload_half8 (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
            out += [f'#define {name_upper}_GLOBAL_LOAD16(offset) vload_half16(0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
            out += [f'#define {name_upper}_GLOBAL_STORE(offset,value)   vstore_half  ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
            out += [f'#define {name_upper}_GLOBAL_STORE2(offset,value)  vstore_half2 ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
            out += [f'#define {name_upper}_GLOBAL_STORE3(offset,value)  vstore_half3 ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
            out += [f'#define {name_upper}_GLOBAL_STORE4(offset,value)  vstore_half4 ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
            out += [f'#define {name_upper}_GLOBAL_STORE8(offset,value)  vstore_half8 ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
            out += [f'#define {name_upper}_GLOBAL_STORE16(offset,value) vstore_half16( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
        else:
            out += [f'#define {name_upper}_PTR_TYPE   {HKernel.np_dtype_to_cl(dtype)}']
            out += [f'#define {name_upper}_PTR_TYPE2  {HKernel.np_dtype_to_cl(dtype)}2']
            out += [f'#define {name_upper}_PTR_TYPE3  {HKernel.np_dtype_to_cl(dtype)}3']
            out += [f'#define {name_upper}_PTR_TYPE4  {HKernel.np_dtype_to_cl(dtype)}4']
            out += [f'#define {name_upper}_PTR_TYPE8  {HKernel.np_dtype_to_cl(dtype)}8']
            out += [f'#define {name_upper}_PTR_TYPE16 {HKernel.np_dtype_to_cl(dtype)}16']
            out += [f'#define {name_upper}_TYPE   {HKernel.np_dtype_to_cl(dtype)}']
            out += [f'#define {name_upper}_TYPE2  {HKernel.np_dtype_to_cl(dtype)}2']
            out += [f'#define {name_upper}_TYPE3  {HKernel.np_dtype_to_cl(dtype)}3']
            out += [f'#define {name_upper}_TYPE4  {HKernel.np_dtype_to_cl(dtype)}4']
            out += [f'#define {name_upper}_TYPE8  {HKernel.np_dtype_to_cl(dtype)}8']
            out += [f'#define {name_upper}_TYPE16 {HKernel.np_dtype_to_cl(dtype)}16']
            out += [f'#define {name_upper}_GLOBAL_LOAD(offset)   {name_upper}_PTR_NAME[(offset)]']
            out += [f'#define {name_upper}_GLOBAL_LOAD2(offset)  {name_upper}_PTR_NAME[(offset)]']
            out += [f'#define {name_upper}_GLOBAL_LOAD3(offset)  {name_upper}_PTR_NAME[(offset)]']
            out += [f'#define {name_upper}_GLOBAL_LOAD4(offset)  {name_upper}_PTR_NAME[(offset)]']
            out += [f'#define {name_upper}_GLOBAL_LOAD8(offset)  {name_upper}_PTR_NAME[(offset)]']
            out += [f'#define {name_upper}_GLOBAL_LOAD16(offset) {name_upper}_PTR_NAME[(offset)]']
            out += [f'#define {name_upper}_GLOBAL_STORE(offset,value)   {name_upper}_PTR_NAME[(offset)] = (value)']
            out += [f'#define {name_upper}_GLOBAL_STORE2(offset,value)  {name_upper}_PTR_NAME[(offset)] = (value)']
            out += [f'#define {name_upper}_GLOBAL_STORE3(offset,value)  {name_upper}_PTR_NAME[(offset)] = (value)']
            out += [f'#define {name_upper}_GLOBAL_STORE4(offset,value)  {name_upper}_PTR_NAME[(offset)] = (value)']
            out += [f'#define {name_upper}_GLOBAL_STORE8(offset,value)  {name_upper}_PTR_NAME[(offset)] = (value)']
            out += [f'#define {name_upper}_GLOBAL_STORE16(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']

        if dtype in [np.float32]:
            out += [f'#define {name_upper}_TO_FLOATX(x) x']
        elif dtype in [np.bool_, np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32, np.float16]:
            out += [f'#define {name_upper}_TO_FLOATX(x) ((float)x)']
        elif dtype in [np.int64, np.uint64]:
            out += [f'#define {name_upper}_TO_FLOATX(x) ((double)x)']

        return '\n'.join(out)
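
    # Illustrative (abridged) expansion of define_tensor_type('I', np.float32):
    # for non-half dtypes the pointer type equals the compute type and
    # loads/stores are plain array accesses:
    #
    #   #define I_PTR_NAME p_I
    #   #define I_PTR_TYPE float
    #   #define I_TYPE float
    #   #define I_GLOBAL_LOAD(offset) I_PTR_NAME[(offset)]
    #   #define I_GLOBAL_STORE(offset,value) I_PTR_NAME[(offset)] = (value)
    #   #define I_TO_FLOATX(x) x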

    @staticmethod
    def define_ndim_idx(ndim):
        """
        defines macros to calculate the flat index for an n-dim shape

        example for ndim=3

        #define NDIM3_IDX(t0,t1,t2,T0,T1,T2) (((size_t)(t0))*T1*T2+((size_t)(t1))*T2+((size_t)(t2)))
        #define NDIM3_IDX_MOD(t0,t1,t2,T0,T1,T2) (( (((size_t)(t0) % T0) + T0) % T0 )*T1*T2+( (((size_t)(t1) % T1) + T1) % T1 )*T2+( (((size_t)(t2) % T2) + T2) % T2 ))
        """
        out = [f'#define NDIM{ndim}_IDX(' + \
               ','.join([f't{i}' for i in range(ndim)] + [f'T{i}' for i in range(ndim)]) + \
               ') (' + '+'.join([f'((size_t)(t{i}))' + ''.join(f'*T{j}' for j in range(i+1,ndim)) for i in range(ndim) ]) + ')']

        out += [f'#define NDIM{ndim}_IDX_MOD(' + \
                ','.join([f't{i}' for i in range(ndim)] + [f'T{i}' for i in range(ndim)]) + \
                ') (' + '+'.join([f'( (((size_t)(t{i}) % T{i}) + T{i}) % T{i} ) ' + ''.join(f'*T{j}' for j in range(i+1,ndim)) for i in range(ndim) ]) + ')']

        return '\n'.join(out)
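
    # Illustrative (approximate) expansion of define_ndim_idx(2):
    #
    #   #define NDIM2_IDX(t0,t1,T0,T1) (((size_t)(t0))*T1+((size_t)(t1)))
    #   #define NDIM2_IDX_MOD(t0,t1,T0,T1) (( (((size_t)(t0) % T0) + T0) % T0 )*T1+( (((size_t)(t1) % T1) + T1) % T1 ))
    #
    # IDX_MOD wraps each index into [0,Ti), so negative coordinates index from the end.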

    @staticmethod
    def define_tensor_shape(name, shape, axes_symbols=None):
        """
        Returns definitions for operations with the tensor shape

        example for 'O', (2,3):

        #define O0 2
        #define O1 3
        #define Om1 3
        #define Om2 2
        #define O_IDX(o0,o1) (((size_t)(o0))*3+((size_t)(o1)))
        #define O_IDX_MOD(o0,o1) (( (( (size_t)(o0) % 2 ) + 2) % 2 )*3+( (( (size_t)(o1) % 3 ) + 3) % 3 ))
        """
        shape = tuple(shape)
        ndim = len(shape)
        name_upper = name.upper()
        name_lower = name.lower()

        if axes_symbols is None:
            axes_symbols = "".join([str(i) for i in range(ndim)])
        axes_symbols = axes_symbols.upper()

        out  = [f'#define {name_upper}{axes_symbols[i]} {shape[i]}' for i in range(ndim)]
        out += [f'#define {name_upper}m{i} {shape[-i]}' for i in range(1,ndim+1)]

        out += [f'#define {name_upper}_IDX({HKernel.axes_seq_enum(name, ndim)}) (' + \
                '+'.join([f'((size_t)({name_lower}{i}))' + ''.join(f'*{shape[j]}' for j in range(i+1,ndim)) for i in range(ndim)]) + ')']

        out += [f'#define {name_upper}_IDX_MOD({HKernel.axes_seq_enum(name, ndim)}) (' + \
                '+'.join([f'( (( (size_t)({name_lower}{i}) % {shape[i]} ) + {shape[i]}) % {shape[i]} )' + ''.join(f'*{shape[j]}' for j in range(i+1,ndim)) for i in range(ndim)]) + ')']

        return '\n'.join(out)
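
    # Illustrative (approximate) expansion of define_tensor_shape('I', (2,3), axes_symbols='HW'):
    #
    #   #define IH 2
    #   #define IW 3
    #   #define Im1 3
    #   #define Im2 2
    #   #define I_IDX(i0,i1) (((size_t)(i0))*3+((size_t)(i1)))
    #   #define I_IDX_MOD(i0,i1) (( (( (size_t)(i0) % 2 ) + 2) % 2 )*3+( (( (size_t)(i1) % 3 ) + 3) % 3 ))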

    @staticmethod
    def define_tensor(name, shape, dtype : np.dtype, axes_symbols=None):
        """
        Returns definitions for operations with a tensor

        arguments

            name                text

            shape               Iterable

            dtype               np.dtype

            axes_symbols(None)  string of symbols.
                                None -> numeric symbols will be used

        example for 'O', (2,4), np.float16

        #define O_PTR_NAME p_O
        #define O_PTR_TYPE   half
        #define O_PTR_TYPE2  half2
        #define O_PTR_TYPE3  half3
        #define O_PTR_TYPE4  half4
        #define O_PTR_TYPE8  half8
        #define O_PTR_TYPE16 half16
        #define O_TYPE   float
        #define O_TYPE2  float2
        #define O_TYPE3  float3
        #define O_TYPE4  float4
        #define O_TYPE8  float8
        #define O_TYPE16 float16
        #define O_GLOBAL_LOAD(offset)   vload_half  (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_LOAD2(offset)  vload_half2 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_LOAD3(offset)  vload_half3 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_LOAD4(offset)  vload_half4 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_LOAD8(offset)  vload_half8 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_LOAD16(offset) vload_half16(0, (const __global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_STORE(offset,value)   vstore_half  ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_STORE2(offset,value)  vstore_half2 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_STORE3(offset,value)  vstore_half3 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_STORE4(offset,value)  vstore_half4 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_STORE8(offset,value)  vstore_half8 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
        #define O_GLOBAL_STORE16(offset,value) vstore_half16( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
        #define O_TO_FLOATX(x) ((float)x)
        #define O0 2
        #define O1 4
        #define Om1 4
        #define Om2 2
        #define O_IDX(o0,o1) (((size_t)(o0))*4+((size_t)(o1)))
        #define O_IDX_MOD(o0,o1) (( (( (size_t)(o0) % 2 ) + 2) % 2 )*4+( (( (size_t)(o1) % 4 ) + 4) % 4 ))
        """
        return '\n'.join([ HKernel.define_tensor_type(name, dtype),
                           HKernel.define_tensor_shape(name, shape, axes_symbols) ])

    @staticmethod
    def define_axes_sizes(axis_letter, axes_sizes):
        """
        Returns definitions of axes sizes

        example for 'O', (4,512,512)

        #define O0 4
        #define O1 512
        #define O2 512
        """
        out = ""
        axes_sizes = tuple(axes_sizes)
        ndim = len(axes_sizes)
        for i in range(ndim):
            out += f'#define {axis_letter.upper()}{i} {axes_sizes[i]}\n'
        return out

    @staticmethod
    def decompose_idx_to_axes_idxs(var_name, tensor_name, ndim):
        """
        Decomposes a size_t variable into per-axis indices.
        Keeps the original variable untouched.

        example for 'gid','O',2

        size_t gid_original = gid;
        size_t o1 = gid % O1; gid /= O1;
        #define om1 o1
        size_t o0 = gid % O0;
        #define om2 o0
        gid = gid_original;
        """
        name_lower = tensor_name.lower()
        name_upper = tensor_name.upper()

        out = [f'size_t {var_name}_original = {var_name};']
        for i in range(ndim-1,-1,-1):
            line = f'size_t {name_lower}{i} = {var_name} % {name_upper}{i};'
            if i > 0:
                line += f' {var_name} /= {name_upper}{i};'
            out += [line]
            out += [f'#define {name_lower}m{ndim-i} {name_lower}{i}']
        out += [f'{var_name} = {var_name}_original;']

        return '\n'.join(out)
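
    # Illustrative (approximate) expansion of decompose_idx_to_axes_idxs('gid', 'O', 3);
    # the innermost axis is peeled off first and the omN aliases count from the end:
    #
    #   size_t gid_original = gid;
    #   size_t o2 = gid % O2; gid /= O2;
    #   #define om1 o2
    #   size_t o1 = gid % O1; gid /= O1;
    #   #define om2 o1
    #   size_t o0 = gid % O0;
    #   #define om3 o0
    #   gid = gid_original;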

    @staticmethod
    def axes_order_enum(tensor_name, axes_order):
        """
        returns axes enumeration in the given order

        example

        ('I', (1,2,0)) returns 'i1,i2,i0'
        ('I', 'HW')    returns 'ih,iw'
        """
        if isinstance(axes_order, str):
            axes_order = axes_order.lower()
        else:
            axes_order = tuple(axes_order)

        name_lower = tensor_name.lower()
        return ','.join( [ f'{name_lower}{axes_order[axis]}' for axis in range(len(axes_order)) ])

    @staticmethod
    def axes_seq_enum(tensor_name, ndim, new_axis=None, zero_axes=None, suffix=None):
        """
        returns sequential axes enumeration for the given ndim

        example

        ('I', 4)                        returns 'i0,i1,i2,i3'
        ('I', 4, new_axis=('name',1) )  returns 'i0,name,i1,i2,i3'
        ('I', 3, zero_axes=(1,) )       returns 'i0,0,i2'
        ('I', 2, suffix='ih,iw' )       returns 'i0,i1,ih,iw'
        """
        name_lower = tensor_name.lower()

        if zero_axes is not None:
            axes = [ '0' if axis in zero_axes else f'{name_lower}{axis}' for axis in range(ndim) ]
        else:
            axes = [ f'{name_lower}{axis}' for axis in range(ndim) ]

        if suffix is None:
            suffix = []
        else:
            suffix = [suffix]

        if new_axis is not None:
            name, axis = new_axis
            return ','.join(axes[:axis] + [name] + axes[axis:] + suffix)
        else:
            return ','.join(axes + suffix)

    @staticmethod
    def include_constants_pi():
        """
        defines PI constants

        PI_F
        PI_2_F
        PI_4_F
        """
        return f"""
#define PI_F   3.14159274101257f
#define PI_2_F 1.57079637050629f
#define PI_4_F 0.78539818525314f
"""

    @staticmethod
    def include_hash():
        """
        returns hash functions:

        uint   hash_uint_from_uint(uint x)
        ulong  hash_ulong_from_ulong(ulong x)
        float  hash_float_from_uint(uint x)
        double hash_double_from_ulong(ulong x)
        """
        return f"""
#define UIF (1.0 / (float)(0xffffffffU))

//from Chris Wellons https://nullprogram.com/blog/2018/07/31/ https://www.shadertoy.com/view/WttXWX
uint hash_uint_from_uint(uint x)
{{
    x ^= x >> 17;
    x *= 0xed5ad4bbU;
    x ^= x >> 11;
    x *= 0xac4c1b51U;
    x ^= x >> 15;
    x *= 0x31848babU;
    x ^= x >> 14;
    return x;
}}

ulong hash_ulong_from_ulong(ulong x)
{{
    x ^= x >> 32;
    x *= 0xd6e8feb86659fd93U;
    x ^= x >> 32;
    x *= 0xd6e8feb86659fd93U;
    x ^= x >> 32;
    return x;
}}

float hash_float_from_uint(uint x)
{{
    return hash_uint_from_uint(x) / (float)(0xffffffffU);
}}

double hash_double_from_ulong(ulong x)
{{
    return (double)hash_ulong_from_ulong(x) / (double)(0xffffffffffffffffU);
}}

/*****************************
UNUSED CODE

//---------- PCG hashes from https://www.shadertoy.com/view/XlGcRh
uint hash_uint_uint(uint v)
{{
    uint state = v * 747796405u + 2891336453u;
    uint word = ((state >> ((state >> 28u) + 4u)) ^ state) * 277803737u;
    return (word >> 22u) ^ word;
}}

uint2 hash_uint2_uint2 (uint2 v)
{{
    v = v * 1664525u + 1013904223u;
    v.x += v.y * 1664525u;
    v.y += v.x * 1664525u;
    v ^= v>>16u;
    v.x += v.y * 1664525u;
    v.y += v.x * 1664525u;
    v ^= v>>16u;
    return v;
}}

uint3 hash_uint3_uint3(uint3 v)
{{
    v = v * 1664525u + 1013904223u;
    v.x += v.y*v.z;
    v.y += v.z*v.x;
    v.z += v.x*v.y;
    v ^= v >> 16u;
    v.x += v.y*v.z;
    v.y += v.z*v.x;
    v.z += v.x*v.y;
    return v;
}}

float hash_float_uint(uint v)
{{
    return (float)( hash_uint_uint(v) ) * UIF;
}}

float2 hash_float2_uint (uint v)
{{
    uint2 q = hash_uint2_uint2( (uint2)(v, 1) );
    return (float2)(q.x, q.y) * UIF;
}}

float3 hash_float3_uint (uint v)
{{
    uint3 q = hash_uint3_uint3( (uint3)(v, 1, 1) );
    return (float3)(q.x, q.y, q.z) * UIF;
}}

//---------- Classic hashes used in shaders
float hash_float_float(float p)
{{
    float x = sin(p*12.9898)*43758.5453;
    return x - floor(x);
}}

float hash_float_float2(float2 p)
{{
    float x = sin( dot(p, (float2)(12.9898, 78.233)) )*43758.5453;
    return x - floor(x);
}}

****************************/
"""

__all__ = ['HKernel']
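

if __name__ == '__main__':
    # Minimal usage sketch. The tensor names ('O', 'I'), shapes and dtypes below are
    # hypothetical examples chosen for illustration; they are not defined anywhere
    # else in this module. Prints the OpenCL macro preamble that a kernel source
    # string would typically be prefixed with.
    preamble = '\n'.join([ HKernel.define_tensor('O', (2, 3), np.float16),
                           HKernel.define_tensor('I', (2, 3), np.float32),
                           HKernel.define_ndim_idx(2) ])
    print(preamble)

    # Decomposition of a hypothetical global id 'gid' into per-axis indices of 'O'.
    print(HKernel.decompose_idx_to_axes_idxs('gid', 'O', 2))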