/**
 * @license
 * Copyright 2017 Google LLC. All Rights Reserved.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * =============================================================================
 */
// Import webgl flags.
import './flags_webgl';
import * as tf from '@tensorflow/tfjs-core';
import { div, engine, env, max, range, reshape, scalar, softmax, tensor, tidy, transpose } from '@tensorflow/tfjs-core';
import { backend_util, buffer, kernel_impls, slice_util, util } from '@tensorflow/tfjs-core';
import { DataStorage, KernelBackend, upcastType } from '@tensorflow/tfjs-core';
import { ceilImplCPU, expImplCPU, expm1ImplCPU, floorImplCPU, logImplCPU, rsqrtImplCPU, simpleAbsImplCPU, sliceImplCPU } from './kernel_utils/shared';
const { segment_util } = backend_util;
const split = kernel_impls.split;
const tile = kernel_impls.tile;
const topkImpl = kernel_impls.topkImpl;
const whereImpl = kernel_impls.whereImpl;
import { AddNProgram } from './addn_gpu';
import { AddNPackedProgram } from './addn_packed_gpu';
import { ArgMinMaxProgram } from './argminmax_gpu';
import { ArgMinMaxPackedProgram } from './argminmax_packed_gpu';
import { AvgPool3DBackpropProgram } from './avg_pool_backprop_gpu';
import * as binaryop_gpu from './binaryop_gpu';
import { BinaryOpProgram } from './binaryop_gpu';
import * as binaryop_packed_gpu from './binaryop_packed_gpu';
import { BinaryOpPackedProgram } from './binaryop_packed_gpu';
import { getWebGLContext } from './canvas_util';
import { ClipProgram } from './clip_gpu';
import { ClipPackedProgram } from './clip_packed_gpu';
import { ComplexAbsProgram } from './complex_abs_gpu';
import { Conv2DDerFilterProgram, Conv2DDerInputProgram, Conv3DDerFilterProgram, Conv3DDerInputProgram } from './conv_backprop_gpu';
import { DepthwiseConv2DDerFilterProgram, DepthwiseConv2DDerInputProgram } from './conv_backprop_gpu_depthwise';
import { Conv2DProgram, Conv3DProgram } from './conv_gpu';
import { DepthwiseConv2DProgram } from './conv_gpu_depthwise';
import { DepthwiseConvPacked2DProgram } from './conv_packed_gpu_depthwise';
import { CropAndResizeProgram } from './crop_and_resize_gpu';
import { CumSumProgram } from './cumsum_gpu';
import { DecodeMatrixProgram } from './decode_matrix_gpu';
import { DecodeMatrixPackedProgram } from './decode_matrix_packed_gpu';
import { DepthToSpaceProgram } from './depth_to_space_gpu';
import { DiagProgram } from './diag_gpu';
import { EncodeFloatProgram } from './encode_float_gpu';
import { EncodeFloatPackedProgram } from './encode_float_packed_gpu';
import { EncodeMatrixProgram } from './encode_matrix_gpu';
import { EncodeMatrixPackedProgram } from './encode_matrix_packed_gpu';
import { FillProgram } from './fill_gpu';
import { GatherProgram } from './gather_gpu';
import { GatherNDProgram } from './gather_nd_gpu';
import { GPGPUContext } from './gpgpu_context';
import * as gpgpu_math from './gpgpu_math';
import { Im2ColPackedProgram } from './im2col_packed_gpu';
import { LRNProgram } from './lrn_gpu';
import { LRNGradProgram } from './lrn_grad_gpu';
import { LRNPackedProgram } from './lrn_packed_gpu';
import { MaxPool3DBackpropProgram } from './max_pool_backprop_gpu';
import { MatMulPackedProgram } from './mulmat_packed_gpu';
import { MultinomialProgram } from './multinomial_gpu';
import { OneHotProgram } from './onehot_gpu';
import { PackProgram } from './pack_gpu';
import { PadProgram } from './pad_gpu';
import { PadPackedProgram } from './pad_packed_gpu';
import { Pool3DProgram } from './pool_gpu';
import { ReduceProgram } from './reduce_gpu';
import { ReshapePackedProgram } from './reshape_packed_gpu';
import { ResizeBilinearBackpropProgram } from './resize_bilinear_backprop_gpu';
import { ResizeBilinearProgram } from './resize_bilinear_gpu';
import { ResizeBilinearPackedProgram } from './resize_bilinear_packed_gpu';
import { ResizeNearestNeigborBackpropProgram } from './resize_nearest_neighbor_backprop_gpu';
import { ResizeNearestNeighborProgram } from './resize_nearest_neighbor_gpu';
import { ReverseProgram } from './reverse_gpu';
import { ReversePackedProgram } from './reverse_packed_gpu';
import { ScatterProgram } from './scatter_gpu';
import { SegmentOpProgram } from './segment_gpu';
import { SelectProgram } from './select_gpu';
import { SliceProgram } from './slice_gpu';
import { SlicePackedProgram } from './slice_packed_gpu';
import { StridedSliceProgram } from './strided_slice_gpu';
import * as tex_util from './tex_util';
import { TextureUsage } from './tex_util';
import { TextureManager } from './texture_manager';
import { TileProgram } from './tile_gpu';
import * as unary_op from './unaryop_gpu';
import { UnaryOpProgram } from './unaryop_gpu';
import * as unary_packed_op from './unaryop_packed_gpu';
import { UnaryOpPackedProgram } from './unaryop_packed_gpu';
import { UnpackProgram } from './unpack_gpu';
import * as webgl_util from './webgl_util';
export const EPSILON_FLOAT32 = 1e-7;
export const EPSILON_FLOAT16 = 1e-4;
const binaryCaches = {};
export function getBinaryCache(webGLVersion) {
    if (webGLVersion in binaryCaches) {
        return binaryCaches[webGLVersion];
    }
    binaryCaches[webGLVersion] = {};
    return binaryCaches[webGLVersion];
}
function mapActivationToShaderProgram(activation, packed = false) {
    if (activation === 'linear') {
        if (packed) {
            return unary_packed_op.LINEAR;
        }
        return unary_op.LINEAR;
    }
    else if (activation === 'relu') {
        if (packed) {
            return unary_packed_op.RELU;
        }
        return unary_op.RELU;
    }
    else if (activation === 'elu') {
        if (packed) {
            return unary_packed_op.ELU;
        }
        return unary_op.ELU;
    }
    else if (activation === 'relu6') {
        if (packed) {
            return unary_packed_op.RELU6;
        }
        return unary_op.RELU6;
    }
    else if (activation === 'prelu') {
        if (packed) {
            return binaryop_packed_gpu.PRELU;
        }
        return binaryop_gpu.PRELU;
    }
    throw new Error(`Activation ${activation} has not been implemented for the WebGL backend.`);
}
// Empirically determined constant used to determine size threshold for handing
// off execution to the CPU.
const CPU_HANDOFF_SIZE_THRESHOLD = 128;
// Empirically determined constant used to decide the number of MB on GPU
// before we warn about high memory use. The MB are this constant * screen area
// * dpi / 1024 / 1024.
const BEFORE_PAGING_CONSTANT = 600;
function numMBBeforeWarning() {
    if (env().global.screen == null) {
        return 1024; // 1 GB.
    }
    return (env().global.screen.height * env().global.screen.width *
        window.devicePixelRatio) *
        BEFORE_PAGING_CONSTANT / 1024 / 1024;
}
// Empirically determined minimal shared dimension in matmul before we forward
// to a.mul(b).sum() in order to take advantage of GPU parallelism. See
// https://github.com/tensorflow/tfjs-core/pull/1379 for benchmarks.
export const MATMUL_SHARED_DIM_THRESHOLD = 1000;
export class MathBackendWebGL extends KernelBackend {
    constructor(gpgpu) {
        super();
        // Maps data ids that have a pending read operation, to list of subscribers.
        this.pendingRead = new WeakMap();
        // List of data ids that are scheduled for disposal, but are waiting on a
        // pending read operation.
        this.pendingDisposal = new WeakSet();
        // Used to count the number of 'shallow' sliced tensors that point to the
        // same data id.
        this.dataRefCount = new WeakMap();
        this.numBytesInGPU = 0;
        // Accumulated time spent (including blocking) in uploading data to webgl.
        this.uploadWaitMs = 0;
        // Accumulated time spent (including blocking in downloading data from webgl.
        this.downloadWaitMs = 0;
        this.warnedAboutMemory = false;
        this.warnedAboutCPUBackend = false;
        this.pendingDeletes = 0;
        this.disposed = false;
        if (!env().getBool('HAS_WEBGL')) {
            throw new Error('WebGL is not supported on this device');
        }
        if (gpgpu == null) {
            const gl = getWebGLContext(env().getNumber('WEBGL_VERSION'));
            this.binaryCache = getBinaryCache(env().getNumber('WEBGL_VERSION'));
            this.gpgpu = new GPGPUContext(gl);
            this.canvas = gl.canvas;
            this.gpgpuCreatedLocally = true;
        }
        else {
            this.gpgpu = gpgpu;
            this.binaryCache = {};
            this.gpgpuCreatedLocally = false;
            this.canvas = gpgpu.gl.canvas;
        }
        this.textureManager = new TextureManager(this.gpgpu);
        this.numMBBeforeWarning = numMBBeforeWarning();
        this.texData = new DataStorage(this, engine());
    }
    numDataIds() {
        return this.texData.numDataIds() +
            (this.cpuBackend ? this.cpuBackend.numDataIds() : 0) -
            this.pendingDeletes;
    }
    write(values, shape, dtype) {
        if (env().getBool('WEBGL_CHECK_NUMERICAL_PROBLEMS') ||
            env().getBool('DEBUG')) {
            this.checkNumericalProblems(values);
        }
        if (dtype === 'complex64' && values != null) {
            throw new Error(`Cannot write to a complex64 dtype. ` +
                `Please use tf.complex(real, imag).`);
        }
        const dataId = {};
        this.texData.set(dataId, {
            shape,
            dtype,
            values,
            usage: TextureUsage.UPLOAD,
            refCount: 1,
            complexParentRefCount: 0
        });
        return dataId;
    }
    /** Increase refCount of a `TextureData`. */
    incRef(dataId) {
        const texData = this.texData.get(dataId);
        texData.refCount++;
    }
    /** Decrease refCount of a `TextureData`. */
    decRef(dataId) {
        if (this.texData.has(dataId)) {
            const texData = this.texData.get(dataId);
            texData.refCount--;
        }
    }
    move(dataId, values, shape, dtype) {
        if (env().getBool('DEBUG')) {
            this.checkNumericalProblems(values);
        }
        if (dtype === 'complex64') {
            throw new Error(`Cannot write to a complex64 dtype. ` +
                `Please use tf.complex(real, imag).`);
        }
        this.texData.set(dataId, {
            shape,
            dtype,
            values,
            usage: TextureUsage.UPLOAD,
            refCount: 1,
            complexParentRefCount: 0
        });
    }
    disposeIntermediateTensorInfo(tensorInfo) {
        const dataId = tensorInfo.dataId;
        if (this.texData.has(dataId)) {
            const textureData = this.texData.get(dataId);
            textureData.refCount--;
            if (textureData.refCount < 1) {
                this.disposeData(dataId);
            }
        }
    }
    readSync(dataId) {
        const texData = this.texData.get(dataId);
        const { values, dtype, complexTensorInfos, slice, shape, isPacked } = texData;
        // The presence of `slice` indicates this tensor is a shallow slice of a
        // different tensor, and is using that original tensor's texture. Run
        // `clone` in order to copy that texture and read from it.
        if (slice != null) {
            let program;
            if (isPacked) {
                program = new UnaryOpPackedProgram(shape, unary_op.CLONE);
            }
            else {
                program = new UnaryOpProgram(shape, unary_op.CLONE);
            }
            const res = this.runWebGLProgram(program, [{ dataId, shape, dtype }], dtype);
            const data = this.readSync(res.dataId);
            this.disposeIntermediateTensorInfo(res);
            return data;
        }
        if (values != null) {
            return this.convertAndCacheOnCPU(dataId);
        }
        if (dtype === 'string') {
            return values;
        }
        const shouldTimeProgram = this.activeTimers != null;
        let start;
        if (shouldTimeProgram) {
            start = util.now();
        }
        let result;
        if (dtype === 'complex64') {
            const realValues = this.readSync(complexTensorInfos.real.dataId);
            const imagValues = this.readSync(complexTensorInfos.imag.dataId);
            result = backend_util.mergeRealAndImagArrays(realValues, imagValues);
        }
        else {
            result = this.getValuesFromTexture(dataId);
        }
        if (shouldTimeProgram) {
            this.downloadWaitMs += util.now() - start;
        }
        return this.convertAndCacheOnCPU(dataId, result);
    }
    async read(dataId) {
        if (this.pendingRead.has(dataId)) {
            const subscribers = this.pendingRead.get(dataId);
            return new Promise(resolve => subscribers.push(resolve));
        }
        const texData = this.texData.get(dataId);
        const { values, shape, slice, dtype, complexTensorInfos, isPacked } = texData;
        // The presence of `slice` indicates this tensor is a shallow slice of a
        // different tensor, and is using that original tensor's texture. Run
        // `clone` in order to copy that texture and read from it.
        if (slice != null) {
            let program;
            if (isPacked) {
                program = new UnaryOpPackedProgram(shape, unary_op.CLONE);
            }
            else {
                program = new UnaryOpProgram(shape, unary_op.CLONE);
            }
            const res = this.runWebGLProgram(program, [{ dataId, shape, dtype }], dtype);
            const data = this.read(res.dataId);
            this.disposeIntermediateTensorInfo(res);
            return data;
        }
        if (values != null) {
            return this.convertAndCacheOnCPU(dataId);
        }
        if (!env().getBool('WEBGL_DOWNLOAD_FLOAT_ENABLED') &&
            env().getNumber('WEBGL_VERSION') === 2) {
            throw new Error(`tensor.data() with WEBGL_DOWNLOAD_FLOAT_ENABLED=false and ` +
                `WEBGL_VERSION=2 not yet supported.`);
        }
        let buffer = null;
        let tmpDownloadTarget;
        if (dtype !== 'complex64' && env().get('WEBGL_BUFFER_SUPPORTED')) {
            // Possibly copy the texture into a buffer before inserting a fence.
            tmpDownloadTarget = this.decode(dataId);
            const tmpData = this.texData.get(tmpDownloadTarget.dataId);
            buffer = this.gpgpu.createBufferFromTexture(tmpData.texture, ...tex_util.getDenseTexShape(shape));
        }
        this.pendingRead.set(dataId, []);
        if (dtype !== 'complex64') {
            // Create a fence and wait for it to resolve.
            await this.gpgpu.createAndWaitForFence();
        }
        // Download the values from the GPU.
        let vals;
        if (dtype === 'complex64') {
            const ps = await Promise.all([
                this.read(complexTensorInfos.real.dataId),
                this.read(complexTensorInfos.imag.dataId)
            ]);
            const realValues = ps[0];
            const imagValues = ps[1];
            vals = backend_util.mergeRealAndImagArrays(realValues, imagValues);
        }
        else if (buffer == null) {
            vals = this.getValuesFromTexture(dataId);
        }
        else {
            const size = util.sizeFromShape(shape);
            vals = this.gpgpu.downloadFloat32MatrixFromBuffer(buffer, size);
        }
        if (tmpDownloadTarget != null) {
            this.disposeIntermediateTensorInfo(tmpDownloadTarget);
        }
        const dTypeVals = this.convertAndCacheOnCPU(dataId, vals);
        const subscribers = this.pendingRead.get(dataId);
        this.pendingRead.delete(dataId);
        // Notify all pending reads.
        subscribers.forEach(resolve => resolve(dTypeVals));
        if (this.pendingDisposal.has(dataId)) {
            this.pendingDisposal.delete(dataId);
            this.disposeData(dataId);
            this.pendingDeletes--;
        }
        return dTypeVals;
    }
    checkNumericalProblems(values) {
        if (values == null) {
            return;
        }
        for (let i = 0; i < values.length; i++) {
            const num = values[i];
            if (!webgl_util.canBeRepresented(num)) {
                if (env().getBool('WEBGL_RENDER_FLOAT32_CAPABLE')) {
                    throw Error(`The value ${num} cannot be represented with your ` +
                        `current settings. Consider enabling float32 rendering: ` +
                        `'tf.env().set('WEBGL_RENDER_FLOAT32_ENABLED', true);'`);
                }
                throw Error(`The value ${num} cannot be represented on this device.`);
            }
        }
    }
    getValuesFromTexture(dataId) {
        const { shape, dtype, isPacked } = this.texData.get(dataId);
        const size = util.sizeFromShape(shape);
        if (env().getBool('WEBGL_DOWNLOAD_FLOAT_ENABLED')) {
            const tmpTarget = this.decode(dataId);
            const tmpData = this.texData.get(tmpTarget.dataId);
            const vals = this.gpgpu
                .downloadMatrixFromPackedTexture(tmpData.texture, ...tex_util.getDenseTexShape(shape))
                .subarray(0, size);
            this.disposeIntermediateTensorInfo(tmpTarget);
            return vals;
        }
        const shouldUsePackedProgram = env().getBool('WEBGL_PACK') && isPacked === true;
        const outputShape = shouldUsePackedProgram ? webgl_util.getShapeAs3D(shape) : shape;
        const program = shouldUsePackedProgram ?
            new EncodeFloatPackedProgram(outputShape) :
            new EncodeFloatProgram(outputShape);
        const output = this.runWebGLProgram(program, [{ shape: outputShape, dtype, dataId }], 'float32');
        const tmpData = this.texData.get(output.dataId);
        const vals = this.gpgpu
            .downloadByteEncodedFloatMatrixFromOutputTexture(tmpData.texture, tmpData.texShape[0], tmpData.texShape[1])
            .subarray(0, size);
        this.disposeIntermediateTensorInfo(output);
        return vals;
    }
    async time(f) {
        const oldActiveTimers = this.activeTimers;
        const newActiveTimers = [];
        let outerMostTime = false;
        if (this.programTimersStack == null) {
            this.programTimersStack = newActiveTimers;
            outerMostTime = true;
        }
        else {
            this.activeTimers.push(newActiveTimers);
        }
        this.activeTimers = newActiveTimers;
        f();
        // needing to split these up because util.flatten only accepts certain types
        const flattenedActiveTimerQueries = util.flatten(this.activeTimers.map((d) => d.query))
            .filter(d => d != null);
        const flattenedActiveTimerNames = util.flatten(this.activeTimers.map((d) => d.name))
            .filter(d => d != null);
        this.activeTimers = oldActiveTimers;
        if (outerMostTime) {
            this.programTimersStack = null;
        }
        const res = {
            uploadWaitMs: this.uploadWaitMs,
            downloadWaitMs: this.downloadWaitMs,
            kernelMs: null,
            wallMs: null // will be filled by the engine
        };
        if (env().getNumber('WEBGL_DISJOINT_QUERY_TIMER_EXTENSION_RELIABLE') > 0) {
            const kernelMs = await Promise.all(flattenedActiveTimerQueries);
            res['kernelMs'] = util.sum(kernelMs);
            res['getExtraProfileInfo'] = () => kernelMs.map((d, i) => ({ name: flattenedActiveTimerNames[i], ms: d }))
                .map(d => `${d.name}: ${d.ms}`)
                .join(', ');
        }
        else {
            res['kernelMs'] = {
                error: 'WebGL query timers are not supported in this environment.'
            };
        }
        this.uploadWaitMs = 0;
        this.downloadWaitMs = 0;
        return res;
    }
    memory() {
        return {
            unreliable: false,
            numBytesInGPU: this.numBytesInGPU,
            numBytesInGPUAllocated: this.textureManager.numBytesAllocated,
            numBytesInGPUFree: this.textureManager.numBytesFree
        };
    }
    startTimer() {
        if (env().getNumber('WEBGL_DISJOINT_QUERY_TIMER_EXTENSION_RELIABLE') > 0) {
            return this.gpgpu.beginQuery();
        }
        return { startMs: util.now(), endMs: null };
    }
    endTimer(query) {
        if (env().getNumber('WEBGL_DISJOINT_QUERY_TIMER_EXTENSION_RELIABLE') > 0) {
            this.gpgpu.endQuery();
            return query;
        }
        query.endMs = util.now();
        return query;
    }
    async getQueryTime(query) {
        if (env().getNumber('WEBGL_DISJOINT_QUERY_TIMER_EXTENSION_RELIABLE') > 0) {
            return this.gpgpu.waitForQueryAndGetTime(query);
        }
        const timerQuery = query;
        return timerQuery.endMs - timerQuery.startMs;
    }
    disposeData(dataId) {
        if (this.pendingDisposal.has(dataId)) {
            return;
        }
        if (this.pendingRead.has(dataId)) {
            this.pendingDisposal.add(dataId);
            this.pendingDeletes++;
            return;
        }
        // No-op if already disposed.
        if (!this.texData.has(dataId)) {
            return;
        }
        // Trying to dispose a textureData that has a 'kept' refCount, e.g. trying
        // to dispose a tensor whose data bucket is shared with a complex tensor. In
        // this case we are removing a reference to the textureData, but we
        // shouldn't actually dispose the texture.
        if (this.texData.get(dataId).complexParentRefCount > 0) {
            this.texData.get(dataId).refCount--;
            return;
        }
        this.releaseGPUData(dataId);
        const { complexTensorInfos } = this.texData.get(dataId);
        if (complexTensorInfos != null) {
            this.texData.get(complexTensorInfos.real.dataId).complexParentRefCount--;
            this.disposeIntermediateTensorInfo(complexTensorInfos.real);
            this.texData.get(complexTensorInfos.imag.dataId).complexParentRefCount--;
            this.disposeIntermediateTensorInfo(complexTensorInfos.imag);
        }
        this.texData.delete(dataId);
    }
    releaseGPUData(dataId) {
        const { texture, dtype, texShape, usage, isPacked, slice } = this.texData.get(dataId);
        const key = slice && slice.origDataId || dataId;
        const refCount = this.dataRefCount.get(key);
        if (refCount > 1) {
            this.dataRefCount.set(key, refCount - 1);
        }
        else {
            this.dataRefCount.delete(key);
            if (texture != null) {
                this.numBytesInGPU -= this.computeBytes(texShape, dtype);
                this.textureManager.releaseTexture(texture, texShape, usage, isPacked);
            }
        }
        const texData = this.texData.get(dataId);
        texData.texture = null;
        texData.texShape = null;
        texData.isPacked = false;
        texData.slice = null;
    }
    getTexture(dataId) {
        this.uploadToGPU(dataId);
        return this.texData.get(dataId).texture;
    }
    /**
     * Returns internal information for the specific data bucket. Used in unit
     * tests.
     */
    getDataInfo(dataId) {
        return this.texData.get(dataId);
    }
    getCPUBackend() {
        if (!env().getBool('WEBGL_CPU_FORWARD')) {
            return null;
        }
        if (this.cpuBackend == null) {
            this.cpuBackend = engine().findBackend('cpu');
        }
        return this.cpuBackend;
    }
    /*
    Tests whether all the inputs to an op are small and on the CPU. This heuristic
    determines when it would be faster to execute a kernel on the CPU. WebGL
    kernels opt into running this check and forwarding when appropriate.
    TODO(https://github.com/tensorflow/tfjs/issues/872): Develop a more
    sustainable strategy for optimizing backend execution of ops.
     */
    shouldExecuteOnCPU(inputs, sizeThreshold = CPU_HANDOFF_SIZE_THRESHOLD) {
        const cpuBackend = this.getCPUBackend();
        if (!this.warnedAboutCPUBackend && cpuBackend == null) {
            console.warn('Your application contains ops that are small enough to be ' +
                'executed on the CPU backend, however the CPU backend cannot ' +
                'be found. Consider importing the CPU backend ' +
                '(@tensorflow/tfjs-backend-cpu) for better performance.');
            this.warnedAboutCPUBackend = true;
        }
        return cpuBackend != null &&
            inputs.every(input => this.texData.get(input.dataId).texture == null &&
                util.sizeFromShape(input.shape) < sizeThreshold);
    }
    getGPGPUContext() {
        return this.gpgpu;
    }
    slice(x, begin, size) {
        if (this.shouldExecuteOnCPU([x])) {
            const outValues = sliceImplCPU(this.texData.get(x.dataId).values, begin, size, x.shape, x.dtype);
            return this.makeOutput(size, x.dtype, outValues);
        }
        // Short-circuit computation if the slice is zero-sized.
        if (util.sizeFromShape(size) === 0) {
            return tensor([], size, x.dtype);
        }
        const { isPacked } = this.texData.get(x.dataId);
        const isContinous = slice_util.isSliceContinous(x.shape, begin, size);
        if (isPacked || !isContinous) {
            const program = env().getBool('WEBGL_PACK_ARRAY_OPERATIONS') ?
                new SlicePackedProgram(size) :
                new SliceProgram(size);
            const customSetup = program.getCustomSetupFunc(begin);
            return this.compileAndRun(program, [x], null, customSetup);
        }
        this.uploadToGPU(x.dataId);
        return this.shallowSlice(x, begin, size);
    }
    shallowSlice(x, begin, size) {
        const xTexData = this.texData.get(x.dataId);
        const t = this.makeOutput(size, x.dtype);
        const newTexData = this.texData.get(t.dataId);
        // Copy texture data from the original tensor.
        Object.assign(newTexData, xTexData);
        newTexData.shape = size;
        newTexData.dtype = x.dtype;
        let flatOffset = slice_util.computeFlatOffset(begin, x.strides);
        if (xTexData.slice) {
            // We are slicing an already sliced tensor, so we have to accumulate
            // the offset.
            flatOffset += xTexData.slice.flatOffset;
        }
        newTexData.slice = {
            flatOffset,
            // Point to the original dataId, which is used to do ref counting.
            origDataId: xTexData.slice && xTexData.slice.origDataId || x.dataId
        };
        // Increase the ref count for that data bucket.
        const refCount = this.dataRefCount.get(newTexData.slice.origDataId) || 1;
        this.dataRefCount.set(newTexData.slice.origDataId, refCount + 1);
        return t;
    }
    stridedSlice(x, begin, end, strides) {
        const cpuRes = this.tryRunOnCpuOrThrow([x], () => this.cpuBackend.stridedSlice(x, begin, end, strides));
        if (cpuRes) {
            return cpuRes;
        }
        const outShape = slice_util.computeOutShape(begin, end, strides);
        if (outShape.some(axis => axis === 0)) {
            return tensor([], outShape);
        }
        const program = new StridedSliceProgram(begin, strides, outShape);
        return this.compileAndRun(program, [x]);
    }
    reverse(x, axis) {
        const program = env().getBool('WEBGL_PACK_ARRAY_OPERATIONS') ?
            new ReversePackedProgram(x.shape, axis) :
            new ReverseProgram(x.shape, axis);
        return this.compileAndRun(program, [x]);
    }
    neg(x) {
        const cpuRes = this.tryRunOnCpuOrThrow([x], () => this.cpuBackend.neg(x));
        if (cpuRes) {
            return cpuRes;
        }
        if (env().getBool('WEBGL_PACK_UNARY_OPERATIONS')) {
            return this.packedUnaryOp(x, unary_op.NEG, x.dtype);
        }
        const program = new UnaryOpProgram(x.shape, unary_op.NEG);
        return this.compileAndRun(program, [x]);
    }
    batchMatMul(a, b, transposeA, transposeB) {
        const outerShapeA = transposeA ? a.shape[2] : a.shape[1];
        const outerShapeB = transposeB ? b.shape[1] : b.shape[2];
        const sharedDim = transposeA ? a.shape[1] : a.shape[2];
        const batch = Math.max(a.shape[0], b.shape[0]);
        // Since the matrices are vectors, it is faster to call mul().sum()
        // because sum() is O(sqrt(N)) due to divide-and-conquer.
        if ((outerShapeA === 1 || outerShapeB === 1) &&
            sharedDim > MATMUL_SHARED_DIM_THRESHOLD) {
            if (transposeA) {
                a = transpose(a, [0, 2, 1]);
            }
            if (transposeB) {
                b = transpose(b, [0, 2, 1]);
            }
            const a3D = outerShapeB === 1 ? a : a.as3D(batch, sharedDim, 1);
            const axis = outerShapeB === 1 ? 2 : 1;
            const b3D = outerShapeB === 1 ? b.as3D(batch, 1, sharedDim) : b;
            // TODO(annxingyuan): Call multiply directly as part of batchMatMul
            // modularization.
            const product = tf.mul(a3D, b3D);
            return product.sum(axis, true /* keepDims */);
        }
        const dtype = upcastType(a.dtype, b.dtype);
        const program = new MatMulPackedProgram(a.shape, b.shape, [batch, outerShapeA, outerShapeB], transposeA, transposeB);
        return this.compileAndRun(program, [a, b], dtype);
    }
    fusedBatchMatMul({ a, b, transposeA, transposeB, bias, activation, preluActivationWeights }) {
        const outerShapeA = transposeA ? a.shape[2] : a.shape[1];
        const outerShapeB = transposeB ? b.shape[1] : b.shape[2];
        const batch = Math.max(a.shape[0], b.shape[0]);
        const dtype = upcastType(a.dtype, b.dtype);
        const hasBias = bias != null;
        const hasPreluActivationWeights = preluActivationWeights != null;
        const fusedActivation = activation ? mapActivationToShaderProgram(activation, true) : null;
        const program = new MatMulPackedProgram(a.shape, b.shape, [batch, outerShapeA, outerShapeB], transposeA, transposeB, hasBias, fusedActivation, hasPreluActivationWeights);
        const inputs = [a, b];
        if (bias) {
            inputs.push(bias);
        }
        if (preluActivationWeights) {
            inputs.push(preluActivationWeights);
        }
        return this.compileAndRun(program, inputs, dtype);
    }
    localResponseNormalization4D(x, radius, bias, alpha, beta) {
        const program = env().getBool('WEBGL_PACK_NORMALIZATION') ?
            new LRNPackedProgram(x.shape, radius, bias, alpha, beta) :
            new LRNProgram(x.shape, radius, bias, alpha, beta);
        return this.compileAndRun(program, [x]);
    }
    LRNGrad(dy, inputImage, outputImage, depthRadius, bias, alpha, beta) {
        const program = new LRNGradProgram(inputImage.shape, depthRadius, bias, alpha, beta);
        return this.compileAndRun(program, [inputImage, outputImage, dy]);
    }
    tile(x, reps) {
        if (x.dtype === 'string') {
            const data = this.readSync(x.dataId);
            const decodedData = data.map(d => util.decodeString(d));
            const buf = buffer(x.shape, x.dtype, decodedData);
            return tile(buf, reps);
        }
        const program = new TileProgram(x.shape, reps);
        return this.compileAndRun(program, [x]);
    }
    pad(x, paddings, constantValue) {
        const program = env().getBool('WEBGL_PACK_ARRAY_OPERATIONS') ?
            new PadPackedProgram(x.shape, paddings, constantValue) :
            new PadProgram(x.shape, paddings, constantValue);
        return this.compileAndRun(program, [x]);
    }
    gather(x, indices, axis) {
        const cpuRes = this.tryRunOnCpuOrThrow([x, indices], () => this.cpuBackend.gather(x, indices, axis));
        if (cpuRes) {
            return cpuRes;
        }
        const program = new GatherProgram(x.shape, indices.size, axis);
        return this.compileAndRun(program, [x, indices]);
    }
    batchToSpaceND(x, blockShape, crops) {
        util.assert(x.rank <= 4, () => 'batchToSpaceND for rank > 4 with a WebGL backend not ' +
            'implemented yet');
        const prod = blockShape.reduce((a, b) => a * b);
        const reshaped = backend_util.getReshaped(x.shape, blockShape, prod);
        const permuted = backend_util.getPermuted(reshaped.length, blockShape.length);
        const reshapedPermuted = backend_util.getReshapedPermuted(x.shape, blockShape, prod);
        const sliceBeginCoords = backend_util.getSliceBeginCoords(crops, blockShape.length);
        const sliceSize = backend_util.getSliceSize(reshapedPermuted, crops, blockShape.length);
        return transpose(x.reshape(reshaped), permuted)
            .reshape(reshapedPermuted)
            .slice(sliceBeginCoords, sliceSize);
    }
    spaceToBatchND(x, blockShape, paddings) {
        util.assert(x.rank <= 4, () => 'spaceToBatchND for rank > 4 with a WebGL backend not ' +
            'implemented yet');
        const prod = blockShape.reduce((a, b) => a * b);
        const completePaddings = [[0, 0]];
        completePaddings.push(...paddings);
        for (let i = 1 + blockShape.length; i < x.shape.length; ++i) {
            completePaddings.push([0, 0]);
        }
        const paddedX = x.pad(completePaddings);
        const reshapedPaddedShape = backend_util.getReshaped(paddedX.shape, blockShape, prod, false);
        const permutedReshapedPaddedPermutation = backend_util.getPermuted(reshapedPaddedShape.length, blockShape.length, false);
        const flattenShape = backend_util.getReshapedPermuted(paddedX.shape, blockShape, prod, false);
        const paddedXT = transpose(paddedX.reshape(reshapedPaddedShape), permutedReshapedPaddedPermutation);
        return reshape(paddedXT, flattenShape);
    }
    reduce(x, reduceType, dtype) {
        const batchSize = x.shape[0];
        const inSize = x.shape[1];
        const windowSize = backend_util.computeOptimalWindowSize(inSize);
        const outSize = Math.ceil(inSize / windowSize);
        const reduceInfo = { windowSize, inSize, batchSize, outSize };
        const program = new ReduceProgram(reduceInfo, reduceType);
        const output = this.compileAndRun(program, [x], dtype);
        // No need to run another GPGPU program.
        if (output.shape[1] === 1) {
            return output;
        }
        return this.reduce(output, reduceType, dtype);
    }
    argReduce(x, reduceType, bestIndicesA = null) {
        let batchSize = x.shape[0];
        let inSize = x.shape[1];
        if (bestIndicesA != null) {
            batchSize = bestIndicesA.shape[0];
            inSize = bestIndicesA.shape[1];
        }
        const windowSize = backend_util.computeOptimalWindowSize(inSize);
        const reduceInfo = {
            windowSize,
            inSize,
            batchSize,
            outSize: Math.ceil(inSize / windowSize)
        };
        const program = new ArgMinMaxProgram(reduceInfo, reduceType, bestIndicesA == null);
        const inputs = [x];
        if (bestIndicesA != null) {
            inputs.push(bestIndicesA);
        }
        const output = this.compileAndRun(program, inputs, 'int32');
        // No need to run another GPGPU program.
        if (output.shape[1] === 1) {
            return output;
        }
        return this.argReduce(x, reduceType, output);
    }
    argReducePacked(x, reduceType, bestIndicesA = null) {
        const inShape = bestIndicesA != null ? bestIndicesA.shape : x.shape;
        const inSize = inShape[inShape.length - 1];
        const windowSize = backend_util.computeOptimalWindowSize(inSize);
        const program = new ArgMinMaxPackedProgram(inShape, windowSize, reduceType, bestIndicesA == null);
        const inputs = bestIndicesA == null ? [x] : [x, bestIndicesA];
        const output = this.compileAndRun(program, inputs, 'int32');
        if (output.rank === x.rank) {
            return this.argReducePacked(x, reduceType, output);
        }
        return output;
    }
    sum(x, axes) {
        backend_util.assertAxesAreInnerMostDims('sum', axes, x.rank);
        const [outShape, reduceShape] = backend_util.computeOutAndReduceShapes(x.shape, axes);
        const inSize = util.sizeFromShape(reduceShape);
        const a2D = x.as2D(-1, inSize);
        const outputDType = tf.sumOutType(x.dtype);
        return this.reduce(a2D, 'sum', outputDType).reshape(outShape);
    }
    prod(x, axes) {
        const cpuRes = this.tryRunOnCpuOrThrow([x], () => this.cpuBackend.prod(x, axes));
        if (cpuRes) {
            return cpuRes;
        }
        const [outShape, reduceShape] = backend_util.computeOutAndReduceShapes(x.shape, axes);
        const inSize = util.sizeFromShape(reduceShape);
        const a2D = x.as2D(-1, inSize);
        const outputDType = tf.sumOutType(x.dtype);
        return this.reduce(a2D, 'prod', outputDType).reshape(outShape);
    }
    unsortedSegmentSum(x, segmentIds, numSegments) {
        let axis = 0;
        const permutation = backend_util.getAxesPermutation([axis], x.rank);
        let permutedX = x;
        if (permutation != null) {
            permutedX = transpose(x, permutation);
            axis = backend_util.getInnerMostAxes(1, x.rank)[0];
        }
        const outShape = segment_util.computeOutShape(permutedX.shape, axis, numSegments);
        const inSize = util.sizeFromShape([permutedX.shape[axis]]);
        const a2D = permutedX.as2D(-1, inSize);
        const outputDType = tf.sumOutType(x.dtype);
        let result = this.segOpCompute(a2D, 'unsortedSegmentSum', segmentIds, outputDType, numSegments)
            .reshape(outShape);
        if (permutation != null) {
            result =
                transpose(result, backend_util.getUndoAxesPermutation(permutation));
        }
        return result;
    }
    segOpCompute(x, segOpType, segmentIds, dtype, numSegments) {
        const batchSize = x.shape[0];
        const inSize = x.shape[1];
        const windowSize = segment_util.segOpComputeOptimalWindowSize(inSize, numSegments);
        const segOpInfo = { windowSize, inSize, batchSize, numSegments };
        const program = new SegmentOpProgram(segOpInfo, segOpType);
        const output = this.compileAndRun(program, [x, segmentIds], dtype);
        // No need to run another GPGPU program.
        if (output.shape[1] === numSegments) {
            return output;
        }
        segmentIds = range(0, numSegments).tile([inSize / windowSize]);
        return this.segOpCompute(output, segOpType, segmentIds, dtype, numSegments);
    }
    argMinMaxReduce(x, axis, reduceType) {
        const axes = [axis];
        backend_util.assertAxesAreInnerMostDims('arg' + reduceType.charAt(0).toUpperCase() + reduceType.slice(1), axes, x.rank);
        if (!env().getBool('WEBGL_PACK_REDUCE') || x.rank <= 2) {
            const [outShape, reduceShape] = backend_util.computeOutAndReduceShapes(x.shape, axes);
            const inSize = util.sizeFromShape(reduceShape);
            const a2D = x.as2D(-1, inSize);
            return this.argReduce(a2D, reduceType).reshape(outShape);
        }
        return this.argReducePacked(x, reduceType);
    }
    argMin(x, axis) {
        return this.argMinMaxReduce(x, axis, 'min');
    }
    argMax(x, axis) {
        return this.argMinMaxReduce(x, axis, 'max');
    }
    cumsum(x, axis, exclusive, reverse) {
        if (axis !== x.rank - 1) {
            throw new Error(`WebGL cumsum shader expects an inner-most axis=${x.rank - 1} ` +
                `but got axis=${axis}`);
        }
        const size = x.shape[axis];
        let result = x;
        // Use cumsum parallel algorithm, ref:
        // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
        for (let i = 0; i <= Math.ceil(Math.log2(size)) - 1; i++) {
            const program = new CumSumProgram(x.shape, false, reverse);
            const customSetup = program.getCustomSetupFunc(i);
            const prevResult = result;
            result = this.compileAndRun(program, [result], result.dtype, customSetup);
            prevResult.dispose();
        }
        // For exclusive cumsum, shift the end result in the direction of sum and
        // add 0 to the front index.
        if (exclusive) {
            const program = new CumSumProgram(x.shape, exclusive, reverse);
            const prevResult = result;
            result = this.compileAndRun(program, [result]);
            prevResult.dispose();
        }
        return result;
    }
    equal(a, b) {
        if (env().getBool('WEBGL_PACK_BINARY_OPERATIONS')) {
            return this.packedBinaryOp(a, b, binaryop_packed_gpu.EQUAL, 'bool');
        }
        const program = new BinaryOpProgram(binaryop_gpu.EQUAL, a.shape, b.shape);
        return this.compileAndRun(program, [a, b], 'bool');
    }
    less(a, b) {
        const cpuRes = this.tryRunOnCpuOrThrow([a, b], () => this.cpuBackend.less(a, b));
        if (cpuRes) {
            return cpuRes;
        }
        if (env().getBool('WEBGL_PACK_BINARY_OPERATIONS')) {
            return this.packedBinaryOp(a, b, binaryop_packed_gpu.LESS, 'bool');
        }
        const program = new BinaryOpProgram(binaryop_gpu.LESS, a.shape, b.shape);
        return this.compileAndRun(program, [a, b], 'bool');
    }
    lessEqual(a, b) {
        if (env().getBool('WEBGL_PACK_BINARY_OPERATIONS')) {
            return this.packedBinaryOp(a, b, binaryop_packed_gpu.LESS_EQUAL, 'bool');
        }
        const program = new BinaryOpProgram(binaryop_gpu.LESS_EQUAL, a.shape, b.shape);
        return this.compileAndRun(program, [a, b], 'bool');
    }
    greater(a, b) {
        const cpuRes = this.tryRunOnCpuOrThrow([a, b], () => this.cpuBackend.greater(a, b));
        if (cpuRes) {
            return cpuRes;
        }
        if (env().getBool('WEBGL_PACK_BINARY_OPERATIONS')) {
            return this.packedBinaryOp(a, b, binaryop_packed_gpu.GREATER, 'bool');
        }
        const program = new BinaryOpProgram(binaryop_gpu.GREATER, a.shape, b.shape);
        return this.compileAndRun(program, [a, b], 'bool');
    }
    greaterEqual(a, b) {
        if (env().getBool('WEBGL_PACK_BINARY_OPERATIONS')) {
            return this.packedBinaryOp(a, b, binaryop_packed_gpu.GREATER_EQUAL, 'bool');
        }
        const program = new BinaryOpProgram(binaryop_gpu.GREATER_EQUAL, a.shape, b.shape);
        return this.compileAndRun(program, [a, b], 'bool');
    }
    logicalNot(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.LOGICAL_NOT);
        return this.compileAndRun(program, [x]);
    }
    logicalAnd(a, b) {
        if (env().getBool('WEBGL_PACK_BINARY_OPERATIONS')) {
            return this.packedBinaryOp(a, b, binaryop_packed_gpu.LOGICAL_AND, 'bool');
        }
        const program = new BinaryOpProgram(binaryop_gpu.LOGICAL_AND, a.shape, b.shape);
        return this.compileAndRun(program, [a, b], 'bool');
    }
    logicalOr(a, b) {
        if (env().getBool('WEBGL_PACK_BINARY_OPERATIONS')) {
            return this.packedBinaryOp(a, b, binaryop_packed_gpu.LOGICAL_OR, 'bool');
        }
        const program = new BinaryOpProgram(binaryop_gpu.LOGICAL_OR, a.shape, b.shape);
        return this.compileAndRun(program, [a, b], 'bool');
    }
    select(condition, a, b) {
        const program = new SelectProgram(condition.rank, a.shape, a.rank);
        return this.compileAndRun(program, [condition, a, b], upcastType(a.dtype, b.dtype));
    }
    where(condition) {
        backend_util.warn('tf.where() in webgl locks the UI thread. ' +
            'Call tf.whereAsync() instead');
        const condVals = condition.dataSync();
        return whereImpl(condition.shape, condVals);
    }
    topk(x, k, sorted) {
        const xVals = x.dataSync();
        return topkImpl(xVals, x.shape, x.dtype, k, sorted);
    }
    min(x, axes) {
        backend_util.assertAxesAreInnerMostDims('min', axes, x.rank);
        const [outShape, reduceShape] = backend_util.computeOutAndReduceShapes(x.shape, axes);
        const inSize = util.sizeFromShape(reduceShape);
        const a2D = x.as2D(-1, inSize);
        return this.reduce(a2D, 'min', a2D.dtype).reshape(outShape);
    }
    minimum(a, b) {
        const cpuRes = this.tryRunOnCpuOrThrow([a, b], () => this.cpuBackend.minimum(a, b));
        if (cpuRes) {
            return cpuRes;
        }
        const program = env().getBool('WEBGL_PACK_BINARY_OPERATIONS') ?
            new BinaryOpPackedProgram(binaryop_packed_gpu.MIN, a.shape, b.shape) :
            new BinaryOpProgram(binaryop_gpu.MIN, a.shape, b.shape);
        return this.compileAndRun(program, [a, b]);
    }
    mod(a, b) {
        const program = env().getBool('WEBGL_PACK_BINARY_OPERATIONS') ?
            new BinaryOpPackedProgram(binaryop_packed_gpu.MOD, a.shape, b.shape) :
            new BinaryOpProgram(binaryop_gpu.MOD, a.shape, b.shape);
        return this.compileAndRun(program, [a, b]);
    }
    maximum(a, b) {
        const cpuRes = this.tryRunOnCpuOrThrow([a, b], () => this.cpuBackend.maximum(a, b));
        if (cpuRes) {
            return cpuRes;
        }
        const program = env().getBool('WEBGL_PACK_BINARY_OPERATIONS') ?
            new BinaryOpPackedProgram(binaryop_packed_gpu.MAX, a.shape, b.shape) :
            new BinaryOpProgram(binaryop_gpu.MAX, a.shape, b.shape);
        return this.compileAndRun(program, [a, b]);
    }
    all(x, axes) {
        backend_util.assertAxesAreInnerMostDims('all', axes, x.rank);
        const [outShape, reduceShape] = backend_util.computeOutAndReduceShapes(x.shape, axes);
        const inSize = util.sizeFromShape(reduceShape);
        const a2D = x.as2D(-1, inSize);
        return this.reduce(a2D, 'all', a2D.dtype).reshape(outShape);
    }
    any(x, axes) {
        backend_util.assertAxesAreInnerMostDims('any', axes, x.rank);
        const [outShape, reduceShape] = backend_util.computeOutAndReduceShapes(x.shape, axes);
        const inSize = util.sizeFromShape(reduceShape);
        const a2D = x.as2D(-1, inSize);
        return this.reduce(a2D, 'any', a2D.dtype).reshape(outShape);
    }
    floorDiv(a, b) {
        const op = binaryop_gpu.INT_DIV;
        const outputDtype = 'int32';
        if (env().getBool('WEBGL_PACK_BINARY_OPERATIONS')) {
            return this.packedBinaryOp(a, b, binaryop_packed_gpu.INT_DIV, outputDtype);
        }
        const program = new BinaryOpProgram(op, a.shape, b.shape);
        return this.compileAndRun(program, [a, b], outputDtype);
    }
    packedUnaryOp(x, op, dtype) {
        const program = new UnaryOpPackedProgram(x.shape, op);
        return this.compileAndRun(program, [x], dtype);
    }
    packedBinaryOp(a, b, op, dtype, checkOutOfBounds = false) {
        const program = new BinaryOpPackedProgram(op, a.shape, b.shape, checkOutOfBounds);
        return this.compileAndRun(program, [a, b], dtype);
    }
    // Returns a TensorInfo with the complex shape and the dataId of the
    // underlying part. We need to do this because a reshaped complex tensor is
    // not reflected in its parts.
    makeComplexComponentTensorInfo(complexTensor, complexPart) {
        return {
            dataId: complexPart.dataId,
            dtype: complexPart.dtype,
            shape: complexTensor.shape
        };
    }
    addN(tensors) {
        if (tensors.length === 1) {
            return tensors[0];
        }
        // Limit the number of uploaded textures for optimization.
        if (tensors.length > env().get('WEBGL_MAX_TEXTURES_IN_SHADER')) {
            const midIndex = Math.floor(tensors.length / 2);
            const leftSide = this.addN(tensors.slice(0, midIndex));
            const rightSide = this.addN(tensors.slice(midIndex));
            return this.addN([leftSide, rightSide]);
        }
        const dtype = tensors.map(t => t.dtype).reduce((d1, d2) => upcastType(d1, d2));
        const shapes = tensors.map(t => t.shape);
        // We can make sure shapes are identical in op level.
        const usePackedOp = env().getBool('WEBGL_PACK');
        const program = usePackedOp ?
            new AddNPackedProgram(tensors[0].shape, shapes) :
            new AddNProgram(tensors[0].shape, shapes);
        return this.compileAndRun(program, tensors, dtype);
    }
    pow(a, b) {
        const usePackedOp = env().getBool('WEBGL_PACK_BINARY_OPERATIONS');
        const program = usePackedOp ?
            new BinaryOpPackedProgram(binaryop_packed_gpu.POW, a.shape, b.shape) :
            new BinaryOpProgram(binaryop_gpu.POW, a.shape, b.shape);
        const dtype = upcastType(a.dtype, b.dtype);
        return this.compileAndRun(program, [a, b], dtype);
    }
    ceil(x) {
        if (this.shouldExecuteOnCPU([x])) {
            const outValues = ceilImplCPU(this.texData.get(x.dataId).values, x.dtype);
            return this.makeOutput(x.shape, x.dtype, outValues);
        }
        if (env().getBool('WEBGL_PACK_UNARY_OPERATIONS')) {
            return this.packedUnaryOp(x, unary_op.CEIL, x.dtype);
        }
        const program = new UnaryOpProgram(x.shape, unary_op.CEIL);
        return this.compileAndRun(program, [x]);
    }
    floor(x) {
        if (this.shouldExecuteOnCPU([x])) {
            const outValues = floorImplCPU(this.texData.get(x.dataId).values, x.dtype);
            return this.makeOutput(x.shape, x.dtype, outValues);
        }
        if (env().getBool('WEBGL_PACK_UNARY_OPERATIONS')) {
            return this.packedUnaryOp(x, unary_op.FLOOR, x.dtype);
        }
        const program = new UnaryOpProgram(x.shape, unary_op.FLOOR);
        return this.compileAndRun(program, [x]);
    }
    sign(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.SIGN);
        return this.compileAndRun(program, [x]);
    }
    isNaN(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.IS_NAN);
        return this.compileAndRun(program, [x], 'bool');
    }
    isInf(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.IS_INF);
        return this.compileAndRun(program, [x], 'bool');
    }
    isFinite(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.IS_FINITE);
        return this.compileAndRun(program, [x], 'bool');
    }
    round(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.ROUND);
        return this.compileAndRun(program, [x]);
    }
    exp(x) {
        if (this.shouldExecuteOnCPU([x])) {
            const outValues = expImplCPU(this.texData.get(x.dataId).values, x.dtype);
            return this.makeOutput(x.shape, x.dtype, outValues);
        }
        if (env().getBool('WEBGL_PACK_UNARY_OPERATIONS')) {
            return this.packedUnaryOp(x, unary_op.EXP, x.dtype);
        }
        const program = new UnaryOpProgram(x.shape, unary_op.EXP);
        return this.compileAndRun(program, [x]);
    }
    expm1(x) {
        if (this.shouldExecuteOnCPU([x])) {
            const outValues = expm1ImplCPU(this.texData.get(x.dataId).values, x.dtype);
            return this.makeOutput(x.shape, x.dtype, outValues);
        }
        if (env().getBool('WEBGL_PACK_UNARY_OPERATIONS')) {
            return this.packedUnaryOp(x, unary_op.EXPM1, x.dtype);
        }
        const program = new UnaryOpProgram(x.shape, unary_op.EXPM1);
        return this.compileAndRun(program, [x]);
    }
    softmax(logits, dim) {
        const axes = util.parseAxisParam([dim], logits.shape);
        // TODO(annxingyuan): Call maxImpl rather than op as part of softmax kernel
        // modularization.
        const maxLogit = max(logits, axes);
        const expandedShape = backend_util.expandShapeToKeepDim(maxLogit.shape, axes);
        // TODO(annxingyuan): Call sub directly as part of softmax kernel
        // modularization.
        const a = tf.sub(logits, maxLogit.reshape(expandedShape));
        const b = this.exp(a);
        const sumExp = this.sum(b, axes).reshape(expandedShape);
        // TODO(annxingyuan): Call divImpl rather than op as part of softmax kernel
        // modularization.
        return div(b, sumExp);
    }
    log(x) {
        if (this.shouldExecuteOnCPU([x])) {
            const outValues = logImplCPU(this.texData.get(x.dataId).values, x.dtype);
            return this.makeOutput(x.shape, x.dtype, outValues);
        }
        if (env().getBool('WEBGL_PACK_UNARY_OPERATIONS')) {
            return this.packedUnaryOp(x, unary_packed_op.LOG, x.dtype);
        }
        const program = new UnaryOpProgram(x.shape, unary_op.LOG);
        return this.compileAndRun(program, [x]);
    }
    log1p(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.LOG1P);
        return this.compileAndRun(program, [x]);
    }
    sqrt(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.SQRT);
        return this.compileAndRun(program, [x]);
    }
    rsqrt(x) {
        if (this.shouldExecuteOnCPU([x])) {
            const outValues = rsqrtImplCPU(this.texData.get(x.dataId).values, x.dtype);
            return this.makeOutput(x.shape, x.dtype, outValues);
        }
        const program = new UnaryOpProgram(x.shape, unary_op.RSQRT);
        return this.compileAndRun(program, [x]);
    }
    reciprocal(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.RECIPROCAL);
        return this.compileAndRun(program, [x]);
    }
    relu(x) {
        let program;
        if (env().getBool('WEBGL_PACK')) {
            program = new UnaryOpPackedProgram(x.shape, unary_packed_op.RELU);
        }
        else {
            program = new UnaryOpProgram(x.shape, unary_op.RELU);
        }
        return this.compileAndRun(program, [x]);
    }
    relu6(x) {
        let program;
        if (env().getBool('WEBGL_PACK')) {
            program = new UnaryOpPackedProgram(x.shape, unary_packed_op.RELU6);
        }
        else {
            program = new UnaryOpProgram(x.shape, unary_op.RELU6);
        }
        return this.compileAndRun(program, [x]);
    }
    prelu(x, alpha) {
        const program = env().getBool('WEBGL_PACK_BINARY_OPERATIONS') ?
            new BinaryOpPackedProgram(binaryop_packed_gpu.PRELU, x.shape, alpha.shape) :
            new BinaryOpProgram(binaryop_gpu.PRELU, x.shape, alpha.shape);
        return this.compileAndRun(program, [x, alpha]);
    }
    elu(x) {
        if (env().getBool('WEBGL_PACK_UNARY_OPERATIONS')) {
            return this.packedUnaryOp(x, unary_packed_op.ELU, x.dtype);
        }
        const program = new UnaryOpProgram(x.shape, unary_op.ELU);
        return this.compileAndRun(program, [x]);
    }
    eluDer(dy, y) {
        const program = env().getBool('WEBGL_PACK_BINARY_OPERATIONS') ?
            new BinaryOpPackedProgram(binaryop_packed_gpu.ELU_DER, dy.shape, y.shape) :
            new BinaryOpProgram(binaryop_gpu.ELU_DER, dy.shape, y.shape);
        return this.compileAndRun(program, [dy, y]);
    }
    selu(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.SELU);
        return this.compileAndRun(program, [x]);
    }
    clip(x, min, max) {
        let program;
        if (env().getBool('WEBGL_PACK_CLIP')) {
            program = new ClipPackedProgram(x.shape);
        }
        else {
            program = new ClipProgram(x.shape);
        }
        const customSetup = program.getCustomSetupFunc(min, max);
        return this.compileAndRun(program, [x], null, customSetup);
    }
    abs(x) {
        // TODO: handle cases when x is complex.
        if (this.shouldExecuteOnCPU([x]) && x.dtype !== 'complex64') {
            const outValues = simpleAbsImplCPU(this.texData.get(x.dataId).values);
            return this.makeOutput(x.shape, x.dtype, outValues);
        }
        if (env().getBool('WEBGL_PACK_UNARY_OPERATIONS')) {
            return this.packedUnaryOp(x, unary_op.ABS, x.dtype);
        }
        const program = new UnaryOpProgram(x.shape, unary_op.ABS);
        return this.compileAndRun(program, [x]);
    }
    complexAbs(x) {
        const xData = this.texData.get(x.dataId);
        const program = new ComplexAbsProgram(x.shape);
        const inputs = [
            this.makeComplexComponentTensorInfo(x, xData.complexTensorInfos.real),
            this.makeComplexComponentTensorInfo(x, xData.complexTensorInfos.imag),
        ];
        return this.compileAndRun(program, inputs);
    }
    sigmoid(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.SIGMOID);
        return this.compileAndRun(program, [x]);
    }
    softplus(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.SOFTPLUS);
        return this.compileAndRun(program, [x]);
    }
    asin(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.ASIN);
        return this.compileAndRun(program, [x]);
    }
    acos(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.ACOS);
        return this.compileAndRun(program, [x]);
    }
    atan(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.ATAN);
        return this.compileAndRun(program, [x]);
    }
    sinh(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.SINH);
        return this.compileAndRun(program, [x]);
    }
    cosh(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.COSH);
        return this.compileAndRun(program, [x]);
    }
    tanh(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.TANH);
        return this.compileAndRun(program, [x]);
    }
    asinh(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.ASINH);
        return this.compileAndRun(program, [x]);
    }
    acosh(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.ACOSH);
        return this.compileAndRun(program, [x]);
    }
    atanh(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.ATANH);
        return this.compileAndRun(program, [x]);
    }
    erf(x) {
        const program = new UnaryOpProgram(x.shape, unary_op.ERF);
        return this.compileAndRun(program, [x]);
    }
    step(x, alpha) {
        const program = new UnaryOpProgram(x.shape, unary_op.STEP(alpha));
        return this.compileAndRun(program, [x]);
    }
    conv2dByMatMul(x, filter, convInfo, bias, activation, preluActivationWeights) {
        // Reshapes conv2D input to 2D tensors, uses matMul and then reshape the
        // result from 2D to 4D.
        const xShape = x.shape;
        const xTexData = this.texData.get(x.dataId);
        const sharedMatMulDim = convInfo.inChannels;
        const outerShapeX = xShape[0] * xShape[1] * xShape[2];
        const outerShapeFilter = convInfo.outChannels;
        const isChannelsLast = convInfo.dataFormat === 'channelsLast';
        const transposeA = false;
        const transposeB = false;
        // TODO: Once reduction ops are packed, batchMatMul will always be packed
        // and we can remove this condition.
        const batchMatMulWillBeUnpacked = (outerShapeX === 1 || outerShapeFilter === 1) &&
            sharedMatMulDim > MATMUL_SHARED_DIM_THRESHOLD;
        const reshapeWillBeExpensive = xShape[2] % 2 !== 0 && !!xTexData.isPacked;
        if (batchMatMulWillBeUnpacked || !env().getBool('WEBGL_LAZILY_UNPACK') ||
            !env().getBool('WEBGL_PACK_BINARY_OPERATIONS') ||
            !reshapeWillBeExpensive) {
            const targetShape = isChannelsLast ? xShape[0] * xShape[1] * xShape[2] :
                xShape[0] * xShape[2] * xShape[3];
            const xReshaped = reshape(x, [1, targetShape, convInfo.inChannels]);
            const filterReshaped = reshape(filter, [1, convInfo.inChannels, convInfo.outChannels]);
            const result = this.fusedBatchMatMul({
                a: xReshaped,
                b: filterReshaped,
                transposeA,
                transposeB,
                bias,
                activation,
                preluActivationWeights
            });
            return reshape(result, convInfo.outShape);
        }
        // Following optimization is specific to packed |x| with odd row count
        // (For example, in channelLast mode, 'row count' refers to x.shape[2]):
        // we avoid expensive packed 2x2 reshape by padding row count to next,
        // even number. When x.shape[2] is odd, the result of packed batchMatMul is
        // the same (has the same texture layout and and values in the texture) as
        // it is for even x.shape[2] + 1. We make the odd-rows tensor to look like
        // even-rows tensor before the operation and, after the batchMatMul,
        // fix the even-rows result to have odd number of rows.
        const targetShape = isChannelsLast ?
            xShape[0] * xShape[1] * (xShape[2] + 1) :
            xShape[0] * xShape[2] * (xShape[3] + 1);
        const xReshaped = {
            dataId: x.dataId,
            shape: [1, targetShape, convInfo.inChannels],
            dtype: x.dtype
        };
        // xTexData.shape gets referenced from GPGPUBinary.inShapeInfos.
        // Decrementing row count, after batchMatMul->...->compileProgram leads to
        // invalid row count within the reference in GPGPUBinary.inShapeInfos.
        // Alternative fix would be to provide a copy to GPGPUBinary.inShapeInfos
        // in compileProgram method, but that would affect compilation of all
        // programs - instead, provide a copy here, with even row count, before
        // calling batchMatMul->...->compileProgram and after that, the original
        // xTexData.shape is restored.
        const originalXTexDataShape = xTexData.shape;
        xTexData.shape = xTexData.shape.slice();
        xTexData.shape[xTexData.shape.length - 2]++;
        util.assert(webgl_util.isReshapeFree(xTexData.shape, xReshaped.shape), () => `packed reshape ${xTexData.shape} to ${xReshaped.shape} isn't free`);
        const filterReshaped = reshape(filter, [1, convInfo.inChannels, convInfo.outChannels]);
        const pointwiseConv = this.fusedBatchMatMul({
            a: xReshaped,
            b: filterReshaped,
            transposeA,
            transposeB,
            bias,
            activation,
            preluActivationWeights
        });
        const pointwiseConvTexData = this.texData.get(pointwiseConv.dataId);
        util.assert(pointwiseConvTexData.isPacked, () => 'batchMatMul result is expected to be packed');
        // Restore the input shape to original.
        xTexData.shape = originalXTexDataShape;
        // Set the output shape - there is no need for expensive reshape as data
        // layout is already correct.
        pointwiseConvTexData.shape = convInfo.outShape;
        return engine().makeTensorFromDataId(pointwiseConv.dataId, convInfo.outShape, pointwiseConv.dtype);
    }
    conv2dWithIm2Row(x, filter, convInfo, bias, activation, preluActivationWeights) {
        // Rearranges conv2d input so each block to be convolved over forms the
        // column of a new matrix with shape [filterWidth * filterHeight *
        // inChannels, outHeight * outWidth]. The filter is also rearranged so each
        // output channel forms a row of a new matrix with shape [outChannels,
        // filterWidth * filterHeight * inChannels]. The convolution is then
        // computed by multiplying these matrices and reshaping the result.
        const { filterWidth, filterHeight, inChannels, outWidth, outHeight, dataFormat } = convInfo;
        const isChannelsLast = dataFormat === 'channelsLast';
        const sharedDim = filterWidth * filterHeight * inChannels;
        const numCols = outHeight * outWidth;
        const x2ColShape = [sharedDim, numCols];
        const transposeA = true;
        const transposeB = false;
        const xSqueezed = x.squeeze([0]);
        const w2Row = filter.reshape([1, sharedDim, -1]);
        const im2ColProgram = new Im2ColPackedProgram(x2ColShape, xSqueezed.shape, convInfo);
        const im2Col = this.compileAndRun(im2ColProgram, [xSqueezed]).reshape([
            1, x2ColShape[0], x2ColShape[1]
        ]);
        const hasBias = bias != null;
        const hasPreluActivationWeights = preluActivationWeights != null;
        const fusedActivation = activation ? mapActivationToShaderProgram(activation, true) : null;
        const matmulProgram = new MatMulPackedProgram(im2Col.shape, w2Row.shape, [1, numCols, convInfo.outChannels], transposeA, transposeB, hasBias, fusedActivation, hasPreluActivationWeights);
        const inputs = [im2Col, w2Row];
        if (bias) {
            inputs.push(bias);
        }
        if (hasPreluActivationWeights) {
            inputs.push(preluActivationWeights);
        }
        const product = this.compileAndRun(matmulProgram, inputs);
        if (isChannelsLast) {
            return product.reshape([1, outHeight, outWidth, convInfo.outChannels]);
        }
        else {
            return product.reshape([1, convInfo.outChannels, outHeight, outWidth]);
        }
    }
    fusedConv2d({ input, filter, convInfo, bias, activation, preluActivationWeights }) {
        if (convInfo.filterHeight === 1 && convInfo.filterWidth === 1 &&
            convInfo.dilationHeight === 1 && convInfo.dilationWidth === 1 &&
            convInfo.strideHeight === 1 && convInfo.strideWidth === 1 &&
            (convInfo.padInfo.type === 'SAME' ||
                convInfo.padInfo.type === 'VALID')) {
            return this.conv2dByMatMul(input, filter, convInfo, bias, activation, preluActivationWeights);
        }
        if (env().getBool('WEBGL_CONV_IM2COL') && input.shape[0] === 1) {
            return this.conv2dWithIm2Row(input, filter, convInfo, bias, activation, preluActivationWeights);
        }
        const hasBias = bias != null;
        const hasPreluActivationWeights = preluActivationWeights != null;
        const fusedActivation = activation ? mapActivationToShaderProgram(activation, false) : null;
        const program = new Conv2DProgram(convInfo, hasBias, fusedActivation, hasPreluActivationWeights);
        const inputs = [input, filter];
        if (bias) {
            inputs.push(bias);
        }
        if (preluActivationWeights) {
            inputs.push(preluActivationWeights);
        }
        return this.compileAndRun(program, inputs);
    }
    conv2d(x, filter, convInfo) {
        if (convInfo.filterHeight === 1 && convInfo.filterWidth === 1 &&
            convInfo.dilationHeight === 1 && convInfo.dilationWidth === 1 &&
            convInfo.strideHeight === 1 && convInfo.strideWidth === 1 &&
            (convInfo.padInfo.type === 'SAME' ||
                convInfo.padInfo.type === 'VALID')) {
            return this.conv2dByMatMul(x, filter, convInfo);
        }
        if (env().getBool('WEBGL_CONV_IM2COL') && x.shape[0] === 1) {
            return this.conv2dWithIm2Row(x, filter, convInfo);
        }
        const program = new Conv2DProgram(convInfo);
        return this.compileAndRun(program, [x, filter]);
    }
    conv2dDerInput(dy, filter, convInfo) {
        const program = new Conv2DDerInputProgram(convInfo);
        return this.compileAndRun(program, [dy, filter]);
    }
    conv2dDerFilter(x, dy, convInfo) {
        const program = new Conv2DDerFilterProgram(convInfo);
        return this.compileAndRun(program, [x, dy]);
    }
    fusedDepthwiseConv2D({ input, filter, convInfo, bias, activation, preluActivationWeights }) {
        const shouldPackDepthwiseConv = env().getBool('WEBGL_PACK_DEPTHWISECONV') &&
            convInfo.strideWidth <= 2 &&
            convInfo.outChannels / convInfo.inChannels === 1;
        const fusedActivation = activation ?
            mapActivationToShaderProgram(activation, shouldPackDepthwiseConv) :
            null;
        const inputs = [input, filter];
        const hasBias = bias != null;
        const hasPreluActivationWeights = preluActivationWeights != null;
        if (hasBias) {
            inputs.push(bias);
        }
        if (hasPreluActivationWeights) {
            inputs.push(preluActivationWeights);
        }
        let program;
        if (shouldPackDepthwiseConv) {
            program = new DepthwiseConvPacked2DProgram(convInfo, hasBias, fusedActivation, hasPreluActivationWeights);
            return this.compileAndRun(program, inputs);
        }
        program = new DepthwiseConv2DProgram(convInfo, hasBias, fusedActivation, hasPreluActivationWeights);
        return this.compileAndRun(program, inputs);
    }
    depthwiseConv2D(x, filter, convInfo) {
        let program;
        if (env().getBool('WEBGL_PACK_DEPTHWISECONV') &&
            convInfo.strideWidth <= 2 &&
            convInfo.outChannels / convInfo.inChannels === 1) {
            program = new DepthwiseConvPacked2DProgram(convInfo);
            return this.compileAndRun(program, [x, filter]);
        }
        program = new DepthwiseConv2DProgram(convInfo);
        return this.compileAndRun(program, [x, filter]);
    }
    depthwiseConv2DDerInput(dy, filter, convInfo) {
        const program = new DepthwiseConv2DDerInputProgram(convInfo);
        return this.compileAndRun(program, [dy, filter]);
    }
    depthwiseConv2DDerFilter(x, dy, convInfo) {
        const program = new DepthwiseConv2DDerFilterProgram(convInfo);
        return this.compileAndRun(program, [x, dy]);
    }
    conv3d(x, filter, convInfo) {
        const program = new Conv3DProgram(convInfo);
        return this.compileAndRun(program, [x, filter]);
    }
    conv3dDerInput(dy, filter, convInfo) {
        const program = new Conv3DDerInputProgram(convInfo);
        return this.compileAndRun(program, [dy, filter]);
    }
    conv3dDerFilter(x, dy, convInfo) {
        const program = new Conv3DDerFilterProgram(convInfo);
        return this.compileAndRun(program, [x, dy]);
    }
    unstack(x, axis) {
        const num = x.shape[axis];
        const outShape = new Array(x.rank - 1);
        let outIndex = 0;
        for (let i = 0; i < x.rank; i++) {
            if (i !== axis) {
                outShape[outIndex++] = x.shape[i];
            }
        }
        const begin = new Array(x.rank).fill(0);
        const size = x.shape.slice();
        size[axis] = 1;
        const res = new Array(num);
        for (let i = 0; i < res.length; i++) {
            begin[axis] = i;
            res[i] = this.slice(x, begin, size).reshape(outShape);
        }
        return res;
    }
    avgPool3d(x, convInfo) {
        const program = new Pool3DProgram(convInfo, 'avg', false);
        return this.compileAndRun(program, [x], 'float32');
    }
    avgPool3dBackprop(dy, x, convInfo) {
        const avgPool3dBackpropProgram = new AvgPool3DBackpropProgram(convInfo);
        return this.compileAndRun(avgPool3dBackpropProgram, [dy], x.dtype);
    }
    maxPool3d(x, convInfo) {
        const program = new Pool3DProgram(convInfo, 'max', false);
        return this.compileAndRun(program, [x], 'float32');
    }
    maxPool3dBackprop(dy, x, y, convInfo) {
        const getPositions = true;
        const maxPool3dPositionsProgram = new Pool3DProgram(convInfo, 'max', getPositions);
        const maxPool3dPositions = this.compileAndRun(maxPool3dPositionsProgram, [x]);
        const maxPool3dBackPropProgram = new MaxPool3DBackpropProgram(convInfo);
        const result = this.compileAndRun(maxPool3dBackPropProgram, [dy, maxPool3dPositions], x.dtype);
        maxPool3dPositions.dispose();
        return result;
    }
    resizeBilinear(x, newHeight, newWidth, alignCorners) {
        const program = env().getBool('WEBGL_PACK_IMAGE_OPERATIONS') ?
            new ResizeBilinearPackedProgram(x.shape, newHeight, newWidth, alignCorners) :
            new ResizeBilinearProgram(x.shape, newHeight, newWidth, alignCorners);
        return this.compileAndRun(program, [x], 'float32');
    }
    resizeBilinearBackprop(dy, x, alignCorners) {
        const program = new ResizeBilinearBackpropProgram(dy, x, alignCorners);
        return this.compileAndRun(program, [dy]);
    }
    resizeNearestNeighbor(x, newHeight, newWidth, alignCorners) {
        const program = new ResizeNearestNeighborProgram(x.shape, newHeight, newWidth, alignCorners);
        return this.compileAndRun(program, [x]);
    }
    resizeNearestNeighborBackprop(dy, x, alignCorners) {
        const program = new ResizeNearestNeigborBackpropProgram(dy, x, alignCorners);
        return this.compileAndRun(program, [dy]);
    }
    multinomial(logits, normalized, numSamples, seed) {
        const probs = normalized ? logits : softmax(logits);
        const batchSize = probs.shape[0];
        const numOutcomes = probs.shape[1];
        const program = new MultinomialProgram(batchSize, numOutcomes, numSamples);
        const customSetup = program.getCustomSetupFunc(seed);
        return this.compileAndRun(program, [probs], 'int32', customSetup);
    }
    oneHot(indices, depth, onValue, offValue) {
        const program = new OneHotProgram(indices.size, depth, onValue, offValue);
        return this.compileAndRun(program, [indices]);
    }
    diag(x) {
        const program = new DiagProgram(x.size);
        return this.compileAndRun(program, [x]);
    }
    cropAndResize(image, boxes, boxIndex, cropSize, method, extrapolationValue) {
        const program = new CropAndResizeProgram(image.shape, boxes.shape, cropSize, method, extrapolationValue);
        return this.compileAndRun(program, [image, boxes, boxIndex], 'float32');
    }
    depthToSpace(x, blockSize, dataFormat) {
        util.assert(blockSize > 1, () => `blockSize should be > 1 for depthToSpace, but was: ${blockSize}`);
        const batchSize = x.shape[0];
        const inputHeight = (dataFormat === 'NHWC') ? x.shape[1] : x.shape[2];
        const inputWidth = (dataFormat === 'NHWC') ? x.shape[2] : x.shape[3];
        const inputDepth = (dataFormat === 'NHWC') ? x.shape[3] : x.shape[1];
        const outputHeight = inputHeight * blockSize;
        const outputWidth = inputWidth * blockSize;
        const outputDepth = inputDepth / (blockSize * blockSize);
        const outputShape = (dataFormat === 'NHWC') ?
            [batchSize, outputHeight, outputWidth, outputDepth] :
            [batchSize, outputDepth, outputHeight, outputWidth];
        const program = new DepthToSpaceProgram(outputShape, blockSize, dataFormat);
        return this.compileAndRun(program, [x]);
    }
    split(x, sizeSplits, axis) {
        return split(x, sizeSplits, axis);
    }
    scatterND(indices, updates, shape) {
        const { sliceRank, numUpdates, sliceSize, strides, outputSize } = backend_util.calculateShapes(updates, indices, shape);
        const flattenShape = [outputSize / sliceSize, sliceSize];
        const flattenIndices = indices.reshape([numUpdates, sliceRank]);
        const flattenX = updates.reshape([numUpdates, sliceSize]);
        if (outputSize === 0) {
            return backend_util.reshapeTensor(tensor([]), shape);
        }
        const defaultValue = scalar(0);
        const program = new ScatterProgram(numUpdates, sliceRank, flattenIndices.rank, flattenX.rank, strides, flattenShape);
        const res = this.compileAndRun(program, [flattenX, flattenIndices, defaultValue]);
        return res.reshape(shape);
    }
    sparseToDense(sparseIndices, sparseValues, outputShape, defaultValue) {
        const { sliceRank, numUpdates, strides, outputSize } = backend_util.calculateShapes(sparseValues, sparseIndices, outputShape);
        const sumDupeIndices = false;
        const program = new ScatterProgram(numUpdates, sliceRank, sparseIndices.rank, sparseValues.rank, strides, [outputSize, 1], sumDupeIndices);
        const res = this.compileAndRun(program, [sparseValues, sparseIndices, defaultValue]);
        return res.reshape(outputShape);
    }
    gatherND(x, indices) {
        const indicesShape = indices.shape;
        const sliceRank = indicesShape[indicesShape.length - 1];
        const [resultShape, numSlices, sliceSize, strides] = backend_util.prepareAndValidate(x, indices);
        const flattenIndices = indices.reshape([numSlices, sliceRank]);
        const flattenX = x.reshape([x.size / sliceSize, sliceSize]);
        const program = new GatherNDProgram(sliceRank, strides, [numSlices, sliceSize]);
        const res = this.compileAndRun(program, [flattenX, flattenIndices]);
        return res.reshape(resultShape);
    }
    fill(shape, value, dtype) {
        dtype = dtype || util.inferDtype(value);
        if (dtype === 'string') {
            // String type should be handled in CPU memory.
            const values = util.getArrayFromDType(dtype, util.sizeFromShape(shape));
            values.fill(value);
            return engine().makeTensor(values, shape, dtype, this);
        }
        else {
            const program = new FillProgram(shape, value);
            const customSetup = program.getCustomSetupFunc(value);
            return this.compileAndRun(program, [], dtype, customSetup);
        }
    }
    onesLike(x) {
        if (x.dtype === 'string') {
            throw new Error('onesLike is not supported under string dtype');
        }
        else {
            // TODO(cais, smilkov): Add WebGL shader for onesLike:
            //   https://github.com/tensorflow/tfjs/issues/1293
            return this.fill(x.shape, 1, x.dtype);
        }
    }
    zerosLike(x) {
        return this.fill(x.shape, x.dtype === 'string' ? '' : 0, x.dtype);
    }
    linspace(start, stop, num) {
        // TODO: Use CPU implementation due to the precision problem in Safari.
        return backend_util.linspaceImpl(start, stop, num);
    }
    makeTensorInfo(shape, dtype, values) {
        const dataId = this.write(values, shape, dtype);
        this.texData.get(dataId).usage = null;
        return { dataId, shape, dtype };
    }
    makeOutput(shape, dtype, values) {
        const { dataId } = this.makeTensorInfo(shape, dtype, values);
        return engine().makeTensorFromDataId(dataId, shape, dtype, this);
    }
    unpackTensor(input) {
        const program = new UnpackProgram(input.shape);
        return this.runWebGLProgram(program, [input], input.dtype);
    }
    packTensor(input) {
        const program = new PackProgram(input.shape);
        const preventEagerUnpackingOutput = true;
        return this.runWebGLProgram(program, [input], input.dtype, null /* customSetup */, preventEagerUnpackingOutput);
    }
    packedReshape(input, afterShape) {
        const input3DShape = [
            webgl_util.getBatchDim(input.shape),
            ...webgl_util.getRowsCols(input.shape)
        ];
        const input3D = {
            dtype: input.dtype,
            shape: input3DShape,
            dataId: input.dataId
        };
        const afterShapeAs3D = [
            webgl_util.getBatchDim(afterShape), ...webgl_util.getRowsCols(afterShape)
        ];
        const program = new ReshapePackedProgram(afterShapeAs3D, input3DShape);
        const preventEagerUnpackingOfOutput = true;
        const output = this.runWebGLProgram(program, [input3D], input.dtype, null /* customSetup */, preventEagerUnpackingOfOutput);
        return { dataId: output.dataId, shape: afterShape, dtype: output.dtype };
    }
    decode(dataId) {
        const texData = this.texData.get(dataId);
        const { isPacked, shape, dtype } = texData;
        const shapeAs3D = webgl_util.getShapeAs3D(shape);
        let program;
        if (isPacked) {
            program = new DecodeMatrixPackedProgram(shapeAs3D);
        }
        else {
            program = new DecodeMatrixProgram(shapeAs3D);
        }
        const preventEagerUnpackingOfOutput = true;
        const out = this.runWebGLProgram(program, [{ shape: shapeAs3D, dtype, dataId }], dtype, null /* customSetup */, preventEagerUnpackingOfOutput);
        return { dtype, shape, dataId: out.dataId };
    }
    runWebGLProgram(program, inputs, outputDtype, customSetup, preventEagerUnpackingOfOutput = false) {
        const output = this.makeTensorInfo(program.outputShape, outputDtype);
        const outData = this.texData.get(output.dataId);
        if (program.packedOutput) {
            outData.isPacked = true;
        }
        if (program.outPackingScheme === tex_util.PackingScheme.DENSE) {
            const texelShape = tex_util.getDenseTexShape(program.outputShape);
            // For a densely packed output, we explicitly set texShape
            // so it doesn't get assigned later according to our typical packing
            // scheme wherein a single texel can only contain values from adjacent
            // rows/cols.
            outData.texShape = texelShape.map(d => d * 2);
        }
        if (program.outTexUsage != null) {
            outData.usage = program.outTexUsage;
        }
        if (util.sizeFromShape(output.shape) === 0) {
            // Short-circuit the computation since the result is empty (has 0 in its
            // shape).
            outData.values =
                util.getTypedArrayFromDType(output.dtype, 0);
            return output;
        }
        const dataToDispose = [];
        const inputsData = inputs.map(input => {
            if (input.dtype === 'complex64') {
                throw new Error(`GPGPUProgram does not support complex64 input. For complex64 ` +
                    `dtypes, please separate the program into real and imaginary ` +
                    `parts.`);
            }
            let texData = this.texData.get(input.dataId);
            if (texData.texture == null) {
                if (!program.packedInputs &&
                    util.sizeFromShape(input.shape) <=
                        env().getNumber('WEBGL_SIZE_UPLOAD_UNIFORM')) {
                    // Upload small tensors that live on the CPU as uniforms, not as
                    // textures. Do this only when the environment supports 32bit floats
                    // due to problems when comparing 16bit floats with 32bit floats.
                    // TODO(https://github.com/tensorflow/tfjs/issues/821): Make it
                    // possible for packed shaders to sample from uniforms.
                    return {
                        shape: input.shape,
                        texData: null,
                        isUniform: true,
                        uniformValues: texData.values
                    };
                }
                // This ensures that if a packed program's inputs have not yet been
                // uploaded to the GPU, they get uploaded as packed right off the bat.
                if (program.packedInputs) {
                    texData.isPacked = true;
                    texData.shape = input.shape;
                }
            }
            else if (!!texData.isPacked !== !!program.packedInputs) {
                input = texData.isPacked ? this.unpackTensor(input) :
                    this.packTensor(input);
                dataToDispose.push(input);
                texData = this.texData.get(input.dataId);
            }
            else if (texData.isPacked &&
                !webgl_util.isReshapeFree(texData.shape, input.shape)) {
                // This is a special case where a texture exists for a tensor
                // but the shapes are incompatible (due to packing constraints) because
                // the tensor did not have a chance to go through the packed reshape
                // shader. This only happens when we reshape the *same* tensor to form
                // *distinct* inputs to an op, e.g. dotting a vector with itself. This
                // case will disappear once packed uploading is the default.
                const savedInput = input;
                const targetShape = input.shape;
                input.shape = texData.shape;
                input = this.packedReshape(input, targetShape);
                dataToDispose.push(input);
                texData = this.texData.get(input.dataId);
                savedInput.shape = targetShape;
            }
            this.uploadToGPU(input.dataId);
            return { shape: input.shape, texData, isUniform: false };
        });
        this.uploadToGPU(output.dataId);
        const outputData = { shape: output.shape, texData: outData, isUniform: false };
        const key = gpgpu_math.makeShaderKey(program, inputsData, outputData);
        const binary = this.getAndSaveBinary(key, () => {
            return gpgpu_math.compileProgram(this.gpgpu, program, inputsData, outputData);
        });
        const shouldTimeProgram = this.activeTimers != null;
        let query;
        if (shouldTimeProgram) {
            query = this.startTimer();
        }
        gpgpu_math.runProgram(this.gpgpu, binary, inputsData, outputData, customSetup);
        dataToDispose.forEach(info => this.disposeIntermediateTensorInfo(info));
        if (shouldTimeProgram) {
            query = this.endTimer(query);
            this.activeTimers.push({ name: program.constructor.name, query: this.getQueryTime(query) });
        }
        if (!env().getBool('WEBGL_LAZILY_UNPACK') && outData.isPacked &&
            preventEagerUnpackingOfOutput === false) {
            const unpacked = this.unpackTensor(output);
            this.disposeIntermediateTensorInfo(output);
            return unpacked;
        }
        return output;
    }
    compileAndRun(program, inputs, outputDtype, customSetup, preventEagerUnpackingOfOutput = false) {
        outputDtype = outputDtype || inputs[0].dtype;
        const outInfo = this.runWebGLProgram(program, inputs, outputDtype, customSetup, preventEagerUnpackingOfOutput);
        return engine().makeTensorFromDataId(outInfo.dataId, outInfo.shape, outInfo.dtype);
    }
    getAndSaveBinary(key, getBinary) {
        if (!(key in this.binaryCache)) {
            this.binaryCache[key] = getBinary();
        }
        return this.binaryCache[key];
    }
    getTextureManager() {
        return this.textureManager;
    }
    dispose() {
        if (this.disposed) {
            return;
        }
        // Avoid disposing the compiled webgl programs during unit testing because
        // it slows down test execution.
        if (!env().getBool('IS_TEST')) {
            const allKeys = Object.keys(this.binaryCache);
            allKeys.forEach(key => {
                this.gpgpu.deleteProgram(this.binaryCache[key].webGLProgram);
                delete this.binaryCache[key];
            });
        }
        this.textureManager.dispose();
        if (this.canvas != null &&
            (typeof (HTMLCanvasElement) !== 'undefined' &&
                this.canvas instanceof HTMLCanvasElement)) {
            this.canvas.remove();
        }
        else {
            this.canvas = null;
        }
        if (this.gpgpuCreatedLocally) {
            this.gpgpu.program = null;
            this.gpgpu.dispose();
        }
        this.disposed = true;
    }
    floatPrecision() {
        if (this.floatPrecisionValue == null) {
            this.floatPrecisionValue = tidy(() => {
                if (!env().get('WEBGL_RENDER_FLOAT32_ENABLED')) {
                    // Momentarily switching DEBUG flag to false so we don't throw an
                    // error trying to upload a small value.
                    const debugFlag = env().getBool('DEBUG');
                    env().set('DEBUG', false);
                    const underflowCheckValue = this.abs(scalar(1e-8)).dataSync()[0];
                    env().set('DEBUG', debugFlag);
                    if (underflowCheckValue > 0) {
                        return 32;
                    }
                }
                return 16;
            });
        }
        return this.floatPrecisionValue;
    }
    /** Returns the smallest representable number.  */
    epsilon() {
        return this.floatPrecision() === 32 ? EPSILON_FLOAT32 : EPSILON_FLOAT16;
    }
    uploadToGPU(dataId) {
        const texData = this.texData.get(dataId);
        const { shape, dtype, values, texture, usage, isPacked } = texData;
        if (texture != null) {
            // Array is already on GPU. No-op.
            return;
        }
        const shouldTimeProgram = this.activeTimers != null;
        let start;
        if (shouldTimeProgram) {
            start = util.now();
        }
        let texShape = texData.texShape;
        if (texShape == null) {
            texShape = webgl_util.getTextureShapeFromLogicalShape(shape, isPacked);
            texData.texShape = texShape;
        }
        if (values != null) {
            const shapeAs3D = webgl_util.getShapeAs3D(shape);
            let program;
            let width = texShape[1], height = texShape[0];
            const isByteArray = values instanceof Uint8Array;
            if (isPacked) {
                [width, height] = tex_util.getPackedMatrixTextureShapeWidthHeight(texShape[0], texShape[1]);
                program = new EncodeMatrixPackedProgram(shapeAs3D, [height, width], isByteArray);
            }
            else {
                program =
                    new EncodeMatrixProgram(shapeAs3D, [height, width], isByteArray);
            }
            const tempDenseInputHandle = this.makeTensorInfo([height, width], dtype);
            if (isByteArray) {
                this.texData.get(tempDenseInputHandle.dataId).usage =
                    TextureUsage.PIXELS;
            }
            else {
                this.texData.get(tempDenseInputHandle.dataId).usage =
                    TextureUsage.UPLOAD;
            }
            this.gpgpu.uploadDenseMatrixToTexture(this.getTexture(tempDenseInputHandle.dataId), width, height, values);
            // We want the output to remain packed regardless of the value of
            // WEBGL_PACK.
            const preventEagerUnpacking = true;
            const encodedOutputTarget = this.runWebGLProgram(program, [tempDenseInputHandle], dtype, null, preventEagerUnpacking);
            // Have the original texture assume the identity of the encoded output.
            const outputTexData = this.texData.get(encodedOutputTarget.dataId);
            texData.texture = outputTexData.texture;
            texData.texShape = outputTexData.texShape;
            texData.isPacked = outputTexData.isPacked;
            texData.usage = outputTexData.usage;
            this.disposeIntermediateTensorInfo(tempDenseInputHandle);
            this.texData.delete(encodedOutputTarget.dataId);
            // Once uploaded, don't store the values on cpu.
            texData.values = null;
            if (shouldTimeProgram) {
                this.uploadWaitMs += util.now() - start;
            }
        }
        else {
            const newTexture = this.acquireTexture(texShape, usage, dtype, isPacked);
            texData.texture = newTexture;
        }
    }
    convertAndCacheOnCPU(dataId, float32Values) {
        const texData = this.texData.get(dataId);
        const { dtype } = texData;
        this.releaseGPUData(dataId);
        if (float32Values != null) {
            texData.values = float32ToTypedArray(float32Values, dtype);
        }
        return texData.values;
    }
    acquireTexture(texShape, texType, dtype, isPacked) {
        this.numBytesInGPU += this.computeBytes(texShape, dtype);
        if (!this.warnedAboutMemory &&
            this.numBytesInGPU > this.numMBBeforeWarning * 1024 * 1024) {
            const mb = (this.numBytesInGPU / 1024 / 1024).toFixed(2);
            this.warnedAboutMemory = true;
            console.warn(`High memory usage in GPU: ${mb} MB, ` +
                `most likely due to a memory leak`);
        }
        return this.textureManager.acquireTexture(texShape, texType, isPacked);
    }
    computeBytes(shape, dtype) {
        return shape[0] * shape[1] * util.bytesPerElement(dtype);
    }
    tryRunOnCpuOrThrow(inputs, fn) {
        if (this.shouldExecuteOnCPU(inputs)) {
            try {
                return fn();
            }
            catch (e) {
                if (env().getBool('IS_TEST')) {
                    throw new Error('CPU forwarding failed');
                }
            }
        }
        return null;
    }
}
function float32ToTypedArray(a, dtype) {
    if (dtype === 'float32' || dtype === 'complex64') {
        return a;
    }
    else if (dtype === 'int32' || dtype === 'bool') {
        const result = (dtype === 'int32') ? new Int32Array(a.length) :
            new Uint8Array(a.length);
        for (let i = 0; i < result.length; ++i) {
            result[i] = Math.round(a[i]);
        }
        return result;
    }
    else {
        throw new Error(`Unknown dtype ${dtype}`);
    }
}
//# sourceMappingURL=backend_webgl.js.map