#!/usr/bin/env julia

using DelimitedFiles
using KernelAbstractions, Adapt, OffsetArrays

const BACKEND = :CUDA

if BACKEND == :CUDA
    using CUDA, CUDAKernels
    const ArrayT = CuArray
    const Device = CUDADevice
elseif BACKEND == :AMD
    using AMDGPU, ROCMKernels
    const ArrayT = CuArray
    const Device = CUDADevice
else BACKEND == :CPU
    const ArrayT = Array
    const Device = CPU
end

function heat1d_ftcs_cpu(diffusivity, xlength, left_boundary,
                         right_boundary, init_fn,
                         dt, nsteps, dx, print_at_steps, outf,
                         psource_x, psource_value)
    npoints = ceil(Int, xlength / dx) + 1
    T1 = zeros(Float32, npoints)
    T2 = similar(T1)
    xs = range(0.0, xlength, length=npoints)

    c = diffusivity * dt / dx^2
    psource_offset = ceil(Int, psource_x / xlength * npoints)
    psource_dt = psource_value * dt

    if c > 0.5
        println("c = ", c)
        return -1
    end

    T1[1] = left_boundary
    T1[end] = right_boundary
    T2[1] = left_boundary
    T2[end] = right_boundary

    T1[2:end-1] = init_fn.(xs[2:end-1])

    # first line of output is x values, for plotting
    writedlm(outf, xs', ',')

    # output initial condition state
    writedlm(outf, T1', ',')

    for j in 1:2:nsteps
        for i in 2:npoints-1
            T2[i] = T1[i] + c * (T1[i+1] - 2.0 * T1[i] + T1[i-1])
            if i == psource_offset

                T2[i] += psource_dt
            end
        end
        for i in 2:npoints-1
            T1[i] = T2[i] + c * (T2[i+1] - 2.0 * T2[i] + T2[i-1])
            if i == psource_offset
                T1[i] += psource_dt
            end
        end
        if mod(j+1, print_at_steps) == 0
            writedlm(outf, T1', ',')
        end
    end
end

@kernel function gpu_ftcs_step!(T, Told, c, psource_offset, psource_dt)
    i = @index(Global) + 1
    #start_idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x + 1
    #stride = gridDim().x * blockDim().x
    @inbounds Ti = Told[i] + c * (Told[i+1] - 2.0 * Told[i] + Told[i-1])
    if i == psource_offset
        Ti += psource_dt
    end
    T[i] = Ti
end

function heat1d_ftcs_ka(diffusivity, xlength, left_boundary,
                        right_boundary, init_fn,
                        dt, nsteps, dx, print_at_steps, outf,
                        psource_x, psource_value)
    npoints = Int(ceil(xlength / dx)) + 1
    T = zeros(Float32, npoints)
    xs = range(0.0, xlength, length=npoints)

    T1_d = CuArray{Float32}(undef, length(T))
    T2_d = similar(T1_d)

    c = diffusivity * dt / dx^2
    psource_offset = ceil(Int, psource_x / xlength * npoints)
    psource_dt = psource_value * dt

    if c > 0.5
        println("c = ", c)
        return -1
    end

    T[1] = left_boundary
    T[end] = right_boundary

    T[2:end-1] = init_fn.(xs[2:end-1])

    # first line of output is x values, for plotting
    writedlm(outf, xs', ',')

    # output initial condition state
    writedlm(outf, T', ',')

    copyto!(T1_d, T)

    NTHREADS = 128
    NBLOCKS = ceil(Int, (length(T)-2) / NTHREADS)

    kstep = gpu_ftcs_step!(Device())

    for j in 1:2:nsteps
        stepa = kstep(T2_d, T1_d, c, psource_offset, psource_dt;
                      ndrange=length(T)-2)
        stepb = kstep(T1_d, T2_d, c, psource_offset, psource_dt;
                      ndrange=length(T)-2, dependencies=stepa)
        wait(stepb)
        if mod(j+1, print_at_steps) == 0
            copyto!(T, T1_d)
            writedlm(outf, T', ',')
        end
    end
end

function gaussian1d(x)
    return exp(-(5.0*(x-0.5))^2)
end

function test_ka()
    CUDA.allowscalar(false)
    open("out/heat1d_ka.txt", "w") do io
        heat1d_ftcs_ka(0.001, 1.0, 0.0, 0.0, gaussian1d,
                       0.0004, 2, 2^-10, 2^13, io, 0.2, 1.0)
    end
end

test_ka()
#@time test_gpu()