From 68a28432e51e8a079509cb8503fa3a5615d9c938 Mon Sep 17 00:00:00 2001
From: Bryce Allen <bdallen@uchicago.edu>
Date: Sun, 26 Dec 2021 17:32:02 -0500
Subject: [PATCH] add KernelAbstractions port, WIP

---
 heat1d_ka.jl | 147 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)
 create mode 100755 heat1d_ka.jl

diff --git a/heat1d_ka.jl b/heat1d_ka.jl
new file mode 100755
index 0000000..df069b8
--- /dev/null
+++ b/heat1d_ka.jl
@@ -0,0 +1,147 @@
+#!/usr/bin/env julia
+
+using DelimitedFiles
+using KernelAbstractions, Adapt, OffsetArrays
+
+const BACKEND = :CUDA
+
+if BACKEND == :CUDA
+    using CUDA, CUDAKernels
+    const ArrayT = CuArray
+    const Device = CUDADevice
+elseif BACKEND == :AMD
+    using AMDGPU, ROCMKernels
+    const ArrayT = CuArray
+    const Device = CUDADevice
+else BACKEND == :CPU
+    const ArrayT = Array
+    const Device = CPU
+end
+
+function heat1d_ftcs_cpu(diffusivity, xlength, left_boundary,
+                         right_boundary, init_fn,
+                         dt, nsteps, dx, print_at_steps, outf,
+                         psource_x, psource_value)
+    npoints = ceil(Int, xlength / dx) + 1
+    T1 = zeros(Float32, npoints)
+    T2 = similar(T1)
+    xs = range(0.0, xlength, length=npoints)
+
+    c = diffusivity * dt / dx^2
+    psource_offset = ceil(Int, psource_x / xlength * npoints)
+    psource_dt = psource_value * dt
+
+    if c > 0.5
+        println("c = ", c)
+        return -1
+    end
+
+    T1[1] = left_boundary
+    T1[end] = right_boundary
+    T2[1] = left_boundary
+    T2[end] = right_boundary
+
+    T1[2:end-1] = init_fn.(xs[2:end-1])
+
+    # first line of output is x values, for plotting
+    writedlm(outf, xs', ',')
+
+    # output initial condition state
+    writedlm(outf, T1', ',')
+
+    for j in 1:2:nsteps
+        for i in 2:npoints-1
+            T2[i] = T1[i] + c * (T1[i+1] - 2.0 * T1[i] + T1[i-1])
+            if i == psource_offset
+
+                T2[i] += psource_dt
+            end
+        end
+        for i in 2:npoints-1
+            T1[i] = T2[i] + c * (T2[i+1] - 2.0 * T2[i] + T2[i-1])
+            if i == psource_offset
+                T1[i] += psource_dt
+            end
+        end
+        if mod(j+1, print_at_steps) == 0
+            writedlm(outf, T1', ',')
+        end
+    end
+end
+
+@kernel function gpu_ftcs_step!(T, Told, c, psource_offset, psource_dt)
+    i = @index(Global) + 1
+    #start_idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x + 1
+    #stride = gridDim().x * blockDim().x
+    @inbounds Ti = Told[i] + c * (Told[i+1] - 2.0 * Told[i] + Told[i-1])
+    if i == psource_offset
+        Ti += psource_dt
+    end
+    T[i] = Ti
+end
+
+function heat1d_ftcs_ka(diffusivity, xlength, left_boundary,
+                        right_boundary, init_fn,
+                        dt, nsteps, dx, print_at_steps, outf,
+                        psource_x, psource_value)
+    npoints = Int(ceil(xlength / dx)) + 1
+    T = zeros(Float32, npoints)
+    xs = range(0.0, xlength, length=npoints)
+
+    T1_d = CuArray{Float32}(undef, length(T))
+    T2_d = similar(T1_d)
+
+    c = diffusivity * dt / dx^2
+    psource_offset = ceil(Int, psource_x / xlength * npoints)
+    psource_dt = psource_value * dt
+
+    if c > 0.5
+        println("c = ", c)
+        return -1
+    end
+
+    T[1] = left_boundary
+    T[end] = right_boundary
+
+    T[2:end-1] = init_fn.(xs[2:end-1])
+
+    # first line of output is x values, for plotting
+    writedlm(outf, xs', ',')
+
+    # output initial condition state
+    writedlm(outf, T', ',')
+
+    copyto!(T1_d, T)
+
+    NTHREADS = 128
+    NBLOCKS = ceil(Int, (length(T)-2) / NTHREADS)
+
+    kstep = gpu_ftcs_step!(Device())
+
+    for j in 1:2:nsteps
+        stepa = kstep(T2_d, T1_d, c, psource_offset, psource_dt;
+                      ndrange=length(T)-2)
+        stepb = kstep(T1_d, T2_d, c, psource_offset, psource_dt;
+                      ndrange=length(T)-2, dependencies=stepa)
+        wait(stepb)
+        if mod(j+1, print_at_steps) == 0
+            copyto!(T, T1_d)
+            writedlm(outf, T', ',')
+        end
+    end
+end
+
+function gaussian1d(x)
+    return exp(-(5.0*(x-0.5))^2)
+end
+
+function test_ka()
+    CUDA.allowscalar(false)
+    open("out/heat1d_ka.txt", "w") do io
+        heat1d_ftcs_ka(0.001, 1.0, 0.0, 0.0, gaussian1d,
+                       0.0004, 2, 2^-10, 2^13, io, 0.2, 1.0)
+    end
+end
+
+test_ka()
+#@time test_gpu()