From 68a28432e51e8a079509cb8503fa3a5615d9c938 Mon Sep 17 00:00:00 2001 From: Bryce Allen Date: Sun, 26 Dec 2021 17:32:02 -0500 Subject: [PATCH] add KernelAbstractions port, WIP --- heat1d_ka.jl | 147 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100755 heat1d_ka.jl diff --git a/heat1d_ka.jl b/heat1d_ka.jl new file mode 100755 index 0000000..df069b8 --- /dev/null +++ b/heat1d_ka.jl @@ -0,0 +1,147 @@ +#!/usr/bin/env julia + +using DelimitedFiles +using KernelAbstractions, Adapt, OffsetArrays + +const BACKEND = :CUDA + +if BACKEND == :CUDA + using CUDA, CUDAKernels + const ArrayT = CuArray + const Device = CUDADevice +elseif BACKEND == :AMD + using AMDGPU, ROCMKernels + const ArrayT = CuArray + const Device = CUDADevice +else BACKEND == :CPU + const ArrayT = Array + const Device = CPU +end + +function heat1d_ftcs_cpu(diffusivity, xlength, left_boundary, + right_boundary, init_fn, + dt, nsteps, dx, print_at_steps, outf, + psource_x, psource_value) + npoints = ceil(Int, xlength / dx) + 1 + T1 = zeros(Float32, npoints) + T2 = similar(T1) + xs = range(0.0, xlength, length=npoints) + + c = diffusivity * dt / dx^2 + psource_offset = ceil(Int, psource_x / xlength * npoints) + psource_dt = psource_value * dt + + if c > 0.5 + println("c = ", c) + return -1 + end + + T1[1] = left_boundary + T1[end] = right_boundary + T2[1] = left_boundary + T2[end] = right_boundary + + T1[2:end-1] = init_fn.(xs[2:end-1]) + + # first line of output is x values, for plotting + writedlm(outf, xs', ',') + + # output initial condition state + writedlm(outf, T1', ',') + + for j in 1:2:nsteps + for i in 2:npoints-1 + T2[i] = T1[i] + c * (T1[i+1] - 2.0 * T1[i] + T1[i-1]) + if i == psource_offset + + T2[i] += psource_dt + end + end + for i in 2:npoints-1 + T1[i] = T2[i] + c * (T2[i+1] - 2.0 * T2[i] + T2[i-1]) + if i == psource_offset + T1[i] += psource_dt + end + end + if mod(j+1, print_at_steps) == 0 + writedlm(outf, T1', ',') + end + end +end + +@kernel function gpu_ftcs_step!(T, Told, c, psource_offset, psource_dt) + i = @index(Global) + 1 + #start_idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x + 1 + #stride = gridDim().x * blockDim().x + @inbounds Ti = Told[i] + c * (Told[i+1] - 2.0 * Told[i] + Told[i-1]) + if i == psource_offset + Ti += psource_dt + end + T[i] = Ti +end + +function heat1d_ftcs_ka(diffusivity, xlength, left_boundary, + right_boundary, init_fn, + dt, nsteps, dx, print_at_steps, outf, + psource_x, psource_value) + npoints = Int(ceil(xlength / dx)) + 1 + T = zeros(Float32, npoints) + xs = range(0.0, xlength, length=npoints) + + T1_d = CuArray{Float32}(undef, length(T)) + T2_d = similar(T1_d) + + c = diffusivity * dt / dx^2 + psource_offset = ceil(Int, psource_x / xlength * npoints) + psource_dt = psource_value * dt + + if c > 0.5 + println("c = ", c) + return -1 + end + + T[1] = left_boundary + T[end] = right_boundary + + T[2:end-1] = init_fn.(xs[2:end-1]) + + # first line of output is x values, for plotting + writedlm(outf, xs', ',') + + # output initial condition state + writedlm(outf, T', ',') + + copyto!(T1_d, T) + + NTHREADS = 128 + NBLOCKS = ceil(Int, (length(T)-2) / NTHREADS) + + kstep = gpu_ftcs_step!(Device()) + + for j in 1:2:nsteps + stepa = kstep(T2_d, T1_d, c, psource_offset, psource_dt; + ndrange=length(T)-2) + stepb = kstep(T1_d, T2_d, c, psource_offset, psource_dt; + ndrange=length(T)-2, dependencies=stepa) + wait(stepb) + if mod(j+1, print_at_steps) == 0 + copyto!(T, T1_d) + writedlm(outf, T', ',') + end + end +end + +function gaussian1d(x) + return exp(-(5.0*(x-0.5))^2) +end + +function test_ka() + CUDA.allowscalar(false) + open("out/heat1d_ka.txt", "w") do io + heat1d_ftcs_ka(0.001, 1.0, 0.0, 0.0, gaussian1d, + 0.0004, 2, 2^-10, 2^13, io, 0.2, 1.0) + end +end + +test_ka() +#@time test_gpu()