#!/usr/bin/env julia using DelimitedFiles using KernelAbstractions, Adapt, OffsetArrays const BACKEND = :CUDA if BACKEND == :CUDA using CUDA, CUDAKernels const ArrayT = CuArray const Device = CUDADevice elseif BACKEND == :AMD using AMDGPU, ROCMKernels const ArrayT = CuArray const Device = CUDADevice else BACKEND == :CPU const ArrayT = Array const Device = CPU end function heat1d_ftcs_cpu(diffusivity, xlength, left_boundary, right_boundary, init_fn, dt, nsteps, dx, print_at_steps, outf, psource_x, psource_value) npoints = ceil(Int, xlength / dx) + 1 T1 = zeros(Float32, npoints) T2 = similar(T1) xs = range(0.0, xlength, length=npoints) c = diffusivity * dt / dx^2 psource_offset = ceil(Int, psource_x / xlength * npoints) psource_dt = psource_value * dt if c > 0.5 println("c = ", c) return -1 end T1[1] = left_boundary T1[end] = right_boundary T2[1] = left_boundary T2[end] = right_boundary T1[2:end-1] = init_fn.(xs[2:end-1]) # first line of output is x values, for plotting writedlm(outf, xs', ',') # output initial condition state writedlm(outf, T1', ',') for j in 1:2:nsteps for i in 2:npoints-1 T2[i] = T1[i] + c * (T1[i+1] - 2.0 * T1[i] + T1[i-1]) if i == psource_offset T2[i] += psource_dt end end for i in 2:npoints-1 T1[i] = T2[i] + c * (T2[i+1] - 2.0 * T2[i] + T2[i-1]) if i == psource_offset T1[i] += psource_dt end end if mod(j+1, print_at_steps) == 0 writedlm(outf, T1', ',') end end end @kernel function gpu_ftcs_step!(T, Told, c, psource_offset, psource_dt) i = @index(Global) + 1 #start_idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x + 1 #stride = gridDim().x * blockDim().x @inbounds Ti = Told[i] + c * (Told[i+1] - 2.0 * Told[i] + Told[i-1]) if i == psource_offset Ti += psource_dt end T[i] = Ti end function heat1d_ftcs_ka(diffusivity, xlength, left_boundary, right_boundary, init_fn, dt, nsteps, dx, print_at_steps, outf, psource_x, psource_value) npoints = Int(ceil(xlength / dx)) + 1 T = zeros(Float32, npoints) xs = range(0.0, xlength, length=npoints) T1_d = CuArray{Float32}(undef, length(T)) T2_d = similar(T1_d) c = diffusivity * dt / dx^2 psource_offset = ceil(Int, psource_x / xlength * npoints) psource_dt = psource_value * dt if c > 0.5 println("c = ", c) return -1 end T[1] = left_boundary T[end] = right_boundary T[2:end-1] = init_fn.(xs[2:end-1]) # first line of output is x values, for plotting writedlm(outf, xs', ',') # output initial condition state writedlm(outf, T', ',') copyto!(T1_d, T) NTHREADS = 128 NBLOCKS = ceil(Int, (length(T)-2) / NTHREADS) kstep = gpu_ftcs_step!(Device()) for j in 1:2:nsteps stepa = kstep(T2_d, T1_d, c, psource_offset, psource_dt; ndrange=length(T)-2) stepb = kstep(T1_d, T2_d, c, psource_offset, psource_dt; ndrange=length(T)-2, dependencies=stepa) wait(stepb) if mod(j+1, print_at_steps) == 0 copyto!(T, T1_d) writedlm(outf, T', ',') end end end function gaussian1d(x) return exp(-(5.0*(x-0.5))^2) end function test_ka() CUDA.allowscalar(false) open("out/heat1d_ka.txt", "w") do io heat1d_ftcs_ka(0.001, 1.0, 0.0, 0.0, gaussian1d, 0.0004, 2, 2^-10, 2^13, io, 0.2, 1.0) end end test_ka() #@time test_gpu()