using CUDA,LatticeGPU, TimerOutputs

#Check that Dw ( (DwdagDw)^{-1} g5 Dw g5 ) psi = psi for random fields

@timeit "Rand solver test" begin

function pfrandomize!(f,lp::SpaceParm,dpar::DiracParam)

    if dpar.rep == SU3fund && lp.ndim == 4
        begin
            p = ntuple(i->CUDA.randn(Float64, lp.bsz, 3, lp.rsz,2),4)
            CUDA.@sync begin
                CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_assign_pf!(f,p,lp)
            end
        end
    end

    return nothing

end
function krnl_assign_pf!(f::AbstractArray{T}, p ,lp::SpaceParm) where {T} 
    
    @inbounds begin
        b = Int64(CUDA.threadIdx().x)
        r = Int64(CUDA.blockIdx().x)

            f[b,r] = Spinor(map(x->SU3fund(x[b,1,r,1] + im* x[b,1,r,2],
                                           x[b,2,r,1] + im* x[b,2,r,2],
                                           x[b,3,r,1] + im* x[b,3,r,2]),p))
    end
    return nothing

end

@timeit "Generate random fields" begin

lp = SpaceParm{4}((16,16,16,16), (4,4,4,4), 0, (0,0,0,0,0,0))
gp = GaugeParm{Float64}(SU3{Float64}, 6.0, 1.0)
ymws = YMworkspace(SU3, Float64, lp)
dpar = DiracParam{Float64}(SU3fund,2.3,0.0,(1.0,1.0,1.0,1.0),0.0)
dws = DiracWorkspace(SU3fund{Float64},Float64,lp);


randomize!(ymws.mom, lp, ymws)
U = exp.(ymws.mom)


rpsi = scalar_field(Spinor{4,SU3fund{Float64}},lp)
pfrandomize!(rpsi,lp,dpar)

end

prop = scalar_field(Spinor{4,SU3fund{Float64}},lp)

function krnlg5!(src)
    b=Int64(CUDA.threadIdx().x)
    r=Int64(CUDA.blockIdx().x)
    src[b,r] = dmul(Gamma{5},src[b,r])
    return nothing
end

CUDA.@sync begin
        CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnlg5!(rpsi)
end
g5Dw!(prop,U,rpsi,dpar,lp)
CG!(prop,U,DwdagDw!,dpar,lp,dws,ymws,gp,10000,1.0e-14)

Dw!(dws.sp,U,prop,dpar,lp)

CUDA.@sync begin
    CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnlg5!(rpsi)
end

res = sum(norm2.(rpsi-dws.sp))


if res < 1.0e-15 
    print("Drand test passed with ",res,"% error!\n")
    
else
    error("Drand test failed with difference: ",res,"\n")
end

end