using LatticeGPU using TOML using TimerOutputs using ArgParse using CUDA function load_fields() global U = vector_field(SU3{Float64}, lp) fill!(U,one(SU3{Float64})); global U_CPU = Array(U) global psi = scalar_field(Spinor{4,SU3fund{Float64,Nder}},lp) fill!(psi,zero(Spinor{4, SU3fund{Float64, Nder}})) global psi_CPU = Array(psi) global psit_CPU = Array(psi) global pp_density = Array{Series{Float64,Nder}}(undef,lp.bsz,lp.rsz); global ap_density = Array{Series{Complex{Float64},Nder}}(undef,lp.bsz,lp.rsz); global complex_scalar = scalar_field_point(Series{ComplexF64, Nder}, lp) global real_scalar = scalar_field_point(Series{Float64, Nder}, lp) # Obs ending in 0 are only relevant if tzero > 0 global pp_corr_t0 = fill(zero(Series{Float64,Nder}),(lp.iL[4],params["Frontflow"]["N_noise"],length(dpar))); global ap_corr_t0 = fill(zero(Series{Complex{Float64},Nder}),(lp.iL[4],params["Frontflow"]["N_noise"],length(dpar))); global pphat_t0 = Array{Series{Float64,Nder}}(undef,lp.iL[4],params["Frontflow"]["N_noise"],length(dpar)); global pptilde_t0 = Array{Series{Complex{Float64},Nder}}(undef,lp.iL[4],params["Frontflow"]["N_noise"],length(dpar)); global Qt0 = Array{Complex{Float64}}(undef,lp.iL[4]); global Eoft0 = Array{Complex{Float64}}(undef,lp.iL[4]); # Frontflow global pp_corr_t = fill(zero(Series{Float64,Nder}),(lp.iL[4],params["Frontflow"]["N_noise"],params["Frontflow"]["nsteps"] + 1,length(dpar))); global ap_corr_t = fill(zero(Series{Complex{Float64},Nder}),(lp.iL[4],params["Frontflow"]["N_noise"],params["Frontflow"]["nsteps"] + 1,length(dpar))); global pp_corr_tfl = fill(zero(Series{Float64,Nder}),(lp.iL[4],params["Backflow"]["N_noise"],length(flow_times),length(dpar))); global ap_corr_tfl = fill(zero(Series{Complex{Float64},Nder}),(lp.iL[4],params["Backflow"]["N_noise"],length(flow_times),length(dpar))); global pphat_t = Array{Series{Float64,Nder}}(undef,lp.iL[4],params["Frontflow"]["N_noise"],params["Frontflow"]["nsteps"] + 1,length(dpar)); global pptilde_t = Array{Series{Complex{Float64},Nder}}(undef,lp.iL[4],params["Frontflow"]["N_noise"],params["Frontflow"]["nsteps"] + 1,length(dpar)); global Qteuc = Array{Float64}(undef,lp.iL[4]); global Qt = Array{Complex{Float64}}(undef,params["Frontflow"]["nsteps"] + 1,lp.iL[4]); global Eoft = Array{Complex{Float64}}(undef,params["Frontflow"]["nsteps"] + 1,lp.iL[4]); global Eofpla = Array{Complex{Float64}}(undef,lp.iL[4],lp.npls); # Backflow global Quark_cond = Array{Series{Complex{Float64},Nder}}(undef,lp.iL[4],params["Backflow"]["N_noise"],length(flow_times),length(dpar)); global Quark_cond_cfl = Array{Series{Complex{Float64},Nder}}(undef,lp.iL[4],params["Backflow"]["N_noise"],length(flow_times),length(dpar)); global Quark_cond2 = Array{Series{Complex{Float64},Nder}}(undef,lp.iL[4],params["Backflow"]["N_noise"],length(flow_times),length(dpar)); global Quark_cond2_cfl = Array{Series{Complex{Float64},Nder}}(undef,lp.iL[4],params["Backflow"]["N_noise"],length(flow_times),length(dpar)); global ChiDchi = Array{Series{Complex{Float64},Nder}}(undef,lp.iL[4],params["Backflow"]["N_noise"],length(flow_times),length(dpar)); global ChiDchi_cfl = Array{Series{Complex{Float64},Nder}}(undef,lp.iL[4],params["Backflow"]["N_noise"],length(flow_times),length(dpar)); # Sfcf fields if params["sfcf"]["meas"] global A11 = Array(psi) global A21 = Array(psi) global A31 = Array(psi) global A12 = Array(psi) global A22 = Array(psi) global A32 = Array(psi) global fp_sf = Array{Series{Float64,Nder}}(undef,lp.iL[4],length(dpar)); global fa_sf = Array{Series{ComplexF64,Nder}}(undef,lp.iL[4],length(dpar)); global kv_sf = Array{Series{ComplexF64,Nder}}(undef,lp.iL[4],length(dpar)); global kt_sf = Array{Series{ComplexF64,Nder}}(undef,lp.iL[4],length(dpar)); global f1_sf = Array{Series{Float64,Nder}}(undef,length(dpar)); global k1_sf = Array{Series{ComplexF64,Nder}}(undef,length(dpar)); end return nothing end """ function two_pt() Computes the correlation functions involving Frontflow of fermionic fields. Stores the two point function in the variables 'pp_corr_t0', 'ap_corr_t0', 'pp_corr_t' and 'ap_corr_t' """ function Frontflow_pt() # Will be Frontflow for f in 1:length(dpar) println(log_file,"\nMeasuring 2pt for Fermion"*fernames[f]*"...") flush(log_file) Eoft_plaq(Eofpla,U, gp, lp, ymws) Eoft0 .= sum(Eofpla,dims = 2) Qtop(Qteuc,U, gp, lp, ymws) Qt0 .= Qteuc for noi in 1:params["Frontflow"]["N_noise"] niter = propagator!(psi,U, dpar[f], dws, lp,params["Fermion"*fernames[f]]["maxiter"], params["Fermion"*fernames[f]]["tolerance"],params["Frontflow"]["tsource"]) println(log_file,"CG converged after ",niter," iterations with residue ",CUDA.mapreduce(x -> norm2(x), +, dws.sr)) flush(log_file) pp_corr_t0[:,noi,f] .= zero(Series{Float64,Nder}) ap_corr_t0[:,noi,f] .= zero(Series{ComplexF64,Nder}) @timeit "Volume sum" begin pp_corr_t0[:,noi,f] .= scalar_contraction(psi) ap_corr_t0[:,noi,f] .= gammazero_contraction(psi) end if params["Frontflow"]["t_zero"]>0.0 _,epslist = flw_adapt(U, psi, int, params["Frontflow"]["t_zero"], gp, dpar[f], lp, ymws, dws) println(log_file,"Flowed correlator to t = ",params["Frontflow"]["t_zero"]," with last stepsize ",epslist[end-1]) flush(log_file) end for tstep in 1:params["Frontflow"]["nsteps"] if noi ==1 Eoft_plaq(Eofpla,U, gp, lp, ymws) Eoft[tstep,:] .= sum(Eofpla,dims = 2) Qtop(Qteuc, U, gp, lp, ymws) Qt[tstep,:] .= Qteuc end pp_corr_t[:,noi,tstep,f] .= zero(Series{Float64,Nder}) ap_corr_t[:,noi,tstep,f] .= zero(Series{ComplexF64,Nder}) @timeit "Volume sum" begin pp_corr_t[:,noi,tstep,f] .= scalar_contraction(psi) ap_corr_t[:,noi,tstep,f] .= gammazero_contraction(psi) end flw(U, psi, int, 1, params["Frontflow"]["epsilon"], gp, dpar[f], lp, ymws, dws) end if noi ==1 Eoft_plaq(Eofpla,U, gp, lp, ymws) Eoft[end,:] .= sum(Eofpla,dims = 2) Qtop(Qteuc, U, gp, lp, ymws) Qt[end,:] .= Qteuc end pp_corr_t[:,noi,end,f] .= zero(Series{Float64,Nder}) ap_corr_t[:,noi,end,f] .= zero(Series{ComplexF64,Nder}) @timeit "Volume sum" begin pp_corr_t[:,noi,end,f] .= scalar_contraction(psi) ap_corr_t[:,noi,end,f] .= gammazero_contraction(psi) end @timeit "CPU to GPU" copyto!(U,U_CPU) end end pp_corr_t0 .= pp_corr_t0./prod(lp.iL[1:3]) ap_corr_t0 .= ap_corr_t0./prod(lp.iL[1:3]) pp_corr_t .= pp_corr_t./prod(lp.iL[1:3]) ap_corr_t .= ap_corr_t./prod(lp.iL[1:3]) println(log_file,"Finished measuring 2pt") flush(log_file) return nothing end """ function Backflow_pt() Computes the correlation functions involving Backflow of fermionic fields. Stores the one point function in the variable 'Quark_cond' and 'Quark_cond_cfl'. The final result for the quark condensate is 'Quark_cond' + C_fl*'Quark_cond_cfl'. The two point function with source in possitive flow time is also stored in 'pp_corr_tfl' and 'ap_corr_tfl'. The correlation function 'ChiDchi' is also updated, as well as a second computation of the condensate. """ function Backflow_pt() # Will be backflow @timeit "CPU to GPU" copyto!(U,U_CPU) for f in 1:length(dpar) for fl in 1:length(flow_times) println(log_file,"\nMeasuring 1pt at t = "*string(flow_times[fl])*" for Fermion"*fernames[f]*"...") flush(log_file) for noi in 1:params["Backflow"]["N_noise"] ## The change in the field before randomize should be moved to the package psi .= 0.0*psi pfrandomize!(psi,lp,params["Backflow"]["tsource"]) @timeit "GPU to CPU" psit_CPU .= Array(psi) backflow(psi, U, flow_times[fl], params["Backflow"]["Nsaves"] , gp, dpar[f], lp, ymws, dws) @timeit "GPU to CPU" psi_CPU .= Array(psi) @timeit "Inversion" begin dws.sp .= dmul.(Gamma{5},psi) g5Dw!(psi,U,dws.sp,dpar[f],dws,lp) niter = CG!(psi,U,DwdagDw!,dpar[f],lp,dws,params["Fermion"*fernames[f]]["maxiter"], params["Fermion"*fernames[f]]["tolerance"]) end println(log_file,"CG converged after ",niter," iterations with residue ",CUDA.mapreduce(x -> norm2(x), +, dws.sr)) flush(log_file) @timeit "CPU to GPU" copyto!(dws.st,psi_CPU) Quark_cond_cfl[:,noi,fl,f] .= zero(Series{ComplexF64,Nder}) ap_density .= Array(norm2.(dws.st)) @timeit "Volume sum" for b in 1:lp.bsz for r in 1:lp.rsz t = point_time((b,r),lp) Quark_cond_cfl[t,noi,fl,f] += -ap_density[b,r] end end Quark_cond[:,noi,fl,f] .= zero(Series{ComplexF64,Nder}) @timeit "Volume sum" begin Quark_cond[:,noi,fl,f] .= dot_contraction(dws.st,psi) end pp_corr_tfl[:,noi,fl,f] .= zero(Series{Float64,Nder}) ap_corr_tfl[:,noi,fl,f] .= zero(Series{ComplexF64,Nder}) @timeit "Volume sum" begin pp_corr_tfl[:,noi,fl,f] .= scalar_contraction(psi) ap_corr_tfl[:,noi,fl,f] .= gammazero_contraction(psi) end @timeit "CPU to GPU" copyto!(U,U_CPU) @timeit "ChiDchi" begin flw_adapt(U, psi, int, flow_times[fl], gp, dpar[f], lp, ymws, dws) Dw!(dws.sp,U,psi,dpar[f],dws,lp) @timeit "CPU to GPU" copyto!(dws.st,psit_CPU) ChiDchi[:,noi,fl,f] .= zero(Series{Complex{Float64},Nder}) ap_density .= Array(dot.(dws.st,dws.sp)) @timeit "Volume sum" for b in 1:lp.bsz for r in 1:lp.rsz t = point_time((b,r),lp) ChiDchi[t,noi,fl,f] += ap_density[b,r] end end Dw!(dws.sp,U,dws.st,dpar[f],dws,lp) ap_density .= Array(dot.(dws.sp,psi)) @timeit "Volume sum" for b in 1:lp.bsz for r in 1:lp.rsz t = point_time((b,r),lp) ChiDchi[t,noi,fl,f] += -ap_density[b,r] end end Quark_cond2[:,noi,fl,f] .= zero(Series{ComplexF64,Nder}) ap_density .= Array(dot.(dws.st,psi)) @timeit "Volume sum" for b in 1:lp.bsz for r in 1:lp.rsz t = point_time((b,r),lp) Quark_cond2[t,noi,fl,f] += ap_density[b,r] end end @timeit "CPU to GPU" copyto!(U,U_CPU) @timeit "CPU to GPU" copyto!(psi,psi_CPU) flw_adapt(U, psi, int, flow_times[fl], gp, dpar[f], lp, ymws, dws) Dw!(dws.sp,U,psi,dpar[f],dws,lp) ChiDchi_cfl[:,noi,fl,f] .= zero(Series{ComplexF64,Nder}) ap_density .= Array(dot.(dws.st,dws.sp)) @timeit "Volume sum" for b in 1:lp.bsz for r in 1:lp.rsz t = point_time((b,r),lp) ChiDchi_cfl[t,noi,fl,f] += ap_density[b,r] end end Dw!(dws.sp,U,dws.st,dpar[f],dws,lp) ap_density .= Array(dot.(dws.sp,psi)) @timeit "Volume sum" for b in 1:lp.bsz for r in 1:lp.rsz t = point_time((b,r),lp) ChiDchi_cfl[t,noi,fl,f] += -ap_density[b,r] end end Quark_cond2_cfl[:,noi,fl,f] .= zero(Series{ComplexF64,Nder}) ap_density .= Array(dot.(dws.st,psi)) @timeit "Volume sum" for b in 1:lp.bsz for r in 1:lp.rsz t = point_time((b,r),lp) Quark_cond2_cfl[t,noi,fl,f] += ap_density[b,r] end end end @timeit "CPU to GPU" copyto!(U,U_CPU) end end end Quark_cond .= Quark_cond./prod(lp.iL[1:3]) Quark_cond_cfl .= Quark_cond_cfl./prod(lp.iL[1:3]) pp_corr_tfl .= pp_corr_tfl./prod(lp.iL[1:3]) ap_corr_tfl .= ap_corr_tfl./prod(lp.iL[1:3]) ChiDchi .= ChiDchi./prod(lp.iL[1:3]) Quark_cond2 .= Quark_cond2./prod(lp.iL[1:3]) ChiDchi_cfl .= ChiDchi_cfl./prod(lp.iL[1:3]) Quark_cond2_cfl .= Quark_cond2_cfl./prod(lp.iL[1:3]) println(log_file,"Finished measuring 1pt \n") return nothing end """ function two_pt_lagrange() Computes the two point functions containing Lagrange multiplier fields using front flow. The convention is such that the quark condentate is 'pptilde_t' + c_fl*'pphat_t' Stores the two point functions in the variables 'pphat_t' and 'pptilde_t'. """ function Two_pt_lagrange() # Will be 2pt lagrange mult @timeit "CPU to GPU" copyto!(U,U_CPU) for f in 1:length(dpar) println(log_file,"\nMeasuring Lagrange multiplier correlators for Fermion"*fernames[f]*"...") flush(log_file) for noi in 1:params["Frontflow"]["N_noise"] pfrandomize!(psi,lp) @timeit "GPU to CPU" psi_CPU .= Array(psi) @timeit "Inversion" begin dws.sp .= dmul.(Gamma{5},psi) g5Dw!(psi,U,dws.sp,dpar[f],dws,lp) niter = CG!(psi,U,DwdagDw!,dpar[f],lp,dws,params["Fermion"*fernames[f]]["maxiter"], params["Fermion"*fernames[f]]["tolerance"]) end println(log_file,"CG converged after ",niter," iterations with residue ",CUDA.mapreduce(x -> norm2(x), +, dws.sr)) flush(log_file) pphat_t0[:,noi,f] .= zero(Series{Float64,Nder}) pptilde_t0[:,noi,f] .= zero(Series{ComplexF64,Nder}) @timeit "Volume sum" begin pphat_t0[:,noi,f] .= -scalar_contraction(psi) pptilde_t0[:,noi,f] .= dot_contraction(dws.st,psi) end _,epslist = flw_adapt(U, psi, int, params["Frontflow"]["t_zero"], gp, dpar[f], lp, ymws, dws) @timeit "GPU to CPU" psit_CPU .= Array(psi) @timeit "CPU to GPU" copyto!(U,U_CPU) @timeit "CPU to GPU" copyto!(psi,psi_CPU) if params["Frontflow"]["t_zero"]>0.0 _,epslist = flw_adapt(U, psi, int, params["Frontflow"]["t_zero"], gp, dpar[f], lp, ymws, dws) println(log_file,"Flowed correlator to t = ",params["Frontflow"]["t_zero"]," with last stepsize ",epslist[end-1]) flush(log_file) end @timeit "CPU to GPU" copyto!(dws.st,psit_CPU) for fl in 1:params["Frontflow"]["nsteps"] pphat_t[:,noi,fl,f] .= zero(Series{Float64,Nder}) pptilde_t[:,noi,fl,f] .= zero(Series{ComplexF64,Nder}) @timeit "Volume sum" begin pphat_t[:,noi,fl,f] .= -scalar_contraction(psi) pptilde_t[:,noi,fl,f] .= dot_contraction(dws.st,psi) end ymws.U1 .= U flw(U, psi, int, 1, params["Frontflow"]["epsilon"], gp, dpar[f], lp, ymws, dws) U .= ymws.U1 flw(U, dws.st, int, 1, params["Frontflow"]["epsilon"], gp, dpar[f], lp, ymws, dws) end pphat_t[:,noi,end,f] .= zero(Series{Float64,Nder}) pptilde_t[:,noi,end,f] .= zero(Series{ComplexF64,Nder}) @timeit "Volume sum" begin pphat_t[:,noi,end,f] .= -scalar_contraction(psi) pptilde_t[:,noi,end,f] .= dot_contraction(dws.st,psi) end @timeit "CPU to GPU" copyto!(U,U_CPU) end end pphat_t0 .= pphat_t0./prod(lp.iL[1:3]) pptilde_t0 .= pptilde_t0./prod(lp.iL[1:3]) pphat_t .= pphat_t./prod(lp.iL[1:3]) pptilde_t .= pptilde_t./prod(lp.iL[1:3]) println(log_file,"Finished measuring Lagrange multiplier correlators") flush(log_file) return nothing end function krnl_scalar_contraction!(rm, psi, lp) @inbounds begin b = Int64(CUDA.threadIdx().x) r = Int64(CUDA.blockIdx().x) I = point_coord((b,r), lp) rm[I] = ctoreal(norm2(psi[b,r])) end return nothing end function scalar_contraction(psi) CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_scalar_contraction!(real_scalar, psi, lp) end return reshape(Array(CUDA.reduce(+, real_scalar; dims=(1,2,3))),lp.iL[end]) end function krnl_dot_contraction!(cm,st, psi, lp) @inbounds begin b = Int64(CUDA.threadIdx().x) r = Int64(CUDA.blockIdx().x) I = point_coord((b,r), lp) cm[I] = dot(st[b,r],psi[b,r]) end return nothing end function dot_contraction(st,psi) CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_dot_contraction!(complex_scalar,st, psi, lp) end return reshape(Array(CUDA.reduce(+, complex_scalar; dims=(1,2,3))),lp.iL[end]) end function krnl_gammazero_contraction!(cm, psi, lp) @inbounds begin b = Int64(CUDA.threadIdx().x) r = Int64(CUDA.blockIdx().x) I = point_coord((b,r), lp) cm[I] = dot(psi[b,r],dmul(Gamma{4},psi[b,r])) end return nothing end function gammazero_contraction(psi) CUDA.@sync begin CUDA.@cuda threads=lp.bsz blocks=lp.rsz krnl_gammazero_contraction!(complex_scalar, psi, lp) end return reshape(Array(CUDA.reduce(+, complex_scalar; dims=(1,2,3))),lp.iL[end]) end