Commit 349ff240 authored by Nicolas Lang's avatar Nicolas Lang

Merge branch 'master' of igit.ific.uv.es:fernando.p.csic.es/latticegpu.jl into fix/flow_obc

parents 094390c1 1eb9d5a8
Showing with 585 additions and 165 deletions
......@@ -20,6 +20,16 @@ The workspace stores four fermion fields, namely `.sr`, `.sp`, `.sAp` and `.st`,
for different purposes. If the representation is either `SU2fund` or `SU3fund`, an extra
field with values in `U2alg`/`U3alg` is created to store the clover, used for the improvement.
The functions using the fields allocated in [`DiracWorkspace`](@ref) are the following:
- `dws.sr` : [`CG!`](@ref), [`flw_adapt`](@ref) (fermion case), [`bfl_error`](@ref)
- `dws.st` : [`DwdagDw!`](@ref), [`bflw_step_vec!`](@ref)
- `dws.sp` : [`CG!`](@ref), [`flw`](@ref) (fermion case), [`bflw_step!`](@ref), [`bflw_step_vec!`](@ref), [`propagator!`](@ref), [`bndpropagator!`](@ref), [`Tbndpropagator!`](@ref)
- `dws.sAp` : [`CG!`](@ref), [`flw`](@ref) (fermion case), [`bflw_step!`](@ref), [`bflw_step_vec!`](@ref)
Note that other functions may call some of these functions, e.g. [`flw_adapt`](@ref) calls [`flw`](@ref), [`bflw!`](@ref) calls [`bflw_step!`](@ref), and [`propagator!`](@ref) calls [`CG!`](@ref). The fields used by the inner function are then also modified by the outer method.
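For orientation, here is a minimal sketch of allocating the workspace and issuing one of the calls above that overwrites these fields. The lattice and parameter values are illustrative only, mirroring the package tests:

```julia
using LatticeGPU, CUDA

lp   = SpaceParm{4}((8,8,8,8), (4,4,4,4), BC_PERIODIC, (0,0,0,0,0,0))
gp   = GaugeParm{Float64}(SU3{Float64}, 6.0, 1.0)
dpar = DiracParam{Float64}(SU3fund, 1.3, 0.9, (1.0,1.0,1.0,1.0), 0.0, 0.0)
ymws = YMworkspace(SU3, Float64, lp)
dws  = DiracWorkspace(SU3fund, Float64, lp)   # allocates dws.sr, dws.sp, dws.sAp, dws.st

randomize!(ymws.mom, lp, ymws)
U   = exp.(ymws.mom)                          # random gauge configuration
psi = scalar_field(Spinor{4,SU3fund{Float64}}, lp)
pfrandomize!(psi, lp)

# CG! overwrites dws.sr, dws.sp and dws.sAp internally
CG!(psi, U, DwdagDw!, dpar, lp, dws, 10000, 1.0e-14)
```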
## Functions
The functions [`Dw!`](@ref), [`g5Dw!`](@ref) and [`DwdagDw!`](@ref) are all related to the
......
......@@ -151,16 +151,17 @@ function krnl_assign_pf_su3!(f::AbstractArray, p , lp::SpaceParm, t::Int64)
b = Int64(CUDA.threadIdx().x)
r = Int64(CUDA.blockIdx().x)
    if t == 0
        f[b,r] = Spinor(map(x->SU3fund(x[b,1,r,1] + im* x[b,1,r,2],
                                       x[b,2,r,1] + im* x[b,2,r,2],
                                       x[b,3,r,1] + im* x[b,3,r,2]),p))
    elseif point_time((b,r),lp) == t
        f[b,r] = Spinor(map(x->SU3fund(x[b,1,r,1] + im* x[b,1,r,2],
                                       x[b,2,r,1] + im* x[b,2,r,2],
                                       x[b,3,r,1] + im* x[b,3,r,2]),p))
    else
        f[b,r] = 0.0*f[b,r]
    end
end
return nothing
......@@ -197,14 +198,15 @@ function krnl_assign_pf_su2!(f::AbstractArray, p , lp::SpaceParm, t::Int64)
b = Int64(CUDA.threadIdx().x)
r = Int64(CUDA.blockIdx().x)
    if t == 0
        f[b,r] = Spinor(map(x->SU2fund(x[b,1,r,1] + im* x[b,1,r,2],
                                       x[b,2,r,1] + im* x[b,2,r,2]),p))
    elseif point_time((b,r),lp) == t
        f[b,r] = Spinor(map(x->SU2fund(x[b,1,r,1] + im* x[b,1,r,2],
                                       x[b,2,r,1] + im* x[b,2,r,2]),p))
    else
        f[b,r] = 0.0*f[b,r]
    end
end
return nothing
......
......@@ -41,13 +41,13 @@ flw(U, psi, int::FlowIntr{NI,T}, ns::Int64, gp::GaugeParm, dpar::DiracParam, lp:
"""
    function backflow(psi, U, Dt, maxnsave::Int64, gp::GaugeParm, dpar::DiracParam, lp::SpaceParm, int::FlowIntr, ymws::YMworkspace, dws::DiracWorkspace)

Performs the integration of the adjoint flow for the fermion field, according to 1302.5246. The fermion field must be that of the time-slice Dt and is flowed back to the first time-slice.
maxnsave is the maximum number of gauge fields saved in the process.
"""
function backflow(psi, U, Dt, maxnsave::Int64, gp::GaugeParm, dpar::DiracParam, lp::SpaceParm, int::FlowIntr, ymws::YMworkspace, dws::DiracWorkspace)

    # Default integrator is wfl_rk3(Float64,0.01,1.0); it has to be an order 3 RK, but it can be zfl
@timeit "Backflow integration" begin
@timeit "GPU to CPU" U0 = Array(U)
......@@ -98,6 +98,7 @@ function backflow(psi, U, Dt, maxnsave::Int64, gp::GaugeParm, dpar::DiracParam,
return nothing
end
backflow(psi, U, Dt, maxnsave::Int64, gp::GaugeParm, dpar::DiracParam, lp::SpaceParm, ymws::YMworkspace, dws::DiracWorkspace) = backflow(psi, U, Dt, maxnsave, gp, dpar, lp, wfl_rk3(Float64,0.01,1.0), ymws, dws)
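# Usage sketch (illustrative values): flow the fermion field back from flow
# time Dt = 0.5 keeping at most 20 gauge-field checkpoints, with either the
# default integrator or an explicit third-order RK one:
#
#   backflow(psi, U, 0.5, 20, gp, dpar, lp, ymws, dws)
#   backflow(psi, U, 0.5, 20, gp, dpar, lp, wfl_rk3(Float64,0.01,1.0), ymws, dws)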
"""
    function bflw_step!(psi, U, eps, int::FlowIntr, gp::GaugeParm, dpar::DiracParam, lp::SpaceParm, ymws::YMworkspace, dws::DiracWorkspace)
......@@ -108,8 +109,7 @@ function bflw_step!(psi, U, eps, int::FlowIntr, gp::GaugeParm, dpar::DiracParam
@timeit "Backflow step" begin
@timeit "GPU to CPU" V = Array(U)
force_gauge(ymws, U, int.c0, 1, gp, lp)
......@@ -131,7 +131,7 @@ function bflw_step!(psi, U, eps, int::FlowIntr, gp::GaugeParm, dpar::DiracParam
Nablanabla!(dws.sp, U, 0.75*2*eps*psi, dpar, dws, lp)
@timeit "CPU to GPU" copyto!(U,V)
force_gauge(ymws, U, int.c0, 1, gp, lp)
......@@ -144,7 +144,7 @@ function bflw_step!(psi, U, eps, int::FlowIntr, gp::GaugeParm, dpar::DiracParam
Nablanabla!(dws.sAp, U, 2*eps*dws.sp, dpar, dws, lp)
dws.sAp .= psi + (8/9)*dws.sAp
@timeit "CPU to GPU" copyto!(U,V)
Nablanabla!(psi, U, 2*eps*(dws.sAp - (8/9)*dws.sp), dpar, dws, lp)
psi .= (1/4)*psi + dws.sp + dws.sAp
......@@ -166,8 +166,9 @@ function flw_adapt(U, psi, int::FlowIntr{NI,T}, tend::T, epsini::T, gp::GaugePar
if ns > 10
flw(U, psi, int, 9, eps, gp, dpar, lp, ymws, dws)
ymws.U1 .= U
dws.sr .= psi
flw(U, psi, int, 1, eps, gp, dpar, lp, ymws, dws)
flw(ymws.U1, dws.sr, int, 2, eps/2, gp, dpar, lp, ymws, dws)
dt = dt - 10*eps
nstp = nstp + 10
......@@ -175,8 +176,10 @@ function flw_adapt(U, psi, int::FlowIntr{NI,T}, tend::T, epsini::T, gp::GaugePar
# adjust step size
ymws.U1 .= ymws.U1 ./ U
dws.sr .= dws.sr .- psi
maxd = CUDA.mapreduce(dev_one, max, ymws.U1, init=zero(tend))
pfdist = sqrt(CUDA.mapreduce(norm2, max, dws.sr, init=zero(tend)))
eps = min(int.max_eps, 2*eps, int.sft_fac*eps*(int.tol/maxd)^(one(tend)/3), int.sft_fac*eps*(int.tol/pfdist)^(one(tend)/3))
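# Step-size control: the new eps is the most conservative of the gauge-driven
# estimate (from the link distance maxd) and the fermion-driven estimate (from
# pfdist); both use the standard third-order rule sft_fac*eps*(tol/dist)^(1/3),
# so the fermion field can now also shrink the step.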
else
flw(U, psi, int, ns, eps, gp, dpar, lp, ymws, dws)
......@@ -205,7 +208,7 @@ flw_adapt(U, psi, int::FlowIntr{NI,T}, tend::T, gp::GaugeParm, dpar::DiracParam,
function Nablanabla!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D})
Computes ``\\nabla^* \\nabla`` `si` and stores it in `so`.
"""
function Nablanabla!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,BC_PERIODIC,D}) where {D}
......@@ -216,6 +219,7 @@ function Nablanabla!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::Space
end
return nothing
end
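# Usage sketch: apply the covariant Laplacian to psi, storing the result in
# the workspace field dws.sp (this is how flw/bflw_step! use it):
#
#   Nablanabla!(dws.sp, U, psi, dpar, dws, lp)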
function Nablanabla!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D},SpaceParm{4,6,BC_OPEN,D}}) where {D}
SF_bndfix!(si,lp)
@timeit "Laplacian" begin
......@@ -238,7 +242,7 @@ function krnl_Nablanabla(so, U, si, th, lp::SpaceParm{4,6,BC_OPEN,D}) where {D}
so[b,r] = -4*si[b,r]
bu1, ru1 = up((b,r), 1, lp)
bd1, rd1 = dw((b,r), 1, lp)
bu2, ru2 = up((b,r), 2, lp)
bd2, rd2 = dw((b,r), 2, lp)
......@@ -313,9 +317,29 @@ function krnl_Nablanabla(so, U, si, th, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},Sp
end
export Nablanabla!, flw, backflow, flw_adapt, bflw_step!
"""
function bfl_error(psi_t, psi_0, U, tend, int::FlowIntr, gp::GaugeParm, dpar::DiracParam, lp::SpaceParm, ymws::YMworkspace, dws::DiracWorkspace)
Estimates the error of the backflow integration of `psi_t` into `psi_0` with a random noise source.
"""
function bfl_error(psi_t, psi_0, U, tend, int::FlowIntr, gp::GaugeParm, dpar::DiracParam, lp::SpaceParm, ymws::YMworkspace, dws::DiracWorkspace)
pfrandomize!(dws.sr,lp)
@timeit "GPU to CPU" V = Array(U)
R0 = sum(dot.(psi_0,dws.sr))
flw_adapt(U, dws.sr, int, tend, int.eps_ini/2, gp, dpar, lp, ymws, dws)
R1 = sum(dot.(psi_t,dws.sr))
@timeit "CPU to GPU" copyto!(U,V)
return abs(R0-R1)
end
export bfl_error
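# Usage sketch (psi_0/psi_t are illustrative names for the field before and
# after flowing to flow time tend = 2.0):
#
#   err = bfl_error(psi_t, psi_0, U, 2.0, wfl_rk3(Float64,0.01,1.0E-6),
#                   gp, dpar, lp, ymws, dws)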
"""
function Dslash_sq!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceParm{4,6,B,D})
......@@ -362,7 +386,6 @@ function Dslash_sq!(so, U, si, dpar::DiracParam, dws::DiracWorkspace, lp::SpaceP
return nothing
end
function krnl_g5Dslsh!(so, U, si, th, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D}
b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x)
......@@ -393,7 +416,6 @@ function krnl_g5Dslsh!(so, U, si, th, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},Spac
return nothing
end
function krnl_g5Dslsh!(so, U, si, th, lp::SpaceParm{4,6,B,D}) where {D,B}
b = Int64(CUDA.threadIdx().x); r = Int64(CUDA.blockIdx().x)
......@@ -436,8 +458,6 @@ function krnl_g5Dslsh_impr!(so, Fcsw, csw, si, lp::SpaceParm{4,6,B,D}) where {B,
return nothing
end
function krnl_g5Dslsh_impr!(so, Fcsw, csw, si, lp::Union{SpaceParm{4,6,BC_SF_ORBI,D},SpaceParm{4,6,BC_SF_AFWB,D}}) where {D}
@inbounds begin
......
......@@ -40,7 +40,7 @@ include("YM/YM.jl")
using .YM
export ztwist
export YMworkspace, GaugeParm, force0_wilson!, field, field_pln, randomize!, zero!, norm2
export force_gauge, force_gauge_flw, MD!
export gauge_action, hamiltonian, plaquette, HMC!, OMF4!
export Eoft_clover, Eoft_plaq, Qtop
export FlowIntr, wfl_euler, zfl_euler, wfl_rk2, zfl_rk2, wfl_rk3, zfl_rk3
......@@ -60,7 +60,7 @@ using .Dirac
export DiracWorkspace, DiracParam
export Dw!, g5Dw!, DwdagDw!, SF_bndfix!, Csw!, pfrandomize!, mtwmdpar
export read_prop, save_prop, read_dpar
export Nablanabla!, flw, backflow, bfl_error
include("Solvers/Solvers.jl")
using .Solvers
......
......@@ -165,7 +165,7 @@ include("YMfields.jl")
export randomize!, zero!, norm2
include("YMact.jl")
export krnl_plaq!, force_gauge, force_gauge_flw, force_wilson
include("YMhmc.jl")
export gauge_action, hamiltonian, plaquette, HMC!, MD!
......
......@@ -320,6 +320,22 @@ function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG,
return nothing
end
function bnd_rescale_flw!(frc1, lp::SpaceParm{N,M,BC_OPEN,D}) where {N,M,D}
    @inbounds begin
        b = Int64(CUDA.threadIdx().x)
        r = Int64(CUDA.blockIdx().x)

        I = point_coord((b,r), lp)
        it = I[N]

        # Double the spatial force components on the two temporal boundary
        # time-slices (it == 1 and it == lp.iL[4]), as required by the flow
        # equations with open boundary conditions.
        for id in 1:N-1
            if (it == 1) || (it == lp.iL[4])
                frc1[b,id,r] = 2*frc1[b,id,r]
            end
        end
    end

    return nothing
end
##
## SF
......@@ -874,7 +890,6 @@ function krnl_force_impr_pln!(frc1, frc2, U::AbstractArray{T}, c0, c1, Ubnd, cG,
end
##
## PERIODIC
##
......@@ -1143,6 +1158,38 @@ end
force_gauge(ymws::YMworkspace, U, c0, gp, lp) = force_gauge(ymws, U, c0, gp.cG[1], gp, lp)
force_gauge(ymws::YMworkspace, U, gp, lp) = force_gauge(ymws, U, gp.c0, gp.cG[1], gp, lp)
"""
function force_gauge_flw(ymws::YMworkspace, U, c0, cG, gp::GaugeParm, lp::SpaceParm{N,M,BC_OPEN,D})
Computes the force for the gauge flow with open boundaries. An additional factor of two at the boundaries
is included; see
M. Luescher, S. Schaefer: "Lattice QCD with open boundary conditions and twisted-mass reweighting", Comput. Phys. Commun. 184 (2013) 519,
for more details.
"""
function force_gauge_flw(ymws::YMworkspace, U, c0, cG, gp::GaugeParm, lp::SpaceParm{N,M,BC_OPEN,D}) where {N,M,D}
ztw = ztwist(gp, lp)
if abs(c0-1) < 1.0E-10
@timeit "Wilson gauge force" begin
force_pln!(ymws.frc1, ymws.frc2, U, gp.Ubnd, cG, ztw, lp::SpaceParm)
end
else
@timeit "Improved gauge force" begin
force_pln!(ymws.frc1, ymws.frc2, U, gp.Ubnd, cG, ztw, lp::SpaceParm, c0)
end
end
CUDA.@sync begin
CUDA.@cuda threads=lp.bsz blocks=lp.rsz bnd_rescale_flw!(ymws.frc1,lp::SpaceParm)
end
return nothing
end
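# Usage sketch: same calling convention as force_gauge, with the rescaled
# force left in ymws.frc1 (see the open-boundary flw method below):
#
#   force_gauge_flw(ymws, U, int.c0, 1, gp, lp)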
"""
function force_wilson(ymws::YMworkspace, U, gp::GaugeParm, lp::SpaceParm)
......
......@@ -93,7 +93,7 @@ function Base.show(io::IO, int::FlowIntr{N,T}) where {N,T}
if N == 0
println(io, " * Euler schem3")
elseif N == 1
println(io, " * One stage scheme. Coefficients3")
println(io, " * One stage scheme. Coefficients")
println(io, " stg 1: ", int.e0[1], " ", int.e1[1])
elseif N == 2
println(io, " * Two stage scheme. Coefficients:")
......@@ -201,6 +201,31 @@ function flw(U, int::FlowIntr{NI,T}, ns::Int64, eps, gp::GaugeParm, lp::SpacePar
end
flw(U, int::FlowIntr{NI,T}, ns::Int64, gp::GaugeParm, lp::SpaceParm, ymws::YMworkspace) where {NI,T} = flw(U, int, ns, int.eps, gp, lp, ymws)
function flw(U, int::FlowIntr{NI,T}, ns::Int64, eps, gp::GaugeParm, lp::SpaceParm{N,M,BC_OPEN,D}, ymws::YMworkspace) where {NI,T,N,M,D}
@timeit "Integrating flow equations" begin
for i in 1:ns
force_gauge_flw(ymws, U, int.c0, 1, gp, lp)
if int.add_zth
add_zth_term(ymws::YMworkspace, U, lp)
end
ymws.mom .= ymws.frc1
U .= expm.(U, ymws.mom, 2*eps*int.r)
for k in 1:NI
force_gauge_flw(ymws, U, int.c0, 1, gp, lp)
if int.add_zth
add_zth_term(ymws::YMworkspace, U, lp)
end
ymws.mom .= int.e0[k].*ymws.mom .+ int.e1[k].*ymws.frc1
U .= expm.(U, ymws.mom, 2*eps)
end
end
end
return nothing
end
flw(U, int::FlowIntr{NI,T}, ns::Int64, gp::GaugeParm, lp::SpaceParm{N,M,BC_OPEN,D}, ymws::YMworkspace) where {NI,T,N,M,D} = flw(U, int, ns, int.eps, gp, lp, ymws)
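# Usage sketch for open boundaries (illustrative parameters): this BC_OPEN
# method of flw dispatches on lp and uses force_gauge_flw internally.
#
#   lp  = SpaceParm{4}((8,8,8,8), (4,4,4,4), BC_OPEN, (0,0,0,0,0,0))
#   int = wfl_rk3(Float64, 0.01, 1.0)
#   flw(U, int, 10, gp, lp, ymws)        # ns = 10 steps of size int.eps
#   flw(U, int, 10, 0.02, gp, lp, ymws)  # explicit step size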
##
# Adaptive step size integrators
......@@ -320,30 +345,30 @@ Eoft_plaq(U, gp::GaugeParm{T,G,NN}, lp::SpaceParm{N,M,B,D}, ymws::YMworkspace) w
function krnl_plaq_pln!(plx, U::AbstractArray{T}, Ubnd, ztw, ipl, lp::SpaceParm{N,M,B,D}) where {T,N,M,B,D}
@inbounds begin
b = Int64(CUDA.threadIdx().x)
r = Int64(CUDA.blockIdx().x)
I = point_coord((b,r), lp)
id1, id2 = lp.plidx[ipl]
SFBC = ((B == BC_SF_AFWB) || (B == BC_SF_ORBI)) && (id1 == N)
TWP = ((I[id1]==1)&&(I[id2]==1))
bu1, ru1 = up((b, r), id1, lp)
bu2, ru2 = up((b, r), id2, lp)
if SFBC && (ru1 != r)
if SFBC && (point_time((b,r),lp) == lp.iL[end])
gt = Ubnd[id2]
else
gt = U[bu1,id2,ru1]
end
if TWP
plx[I] = ztw*tr(U[b,id1,r]*gt / (U[b,id2,r]*U[bu2,id1,ru2]))
else
plx[I] = tr(U[b,id1,r]*gt / (U[b,id2,r]*U[bu2,id1,ru2]))
end
end
end
return nothing
end
......
......@@ -92,7 +92,7 @@ end
"""
function setbndfield(U, phi, lp::SpaceParm)
Sets abelian boundary fields with phases `phi[1]` and `phi[2]` to the configuration `U` at time slice ``x_0=0``.
"""
function setbndfield(U, phi, lp::SpaceParm{N,M,B,D}) where {N,M,B,D}
......
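# Usage sketch (phi1/phi2 are illustrative names for the two abelian phases
# imposed at the x_0 = 0 boundary):
#
#   setbndfield(U, (phi1, phi2), lp)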
using LatticeGPU, Test, CUDA
T = Float64
lp = SpaceParm{4}((16,16,16,16), (4,4,4,4), BC_PERIODIC, (0,0,0,0,0,0))
gp = GaugeParm{T}(SU3{T}, 6.1, 1.0)
dpar = DiracParam{T}(SU3fund,1.3,0.9,(1.0,1.0,1.0,1.0),0.0,0.0)
ymws = YMworkspace(SU3, T, lp)
dws = DiracWorkspace(SU3fund,T,lp);
randomize!(ymws.mom, lp, ymws)
U = exp.(ymws.mom)
psi = scalar_field(Spinor{4,SU3fund{T}},lp);
pfrandomize!(psi,lp)
Ucp = deepcopy(U)
psicp = deepcopy(psi)
# First Integrate very precisely up to t=2 (Wilson)
println(" # Very precise integration ")
wflw = wfl_rk3(Float64, 0.0004, 1.0E-7)
flw(U,psi, wflw, 5000, gp,dpar, lp, ymws, dws)
pl_exact = Eoft_plaq(U, gp, lp, ymws)
cl_exact = Eoft_clover(U, gp, lp, ymws)
println(" - Plaq: ", pl_exact)
println(" - Clover: ", cl_exact)
Ufin = deepcopy(U)
psifin = deepcopy(psi)
# Now use Adaptive step size integrator:
for tol in (1.0E-4, 1.0E-5, 1.0E-6, 1.0E-7, 1.0E-8)
local wflw = wfl_rk3(Float64, 0.0001, tol)
U .= Ucp
psi .= psicp
ns, eps = flw_adapt(U,psi, wflw, 2.0, gp,dpar,lp, ymws,dws)
pl = Eoft_plaq(U, gp, lp, ymws)
cl = Eoft_clover(U, gp, lp, ymws)
psierr = sum(norm2.(psi .- psifin))/prod(lp.iL)
println(" # Adaptive integrator (tol=$tol): ", ns, " steps")
U .= U ./ Ufin
maxd = CUDA.mapreduce(dev_one, max, U, init=0.0)
println(" - Plaq: ", pl," [diff: ", abs(pl-pl_exact), "; ",
maxd, "]")
println(" - Clover: ", cl, " [diff: ", abs(cl-cl_exact), "; ",
maxd, "]")
println(" - Fermion diff: ", psierr)
end
using CUDA, LatticeGPU

println(" # Consistency condition for backflow")

lp = SpaceParm{4}((16,16,16,16), (4,4,4,4), BC_PERIODIC, (0,0,0,0,0,0))
pso = scalar_field(Spinor{4,SU3fund{Float64}},lp);
psi = scalar_field(Spinor{4,SU3fund{Float64}},lp);
psi2 = scalar_field(Spinor{4,SU3fund{Float64}},lp);
......@@ -19,24 +14,27 @@ int = wfl_rk3(Float64, 0.01, 1.0)
gp = GaugeParm{Float64}(SU3{Float64},6.0,1.0,(1.0,0.0),(0.0,0.0),lp.iL)
dpar = DiracParam{Float64}(SU3fund,1.3,0.9,(1.0,1.0,1.0,1.0),0.0,0.0)
randomize!(ymws.mom, lp, ymws)
U = exp.(ymws.mom);
pfrandomize!(psi,lp)
for L in 10:20:210
pso .= psi
V = Array(U)
#a,b = flw_adapt(U, psi, int, L*int.eps, gp,dpar, lp, ymws,dws)
flw(U, psi, int, L,int.eps, gp,dpar, lp, ymws,dws)
# for i in 1:a
# flw(U, psi, int, 1 ,b[i], gp, dpar, lp, ymws, dws)
# end
pfrandomize!(psi2,lp)
foo = sum(dot.(psi,psi2))
copyto!(U,V);
backflow(psi2,U,L*int.eps,20,gp,dpar,lp, ymws,dws)

println("# Consistency backflow test for t=",L*int.eps)
println("Relative error: ",abs((sum(dot.(pso,psi2))-foo)/foo))
psi .= pso
end
......@@ -3,13 +3,14 @@ using LatticeGPU, CUDA, TimerOutputs
#Test for the relation K(t,y;0,n)^+ Dw(n|m)^{-1} e^{ipm} = D(p)^{-1} exp(4t sin^2(p/2)) e^{ipn} with a given momentum (if p=0 it is randomized), spin and color
#Kernel from 1207.2096
println(" # Free fermion propagator for backflow")
@timeit "Plw backflow test" begin
function Dwpw_test(;p=0,s=1,c=1)
lp = SpaceParm{4}((16,16,16,16), (4,4,4,4), 0, (0,0,0,0,0,0))
gp = GaugeParm{Float64}(SU3{Float64}, 6.0, 1.0)
dpar = DiracParam{Float64}(SU3fund,1.3,0.0,(1.0,1.0,1.0,1.0),0.0,0.0)
dws = DiracWorkspace(SU3fund,Float64,lp);
ymws = YMworkspace(SU3,Float64,lp);
......@@ -89,9 +90,7 @@ using LatticeGPU, CUDA, TimerOutputs
g5Dw!(prop,U,pwave,dpar,dws,lp)
CG!(prop,U,DwdagDw!,dpar,lp,dws,10000,1.0e-14)
backflow(prop,U,Nsteps*int.eps,20,gp,dpar,lp, ymws,dws)
end
......@@ -103,15 +102,15 @@ using LatticeGPU, CUDA, TimerOutputs
begin
global diff = 0.0
for i in 1:3 for j in 1:4
global diff += Dwpw_test(c=i,s=j)
end end

if diff < 1.0e-5
print("Backflow_tl test passed with average error ", diff/12,"\n")
else
error("Backflow_tl test failed with difference: ",diff,"\n")
end
......
using LatticeGPU, CUDA, TimerOutputs
#Test for the relation K(t,y;0,n) Dw(n|m)^{-1} e^{ipm} = D(p)^{-1} exp(-4t sin^2(p/2)) e^{ipn} with a given momentum (if p=0 it is randomized), spin and color
#Kernel from 1207.2096
println(" # Free fermion propagator for frontflow")
@timeit "Plw flow test" begin
function Dwpw_test(;p=0,s=1,c=1)
lp = SpaceParm{4}((16,16,16,16), (4,4,4,4), 0, (0,0,0,0,0,0))
gp = GaugeParm{Float64}(SU3{Float64}, 6.0, 1.0)
dpar = DiracParam{Float64}(SU3fund,1.3,0.0,(1.0,1.0,1.0,1.0),0.0,0.0)
dws = DiracWorkspace(SU3fund,Float64,lp);
ymws = YMworkspace(SU3,Float64,lp);
......@@ -103,15 +104,15 @@ using LatticeGPU, CUDA, TimerOutputs
begin
global diff = 0.0
for i in 1:3 for j in 1:4
global diff += Dwpw_test(c=i,s=j)
end end

if diff < 1.0e-4
print("Flow_tl test passed with average error ", diff/12,"\n")
else
error("Flow_tl test failed with difference: ",diff,"\n")
end
......
......@@ -2,6 +2,8 @@ using LatticeGPU
using CUDA
using TimerOutputs
println(" # Free solution for SF correlation functions")
@timeit "fA_fP test" begin
......@@ -115,7 +117,7 @@ using TimerOutputs
elseif difP > 1.0e-15
error("fP test failed with error ", difP)
else
print("fA & fP tests passed with errors: ", difA," and ",difP,"!\n")
print("fA & fP tests passed with errors: ", difA," and ",difP,"\n")
end
end
......@@ -2,6 +2,8 @@ using LatticeGPU, CUDA, TimerOutputs
#Test for the relation Dw(n|m)^{-1} e^{ipm} = D(p)^{-1} e^{ipn} with a given momentum (if p=0 it is randomized), spin and color
println(" # Test for free fermion propagator")
@timeit "Plw test" begin
function Dwpw_test(;p=0,s=1,c=1)
......@@ -84,12 +86,12 @@ end
dif = sum(norm2.(prop - prop_th))
if dif > 1.0e-7
error("Dwpl test for s=",s,", c=",c," failed with difference: ",dif,"\n")
end
return sqrt(dif)
end
......@@ -101,8 +103,8 @@ for i in 1:3 for j in 1:4
global diff += Dwpw_test(c=i,s=j)
end end
if diff < 1.0e-7
print("Dwpl test passed with average error ", diff/12,"\n")
else
error("Dwpl test failed with difference: ",diff,"\n")
end
......
......@@ -2,6 +2,9 @@ using CUDA, LatticeGPU, TimerOutputs
#Check that Dw ( (DwdagDw)^{-1} g5 Dw g5 ) psi = psi for random fields
println(" # Test for the consistency of the solver")
@timeit "Rand solver test" begin
@timeit "Generate random fields" begin
......@@ -46,7 +49,7 @@ res = sum(norm2.(rpsi-dws.sp))
if res < 1.0e-6
print("Drand test passed with ",res,"% error!\n")
print("Drand test passed with ",res,"% error\n")
else
error("Drand test failed with difference: ",res,"\n")
......
include("SAD/test_sad.jl")
#include("SAD/test_sad.jl")
include("flow/test_adapt.jl")
include("dirac/test_fp_fa.jl")
include("dirac/test_solver_plw.jl")
include("dirac/test_solver_rand.jl")
include("dirac/test_flow_tl.jl")
include("dirac/test_backflow_tl.jl")
include("dirac/test_backflow.jl")
include("dirac/test_adapt_ferm.jl")