!{\src2tex{textfont=tt}}
!!****f* ABINIT/lobpcgwf
!! NAME
!! lobpcgwf
!!
!! FUNCTION
!! this routine updates the whole wave functions at a given k-point,
!! using the lobpcg method
!! for a given spin-polarization, from a fixed hamiltonian
!! but might also simply compute eigenvectors and eigenvalues at this k point.
!! it will also update the matrix elements of the hamiltonian.
!!
!! COPYRIGHT
!! Copyright (C) 1998-2007 ABINIT group (GZ,AR,MT)
!! this file is distributed under the terms of the
!! gnu general public license, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! for the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! INPUTS
!!  dimffnl=second dimension of ffnl (1+number of derivatives)
!!  dtfil <type(datafiles_type)>=variables related to files
!!  dtset <type(dataset_type)>=all input variales for this dataset
!!  ffnl(npw,dimffnl,lmnmax,ntypat)=nonlocal form factors on basis sphere.
!!  gs_hamk <type(gs_hamiltonian_type)>=all data for the hamiltonian at k
!!  icg=shift to be applied on the location of data in the array cg
!!  igsc=shift to be applied on the location of data in the array gsc
!!  kg_k(3,npw_k)=reduced planewave coordinates.
!!  kinpw(npw)=(modified) kinetic energy for each plane wave (hartree)
!!  lmnmax=if useylm=1, max number of (l,m,n) comp. over all type of psps
!!        =if useylm=0, max number of (l,n)   comp. over all type of psps
!!  matblk=dimension of the array ph3d
!!  mcg=second dimension of the cg array
!!  mgfft=maximum size of 1d ffts
!!  mgsc=second dimension of the gsc array
!!  mpi_enreg=informations about MPI parallelization
!!  mpsang= 1+maximum angular momentum for nonlocal pseudopotentials
!!  mpssoang= 1+maximum (spin*angular momentum) for nonlocal pseudopotentials
!!  natom=number of atoms in cell.
!!  nband_k=number of bands at this k point for that spin polarization
!!  nbdblock : number of blocks
!!  npw_k=number of plane waves at this k point
!!  nspinor=number of spinorial components of the wavefunctions
!!  ntypat=number of types of atoms in unit cell.
!!  nvloc=final dimension of vlocal (usually 1, but 4 for non-collinear)
!!  n4,n5,n6 used for dimensionning of vlocal
!!  ph3d(2,npw,matblk)=3-dim structure factors, for each atom and plane wave.
!!  prtvol=control print volume and debugging output
!!  psps <type(pseudopotential_type)>=variables related to pseudopotentials
!!  use_subovl= 1 if "subovl" array is computed (see below)
!!  vlocal(n4,n5,n6,nvloc)= local potential in real space, on the augmented fft grid
!!
!! OUTPUT
!!  resid_k(nband_k)=residuals for each states
!!  subham(nband_k*(nband_k+1))=the matrix elements of h
!!  If gs_hamk%usepaw==0:
!!    gsc(2,mgsc)=<g|s|c> matrix elements (s=overlap)
!!    subvnl(nband_k*(nband_k+1)*(1-gs_hamk%usepaw))=the matrix elements of vnl
!!  If use_subovl==0:
!!    subovl(nband_k*(nband_k+1)*use_subovl)=the matrix elements of s
!!
!! SIDE EFFECTS
!!  cg(2,mcg)=updated wavefunctions
!!
!! PARENTS
!!      vtowfk
!!
!! CHILDREN
!!      timab,wrtout,xcomm_init,xsum_mpi
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

subroutine lobpcgwf(cg,dimffnl,dtfil,dtset,ffnl,gs_hamk,gsc,icg,igsc,&
&           kg_k,kinpw,lmnmax,matblk,mcg,mgfft,mgsc,mpi_enreg,mpsang,mpssoang,natom,&
&           nband_k,nbdblock,npw_k,nspinor,ntypat,nvloc,n4,n5,n6,ph3d,prtvol,&
&           psps,resid_k,subham,subovl,subvnl,use_subovl,vlocal)

 use defs_basis
 use defs_datatypes

!This section has been created automatically by the script Abilint (TD). Do not modify these by hand.
#ifdef HAVE_FORTRAN_INTERFACES
 use interfaces_01manage_mpi
 use interfaces_13nonlocal
 use interfaces_14wfs
 use interfaces_18seqpar, except_this_one => lobpcgwf
 use interfaces_lib01hidempi
#else
 use defs_xfuncmpi
#endif
!End of the abilint section

 implicit none

#if defined MPI
          include 'mpif.h'
#endif
!Arguments ------------------------------------
 type(gs_hamiltonian_type) :: gs_hamk
 integer :: dimffnl,icg,igsc,lmnmax,matblk,mcg,mgsc,mgfft,mpsang,mpssoang,n4,n5,n6
 integer :: natom,nband_k,nbdblock,npw_k,nspinor,ntypat,nvloc,prtvol,use_subovl
 type(datafiles_type) :: dtfil
 type(dataset_type) :: dtset
 type(pseudopotential_type) :: psps
 type(mpi_type) :: mpi_enreg
 integer :: kg_k(3,npw_k)
 real(dp) :: cg(2,mcg),ffnl(npw_k,dimffnl,lmnmax,ntypat),gsc(2,mgsc)
 real(dp) :: kinpw(npw_k),ph3d(2,npw_k,matblk),resid_k(nband_k)
 real(dp) :: vlocal(n4,n5,n6,nvloc)
 real(dp) :: subham(nband_k*(nband_k+1)),subvnl(nband_k*(nband_k+1)*(1-gs_hamk%usepaw))
 real(dp) :: subovl(nband_k*(nband_k+1)*use_subovl)

!Local variables-------------------------------
 integer, parameter :: tim_getghc=4
 integer :: activepsize,activersize,bblocksize,bigorder,blocksize,cgindex,choice,cpopt
 integer :: cond_try,gscindex
 integer :: iband,iblocksize,iblock,idir,ier,ierr,ii,info,ioption,ipw,ipw1,istwf_k,isubh,isubo
 integer :: iterationnumber,ivectsize
 integer :: iwavef,i1, i2, i3, i4,jwavef,lwork,maxblocksize,maxiterations,nkpg,nnlout
 integer :: old_paral_level,paw_opt,restart,rvectsize
 integer :: signs,sij_opt,spaceComm,tim_nonlop,tocomplete,vectsize
 logical :: gen_eigenpb,block_to_complete
 real(dp), allocatable :: swavef(:,:)
 real(dp) :: cgreipw,cgimipw,cscre,cvcre,chcre,dum,sq2
 real(dp) :: tsec(2),enlout(1),nonlop_dum(1,1)
 real(dp), allocatable :: blockvectorx(:,:),blockvectorax(:,:),blockvectorbx(:,:),&
       & blockvectorr(:,:),blockvectorar(:,:),blockvectorbr(:,:),&
       & blockvectorp(:,:),blockvectorap(:,:),blockvectorbp(:,:),blockvectordumm(:,:),&
       & blockvectory(:,:),blockvectorby(:,:),cwavef(:,:),dummy1(:),dummy2(:,:),&
       & gramxax(:,:),gramxar(:,:),gramxap(:,:),gramrar(:,:),gramrap(:,:),&
       & grampap(:,:),&
       & gramxbx(:,:),gramxbr(:,:),gramxbp(:,:),gramrbr(:,:),gramrbp(:,:),&
       & grampbp(:,:),gwavef(:,:),gvnlc(:,:),&
       & identity(:,:),coordx(:,:),eigen(:),lambda(:,:),&
       & grama(:,:),gramb(:,:),gramyx(:,:),&
       & kpg_dum(:,:),residualnorms(:),transf(:,:,:),w(:),work(:)
 type(cprj_type) :: cprj_dum(1)


#ifdef VMS
!DEC$ ATTRIBUTES ALIAS:'DGEMM' :: dgemm
!DEC$ ATTRIBUTES ALIAS:'DPOTRF' :: dpotrf
!DEC$ ATTRIBUTES ALIAS:'DSYEV' :: dsyev
!DEC$ ATTRIBUTES ALIAS:'DSYGV' :: dsygv
!DEC$ ATTRIBUTES ALIAS:'DTRSM' :: dtrsm
#endif

!NO_ABIRULES
!correspondence with abinit. here for real wf
!this is the index of a given band
  cgindex(iblocksize)=npw_k*nspinor*(iblocksize-1)+icg+1
  gscindex(iblocksize)=npw_k*nspinor*(iblocksize-1)+igsc+1

! *********************************************************************

!DEBUG
! write(6,*)' lobpcgwf : enter, icg= ',icg
!ENDDEBUG

 call timab(530,1,tsec)
 gen_eigenpb=(gs_hamk%usepaw==1)

  call timab(530,1,tsec)
 sq2=sqrt(2.0_dp)
 if (mpi_enreg%me_g0 == 1) then
  vectsize=2*npw_k*nspinor-1
  else
  vectsize=2*npw_k*nspinor
 end if
 rvectsize=npw_k*nspinor
 istwf_k=gs_hamk%istwf_k
 maxiterations=dtset%nline
 maxblocksize=mpi_enreg%nproc_fft

 ioption=mpi_enreg%fft_option_lob
 if (mpi_enreg%nproc_fft ==1 .and. mpi_enreg%fft_option_lob==2) ioption=1

!Big loop bands inside blocks
 do iblock=1,nbdblock

  blocksize=mpi_enreg%nproc_fft
! block_to_complete : true if the tabs with dimension blocksize need zeroes at the end
  block_to_complete=.false.;tocomplete=0
  if ((iblock == nbdblock) .and. (nbdblock * blocksize > nband_k)) then
! the last block is smaller than the others
   block_to_complete=.true.
   tocomplete=nbdblock * blocksize - nband_k-1
   blocksize=mpi_enreg%nproc_fft - (nbdblock * blocksize - nband_k)
  end if

  bblocksize=(iblock-1)*blocksize

! allocations
  allocate(blockvectorx(vectsize,blocksize),blockvectorax(vectsize,blocksize))
  allocate(blockvectorbx(vectsize,blocksize))
  allocate(blockvectorr(vectsize,blocksize),blockvectorar(vectsize,blocksize))
  allocate(blockvectorbr(vectsize,blocksize))
  allocate(blockvectorp(vectsize,blocksize),blockvectorap(vectsize,blocksize))
  allocate(blockvectorbp(vectsize,blocksize))
  allocate(blockvectordumm(vectsize,blocksize))
  allocate(gramxax(blocksize,blocksize),&
       & gramxar(blocksize,blocksize),gramxap(blocksize,blocksize),&
       & gramrar(blocksize,blocksize),gramrap(blocksize,blocksize),&
       & grampap(blocksize,blocksize),&
       & gramxbx(blocksize,blocksize),gramxbr(blocksize,blocksize),&
       & gramxbp(blocksize,blocksize),gramrbr(blocksize,blocksize),&
       & gramrbp(blocksize,blocksize),&
       & grampbp(blocksize,blocksize))
  allocate(transf(blocksize,blocksize,3))
  allocate(lambda(blocksize,blocksize))
  allocate(residualnorms(blocksize))

  allocate(blockvectory(vectsize,bblocksize),blockvectorby(vectsize,bblocksize))
  allocate(gramyx(bblocksize,blocksize))

!transfer array of wf coeff in iblock to blockvectorx
 do iblocksize=1,blocksize
  iband=iblocksize+bblocksize
  if (mpi_enreg%me_g0 == 1) then
   blockvectorx(1,iblocksize)=cg(1,cgindex(iband))
   blockvectorx(2:rvectsize,iblocksize)=cg(1,cgindex(iband)+1:cgindex(iband+1)-1)*sq2
   blockvectorx(rvectsize+1:vectsize,iblocksize)=cg(2,cgindex(iband)+1:cgindex(iband+1)-1)*sq2
  else
   blockvectorx(1:rvectsize,iblocksize)=cg(1,cgindex(iband):cgindex(iband+1)-1)*sq2
   blockvectorx(rvectsize+1:vectsize,iblocksize)=cg(2,cgindex(iband):cgindex(iband+1)-1)*sq2
  end if
 end do

!transfer array of wf coeff less than iblock to blockvectory
 do iblocksize=1,bblocksize
  iband=iblocksize
  if (mpi_enreg%me_g0 == 1) then
   blockvectory(1,iblocksize)=cg(1,cgindex(iband))
   blockvectory(2:rvectsize,iblocksize)=cg(1,cgindex(iband)+1:cgindex(iband+1)-1)*sq2
   blockvectory(rvectsize+1:vectsize,iblocksize)=cg(2,cgindex(iband)+1:cgindex(iband+1)-1)*sq2
   else
   blockvectory(1:rvectsize,iblocksize)=cg(1,cgindex(iband):cgindex(iband+1)-1)*sq2
   blockvectory(rvectsize+1:vectsize,iblocksize)=cg(2,cgindex(iband):cgindex(iband+1)-1)*sq2
  end if
 end do

 if(iblock /=1) then !b-orthogonalize x to the constraint y(supposed b-orthonormal)
!  blockvectorx=blockvectorx-&
!              &matmul(blockvectory,matmul(transpose(blockvectory),blockvectorx))
!    call operators(blockvectory,blockvectorby)

  if(gen_eigenpb) then
   allocate(cwavef(2,npw_k*nspinor))
   allocate(gwavef(2,npw_k*nspinor))
   do iblocksize=1,bblocksize
    if (mpi_enreg%me_g0 == 1) then
     cwavef(1,2:npw_k*nspinor)=blockvectory(2:npw_k*nspinor,iblocksize)/sq2
     cwavef(2,2:npw_k*nspinor)=blockvectory(npw_k*nspinor+1:2*npw_k*nspinor-1,iblocksize)/sq2
     cwavef(1,1)=blockvectory(1,iblocksize)
     cwavef(2,1)=zero
    else
     cwavef(1,1:npw_k*nspinor)=blockvectory(1:npw_k*nspinor,iblocksize)/sq2
     cwavef(2,1:npw_k*nspinor)=blockvectory(npw_k*nspinor+1:2*npw_k*nspinor,iblocksize)/sq2
    end if
!   Call to nonlop: compute <g|S|c>
    choice=1 ; signs=2 ; idir=0 ; tim_nonlop=1 ; cpopt=-1 ; paw_opt=3 ; nnlout=0 ; nkpg=0
    call nonlop(gs_hamk%atindx1,choice,cpopt,cprj_dum,gs_hamk%dimekb1,0,dimffnl,dimffnl,dummy2,&
&               dummy1,ffnl,ffnl,gs_hamk%gmet,gs_hamk%gprimd,idir,gs_hamk%indlmn,&
&               istwf_k,kg_k,kg_k,kpg_dum,kpg_dum,gs_hamk%kpoint,gs_hamk%kpoint,dum,lmnmax,matblk,&
&               mgfft,mpi_enreg,mpsang,mpssoang,natom,gs_hamk%nattyp,gs_hamk%ngfft,nkpg,nkpg,&
&               gs_hamk%nloalg,nnlout,npw_k,npw_k,nspinor,ntypat,paw_opt,gs_hamk%phkxred,&
&               gs_hamk%phkxred,gs_hamk%ph1d,ph3d,ph3d,gs_hamk%pspso,signs,gs_hamk%sij,&
&               gwavef,tim_nonlop,gs_hamk%ucvol,gs_hamk%useylm,cwavef,cwavef)
    if (mpi_enreg%me_g0 == 1) then
     blockvectorby(2:npw_k*nspinor,iblocksize)=gwavef(1,2:npw_k*nspinor)*sq2
     blockvectorby(npw_k*nspinor+1:2*npw_k*nspinor-1,iblocksize)=gwavef(2,2:npw_k*nspinor)*sq2
     blockvectorby(1,iblocksize)=gwavef(1,1)
     else
     blockvectorby(1:npw_k*nspinor,iblocksize)=gwavef(1,1:npw_k*nspinor)*sq2
     blockvectorby(npw_k*nspinor+1:2*npw_k*nspinor,iblocksize)=gwavef(2,1:npw_k*nspinor)*sq2
    end if
   end do
   deallocate(cwavef,gwavef)
   else
    blockvectorby(:,:)=blockvectory(:,:)
  end if

  call dgemm('t','n',bblocksize,blocksize,vectsize,one,blockvectorby,&
&               vectsize,blockvectorx,vectsize,zero,gramyx,bblocksize)
  old_paral_level= mpi_enreg%paral_level
  mpi_enreg%paral_level=3
  call xcomm_init(mpi_enreg,spaceComm)
  call timab(48,1,tsec)

!FB  call timab(541,1,tsec)

  call xsum_mpi(gramyx,spaceComm,ierr)
  call timab(48,2,tsec)

!FB  call timab(541,2,tsec)

  mpi_enreg%paral_level= old_paral_level
  call dgemm('n','n',vectsize,blocksize,bblocksize,one,blockvectory,&
&               vectsize,gramyx,bblocksize,zero,blockvectordumm,vectsize)
  blockvectorx=blockvectorx-blockvectordumm
 end if
!compute right hand side
!call operators(blockvectorx,blockvectorbx)
!orthogonalize x

 if(gen_eigenpb) then
  allocate(cwavef(2,npw_k*nspinor))
  allocate(gwavef(2,npw_k*nspinor))
  do iblocksize=1,blocksize
   if (mpi_enreg%me_g0 == 1) then
    cwavef(1,2:npw_k*nspinor)=blockvectorx(2:npw_k*nspinor,iblocksize)/sq2
    cwavef(2,2:npw_k*nspinor)=blockvectorx(npw_k*nspinor+1:2*npw_k*nspinor-1,iblocksize)/sq2
    cwavef(1,1)=blockvectorx(1,iblocksize)
    cwavef(2,1)=zero
   else
    cwavef(1,1:npw_k*nspinor)=blockvectorx(1:npw_k*nspinor,iblocksize)/sq2
    cwavef(2,1:npw_k*nspinor)=blockvectorx(npw_k*nspinor+1:2*npw_k*nspinor,iblocksize)/sq2
   end if
!  Call to nonlop: compute <g|S|c>
   choice=1 ; signs=2 ; idir=0 ; tim_nonlop=1 ; cpopt=-1 ; paw_opt=3 ; nnlout=0 ; nkpg=0
   call nonlop(gs_hamk%atindx1,choice,cpopt,cprj_dum,gs_hamk%dimekb1,0,dimffnl,dimffnl,dummy2,&
&              dummy1,ffnl,ffnl,gs_hamk%gmet,gs_hamk%gprimd,idir,gs_hamk%indlmn,&
&              istwf_k,kg_k,kg_k,kpg_dum,kpg_dum,gs_hamk%kpoint,gs_hamk%kpoint,dum,lmnmax,matblk,&
&              mgfft,mpi_enreg,mpsang,mpssoang,natom,gs_hamk%nattyp,gs_hamk%ngfft,nkpg,nkpg,&
&              gs_hamk%nloalg,nnlout,npw_k,npw_k,nspinor,ntypat,paw_opt,gs_hamk%phkxred,&
&              gs_hamk%phkxred,gs_hamk%ph1d,ph3d,ph3d,gs_hamk%pspso,signs,gs_hamk%sij,&
&              gwavef,tim_nonlop,gs_hamk%ucvol,gs_hamk%useylm,cwavef,cwavef)
   if (mpi_enreg%me_g0 == 1) then
    blockvectorbx(2:npw_k*nspinor,iblocksize)=gwavef(1,2:npw_k*nspinor)*sq2
    blockvectorbx(npw_k*nspinor+1:2*npw_k*nspinor-1,iblocksize)=gwavef(2,2:npw_k*nspinor)*sq2
    blockvectorbx(1,iblocksize)=gwavef(1,1)
    else
    blockvectorbx(1:npw_k*nspinor,iblocksize)=gwavef(1,1:npw_k*nspinor)*sq2
    blockvectorbx(npw_k*nspinor+1:2*npw_k*nspinor,iblocksize)=gwavef(2,1:npw_k*nspinor)*sq2
   end if
  end do
  deallocate(cwavef,gwavef)
 else
  blockvectorbx(:,:)=blockvectorx(:,:)
 end if
!call orthonormalize(blockvectorx,blockvectorbx)
!b-orthonormalize x and construct the sqrt of the gram matrix gramxbx
 call orthonormalize(blockvectorx,blockvectorbx,blocksize,mpi_enreg,gramxbx,vectsize)
 call dtrsm('r','u','n','n',vectsize,blocksize,one,gramxbx,blocksize,&
       &              blockvectorbx,vectsize)

!call operatorh(blockvectorx,blockvectorax)
 allocate(cwavef(2,npw_k*nspinor*maxblocksize))
 allocate(gwavef(2,npw_k*nspinor*maxblocksize),gvnlc(2,npw_k*nspinor*maxblocksize))
 allocate(swavef(2,npw_k*nspinor*maxblocksize))
 if (block_to_complete) then
  cwavef(:,:)=zero
 end if
 do iblocksize=1,blocksize
  iband=iblocksize
  if (mpi_enreg%me_g0 == 1) then
   cwavef(1,cgindex(iband)+1:cgindex(iband+1)-1)=blockvectorx(2:npw_k*nspinor,iblocksize)/sq2
   cwavef(2,cgindex(iband)+1:cgindex(iband+1)-1)=blockvectorx(npw_k*nspinor+1:2*npw_k*nspinor-1,iblocksize)/sq2
   cwavef(2,cgindex(iband))=zero
   cwavef(1,cgindex(iband))=blockvectorx(1,iblocksize)
   else
   cwavef(1,cgindex(iband):cgindex(iband+1)-1)=blockvectorx(1:npw_k*nspinor,iblocksize)/sq2
   cwavef(2,cgindex(iband):cgindex(iband+1)-1)=blockvectorx(npw_k*nspinor+1:2*npw_k*nspinor,iblocksize)/sq2
  end if
 end do
 sij_opt=0
 if (ioption==1) then
  call getghc(cwavef,dimffnl,ffnl,dtfil%filstat,gwavef,dummy2,gs_hamk,gvnlc,kg_k,&
&  kinpw,lmnmax,matblk,mgfft,mpi_enreg,mpsang,mpssoang,natom,blocksize,npw_k,nspinor,ntypat,&
&  nvloc,n4,n5,n6,ph3d,prtvol,sij_opt,tim_getghc,vlocal)
 else
  call prep_getghc(cwavef,dimffnl,dtfil,ffnl,gs_hamk,gvnlc,gwavef,swavef,iblock,1,istwf_k,kg_k,&
&  kinpw,lmnmax,matblk,maxblocksize,mgfft,mpi_enreg,mpsang,mpssoang,natom,nbdblock,&
&  nband_k,npw_k,nspinor,ntypat,nvloc,n4,n5,n6,ph3d,prtvol,sij_opt,vlocal)
 end if

 do iblocksize=1,blocksize
  iband=iblocksize
  if (mpi_enreg%me_g0 == 1) then
   blockvectorax(2:npw_k*nspinor,iblocksize)=gwavef(1,cgindex(iband)+1:cgindex(iband+1)-1)*sq2
   blockvectorax(npw_k*nspinor+1:2*npw_k*nspinor-1,iblocksize)=gwavef(2,cgindex(iband)+1:cgindex(iband+1)-1)*sq2
   blockvectorax(1,iblocksize)=gwavef(1,cgindex(iband))
   else
   blockvectorax(1:npw_k*nspinor,iblocksize)=gwavef(1,cgindex(iband):cgindex(iband+1)-1)*sq2
   blockvectorax(npw_k*nspinor+1:2*npw_k*nspinor,iblocksize)=gwavef(2,cgindex(iband):cgindex(iband+1)-1)*sq2
  end if
 end do

 deallocate(cwavef,gwavef,gvnlc)
 deallocate(swavef)
!do rayleigh ritz on a in space x
!gramxax=matmul(transpose(blockvectorx),blockvectorax)

 call dgemm('t','n',blocksize,blocksize,vectsize,one,blockvectorx,&
&               vectsize,blockvectorax,vectsize,zero,gramxax,blocksize)
 old_paral_level= mpi_enreg%paral_level
 mpi_enreg%paral_level=3
 call xcomm_init(mpi_enreg,spaceComm)
 call timab(48,1,tsec)

!FB call timab(541,1,tsec)

 call xsum_mpi(gramxax,spaceComm,ierr)
 call timab(48,2,tsec)

!FB call timab(541,2,tsec)

 mpi_enreg%paral_level= old_paral_level
 allocate(eigen(blocksize))
 lwork=3*blocksize
 allocate(work(lwork))
 call dsyev('V','U',blocksize,gramxax,blocksize,eigen,work,lwork,info)
 deallocate(work)
!blockvectorx=matmul(blockvectorx,gramxax)
 call dgemm('n','n',vectsize,blocksize,blocksize,one,blockvectorx,&
&               vectsize,gramxax,blocksize,zero,blockvectordumm,vectsize)
 blockvectorx=blockvectordumm
!blockvectorax=matmul(blockvectorax,gramxax)
 call dgemm('n','n',vectsize,blocksize,blocksize,one,blockvectorax,&
&               vectsize,gramxax,blocksize,zero,blockvectordumm,vectsize)
 blockvectorax=blockvectordumm
!blockvectorbx=matmul(blockvectorbx,gramxax)
 call dgemm('n','n',vectsize,blocksize,blocksize,one,blockvectorbx,&
&               vectsize,gramxax,blocksize,zero,blockvectordumm,vectsize)
 blockvectorbx=blockvectordumm
 do iblocksize=1,blocksize
  lambda(iblocksize,iblocksize)=eigen(iblocksize)
 end do
!DEBUG
! write(6,*)'lambda',eigen
!ENDDEBUG
 deallocate(eigen)
!now the main alogrithm
 iter: do iterationnumber=1,maxiterations
!DEBUG
!  write(6,*)'iterationnumber',iterationnumber
!ENDDEBUG
!construct residual
!blockvectorr=blockvectorax-matmul(blockvectorx,lambda)
  do iblocksize=1,blocksize
   call precon2(blockvectorbx(:,iblocksize),lambda(iblocksize,iblocksize),&
&                 istwf_k,kinpw,mpi_enreg,npw_k,nspinor,&
&                 blockvectorax(:,iblocksize),blockvectorr(:,iblocksize),vectsize)
  end do
  residualnorms=sum(blockvectorr**2,dim=1)
  old_paral_level= mpi_enreg%paral_level
  mpi_enreg%paral_level=3
  call xcomm_init(mpi_enreg,spaceComm)
  call timab(48,1,tsec)

!FB  call timab(541,1,tsec)

  call xsum_mpi(residualnorms,spaceComm,ierr)
  call timab(48,2,tsec)

!FB  call timab(541,2,tsec)

  mpi_enreg%paral_level= old_paral_level
  resid_k(bblocksize+1:bblocksize+blocksize)=residualnorms(1:blocksize)
  residualnorms=sqrt(residualnorms)
  write(6,*)'residualnorm',residualnorms
!if(sum(residualnorms) < 1.d-10) exit
!not yet masking
  if(iblock /=1) then !residuals orthogonal to blockvectory
!   blockvectorr=blockvectorr-&
!           &matmul(blockvectory,matmul(transpose(blockvectorby),blockvectorr))
   call dgemm('t','n',bblocksize,blocksize,vectsize,one,blockvectorby,&
&               vectsize,blockvectorr,vectsize,zero,gramyx,bblocksize)
   old_paral_level= mpi_enreg%paral_level
   mpi_enreg%paral_level=3
   call xcomm_init(mpi_enreg,spaceComm)
   call timab(48,1,tsec)

!FB   call timab(541,1,tsec)

   call xsum_mpi(gramyx,spaceComm,ierr)
   call timab(48,2,tsec)

!FB   call timab(541,2,tsec)

   mpi_enreg%paral_level= old_paral_level
   call dgemm('n','n',vectsize,blocksize,bblocksize,one,blockvectory,&
&               vectsize,gramyx,bblocksize,zero,blockvectordumm,vectsize)
   blockvectorr=blockvectorr-blockvectordumm
  end if
!residuals orthogonal to blockvectorx
!     blockvectorr=blockvectorr-&
!          &matmul(blockvectorx,matmul(transpose(blockvectorbx),blockvectorr))
  call dgemm('t','n',blocksize,blocksize,vectsize,one,blockvectorbx,&
&               vectsize,blockvectorr,vectsize,zero,gramxax,blocksize)
  old_paral_level= mpi_enreg%paral_level
  mpi_enreg%paral_level=3
  call xcomm_init(mpi_enreg,spaceComm)
  call timab(48,1,tsec)

!FB  call timab(541,1,tsec)

  call xsum_mpi(gramxax,spaceComm,ierr)
  call timab(48,2,tsec)

!FB  call timab(541,2,tsec)

  mpi_enreg%paral_level= old_paral_level
  call dgemm('n','n',vectsize,blocksize,blocksize,one,blockvectorx,&
&               vectsize,gramxax,blocksize,zero,blockvectordumm,vectsize)
  blockvectorr=blockvectorr-blockvectordumm
!and now (b)orthornormalize r
!call operators(blockvectorr,blockvectorbr)
  if(gen_eigenpb) then
   allocate(cwavef(2,npw_k*nspinor))
   allocate(gwavef(2,npw_k*nspinor))
   do iblocksize=1,blocksize
    if (mpi_enreg%me_g0 == 1) then
     cwavef(1,2:npw_k*nspinor)=blockvectorr(2:npw_k*nspinor,iblocksize)/sq2
     cwavef(2,2:npw_k*nspinor)=blockvectorr(npw_k*nspinor+1:2*npw_k*nspinor-1,iblocksize)/sq2
     cwavef(1,1)=blockvectorr(1,iblocksize)
     cwavef(2,1)=zero
    else
     cwavef(1,1:npw_k*nspinor)=blockvectorr(1:npw_k*nspinor,iblocksize)/sq2
     cwavef(2,1:npw_k*nspinor)=blockvectorr(npw_k*nspinor+1:2*npw_k*nspinor,iblocksize)/sq2
    end if
!   Call to nonlop: compute <g|S|c>
    choice=1 ; signs=2 ; idir=0 ; tim_nonlop=1 ; cpopt=-1 ; paw_opt=3 ; nnlout=0 ; nkpg=0
    call nonlop(gs_hamk%atindx1,choice,cpopt,cprj_dum,gs_hamk%dimekb1,0,dimffnl,dimffnl,dummy2,&
&               dummy1,ffnl,ffnl,gs_hamk%gmet,gs_hamk%gprimd,idir,gs_hamk%indlmn,&
&               istwf_k,kg_k,kg_k,kpg_dum,kpg_dum,gs_hamk%kpoint,gs_hamk%kpoint,dum,lmnmax,matblk,&
&               mgfft,mpi_enreg,mpsang,mpssoang,natom,gs_hamk%nattyp,gs_hamk%ngfft,nkpg,nkpg,&
&               gs_hamk%nloalg,nnlout,npw_k,npw_k,nspinor,ntypat,paw_opt,gs_hamk%phkxred,&
&               gs_hamk%phkxred,gs_hamk%ph1d,ph3d,ph3d,gs_hamk%pspso,signs,gs_hamk%sij,&
&               gwavef,tim_nonlop,gs_hamk%ucvol,gs_hamk%useylm,cwavef,cwavef)
    if (mpi_enreg%me_g0 == 1) then
     blockvectorbr(2:npw_k*nspinor,iblocksize)=gwavef(1,2:npw_k*nspinor)*sq2
     blockvectorbr(npw_k*nspinor+1:2*npw_k*nspinor-1,iblocksize)=gwavef(2,2:npw_k*nspinor)*sq2
     blockvectorbr(1,iblocksize)=gwavef(1,1)
    else
     blockvectorbr(1:npw_k*nspinor,iblocksize)=gwavef(1,1:npw_k*nspinor)*sq2
     blockvectorbr(npw_k*nspinor+1:2*npw_k*nspinor,iblocksize)=gwavef(2,1:npw_k*nspinor)*sq2
    end if
   end do
   deallocate(cwavef,gwavef)
   else
   blockvectorbr(:,:)=blockvectorr(:,:)
  end if
 !call orthonormalize(blockvectorr,blockvectorbr)
  call orthonormalize(blockvectorr,blockvectorbr,blocksize,mpi_enreg,gramrbr,vectsize)
 call dtrsm('r','u','n','n',vectsize,blocksize,one,gramrbr,blocksize,&
&          blockvectorbr,vectsize)
!compute ar
!call operatorh(blockvectorr,blockvectorar)
  allocate(cwavef(2,npw_k*nspinor*maxblocksize))
  allocate(gwavef(2,npw_k*nspinor*maxblocksize),gvnlc(2,npw_k*nspinor*maxblocksize))
  allocate(swavef(2,npw_k*nspinor*maxblocksize))
  if (block_to_complete) then
   cwavef(:,:)=zero
  end if
  do iblocksize=1,blocksize
  iband=iblocksize
  if (mpi_enreg%me_g0 == 1) then
   cwavef(1,cgindex(iband)+1:cgindex(iband+1)-1)=blockvectorr(2:npw_k*nspinor,iblocksize)/sq2
   cwavef(2,cgindex(iband)+1:cgindex(iband+1)-1)=blockvectorr(npw_k*nspinor+1:2*npw_k*nspinor-1,iblocksize)/sq2
   cwavef(2,cgindex(iband))=zero
   cwavef(1,cgindex(iband))=blockvectorr(1,iblocksize)
   else
   cwavef(1,cgindex(iband):cgindex(iband+1)-1)=blockvectorr(1:npw_k*nspinor,iblocksize)/sq2
   cwavef(2,cgindex(iband):cgindex(iband+1)-1)=blockvectorr(npw_k*nspinor+1:2*npw_k*nspinor,iblocksize)/sq2
  end if
 end do
 sij_opt=0
 if (ioption==1) then
  call getghc(cwavef,dimffnl,ffnl,dtfil%filstat,gwavef,dummy2,gs_hamk,gvnlc,kg_k,&
& kinpw,lmnmax,matblk,mgfft,mpi_enreg,mpsang,mpssoang,natom,blocksize,npw_k,nspinor,ntypat,&
& nvloc,n4,n5,n6,ph3d,prtvol,sij_opt,tim_getghc,vlocal)
  else
  call prep_getghc(cwavef,dimffnl,dtfil,ffnl,gs_hamk,gvnlc,gwavef,swavef,iblock,2,istwf_k,kg_k,&
  & kinpw,lmnmax,matblk,maxblocksize,mgfft,&
& mpi_enreg,mpsang,mpssoang,natom,nbdblock,nband_k,npw_k,nspinor,ntypat,nvloc,n4,n5,n6,&
& ph3d,prtvol,sij_opt,vlocal)
 end if
 do iblocksize=1,blocksize
  iband=iblocksize
  if (mpi_enreg%me_g0 == 1) then
   blockvectorar(2:npw_k*nspinor,iblocksize)=gwavef(1,cgindex(iband)+1:cgindex(iband+1)-1)*sq2
   blockvectorar(npw_k*nspinor+1:2*npw_k*nspinor-1,iblocksize)=gwavef(2,cgindex(iband)+1:cgindex(iband+1)-1)*sq2
   blockvectorar(1,iblocksize)=gwavef(1,cgindex(iband))
  else
   blockvectorar(1:npw_k*nspinor,iblocksize)=gwavef(1,cgindex(iband):cgindex(iband+1)-1)*sq2
   blockvectorar(npw_k*nspinor+1:2*npw_k*nspinor,iblocksize)=gwavef(2,cgindex(iband):cgindex(iband+1)-1)*sq2
  end if
 end do
 deallocate(cwavef,gwavef,gvnlc)
 deallocate(swavef)
  if(iterationnumber>1) then
!call orthonormalize(blockvectorp,blockvectorbp,blockvectorap)
   call orthonormalize(blockvectorp,blockvectorbp,blocksize,mpi_enreg,grampbp,vectsize)
   call dtrsm('r','u','n','n',vectsize,blocksize,one,grampbp,blocksize,&
       &              blockvectorbp,vectsize)
   call dtrsm('r','u','n','n',vectsize,blocksize,one,grampbp,blocksize,&
       &              blockvectorap,vectsize)
!blockvectorap=matmul(blockvectorap,grampbp)
  end if
  activersize=blocksize
  if (iterationnumber==1) then
   activepsize=0
   restart=1
   else
   activepsize=blocksize
   restart=0
  end if
!gramxar=matmul(transpose(blockvectorax),blockvectorr)
!gramrar=matmul(transpose(blockvectorar),blockvectorr)
!gramxax=matmul(transpose(blockvectorax),blockvectorx)
  call dgemm('t','n',blocksize,blocksize,vectsize,one,blockvectorax,&
&               vectsize,blockvectorr,vectsize,zero,gramxar,blocksize)
  call dgemm('t','n',blocksize,blocksize,vectsize,one,blockvectorar,&
&               vectsize,blockvectorr,vectsize,zero,gramrar,blocksize)
  call dgemm('t','n',blocksize,blocksize,vectsize,one,blockvectorax,&
&               vectsize,blockvectorx,vectsize,zero,gramxax,blocksize)
  old_paral_level= mpi_enreg%paral_level
  mpi_enreg%paral_level=3
  call xcomm_init(mpi_enreg,spaceComm)
  transf(:,:,1)=gramxar(:,:)
  transf(:,:,2)=gramrar(:,:)
  transf(:,:,3)=gramxax(:,:)
  call timab(48,1,tsec)

!FB  call timab(541,1,tsec)

  call xsum_mpi(transf,spaceComm,ierr)
  call timab(48,2,tsec)

!FB  call timab(541,2,tsec)

  gramxar(:,:)=transf(:,:,1)
  gramrar(:,:)=transf(:,:,2)
  gramxax(:,:)=transf(:,:,3)
  mpi_enreg%paral_level= old_paral_level
!gramxbx=matmul(transpose(blockvectorbx),blockvectorx)
!gramrbr=matmul(transpose(blockvectorbr),blockvectorr)
!gramxbr=matmul(transpose(blockvectorbx),blockvectorr)
  call dgemm('t','n',blocksize,blocksize,vectsize,one,blockvectorbx,&
&               vectsize,blockvectorx,vectsize,zero,gramxbx,blocksize)
  call dgemm('t','n',blocksize,blocksize,vectsize,one,blockvectorbr,&
&               vectsize,blockvectorr,vectsize,zero,gramrbr,blocksize)
  call dgemm('t','n',blocksize,blocksize,vectsize,one,blockvectorbx,&
&               vectsize,blockvectorr,vectsize,zero,gramxbr,blocksize)
  old_paral_level= mpi_enreg%paral_level
  mpi_enreg%paral_level=3
  call xcomm_init(mpi_enreg,spaceComm)
  transf(:,:,1)=gramxbx(:,:)
  transf(:,:,2)=gramrbr(:,:)
  transf(:,:,3)=gramxbr(:,:)
  call timab(48,1,tsec)

!FB  call timab(541,1,tsec)

  call xsum_mpi(transf,spaceComm,ierr)
  call timab(48,2,tsec)

!FB  call timab(541,2,tsec)

  gramxbx(:,:)=transf(:,:,1)
  gramrbr(:,:)=transf(:,:,2)
  gramxbr(:,:)=transf(:,:,3)
  mpi_enreg%paral_level= old_paral_level
  i1=0;i2=blocksize;i3=2*blocksize;i4=3*blocksize
  cond: do cond_try=1,1 !2 when restart, but not implemented
   if (restart==0) then
!gramxap=matmul(transpose(blockvectorax),blockvectorp)
!gramrap=matmul(transpose(blockvectorar),blockvectorp)
!grampap=matmul(transpose(blockvectorap),blockvectorp)
     call dgemm('t','n',blocksize,blocksize,vectsize,one,blockvectorax,&
&               vectsize,blockvectorp,vectsize,zero,gramxap,blocksize)

     call dgemm('t','n',blocksize,blocksize,vectsize,one,blockvectorar,&
&               vectsize,blockvectorp,vectsize,zero,gramrap,blocksize)

     call dgemm('t','n',blocksize,blocksize,vectsize,one,blockvectorap,&
&               vectsize,blockvectorp,vectsize,zero,grampap,blocksize)
     old_paral_level= mpi_enreg%paral_level
     mpi_enreg%paral_level=3
     call xcomm_init(mpi_enreg,spaceComm)
     transf(:,:,1)=gramxap(:,:)
     transf(:,:,2)=gramrap(:,:)
     transf(:,:,3)=grampap(:,:)
     call timab(48,1,tsec)

!FB     call timab(541,1,tsec)

     call xsum_mpi(transf,spaceComm,ierr)
     call timab(48,2,tsec)

!FB     call timab(541,2,tsec)

     gramxap(:,:)=transf(:,:,1)
     gramrap(:,:)=transf(:,:,2)
     grampap(:,:)=transf(:,:,3)
     mpi_enreg%paral_level= old_paral_level
     bigorder=i4
     allocate(grama(i4,i4),gramb(i4,i4),eigen(i4),coordx(i4,blocksize))
     grama(i1+1:i2,i1+1:i2)=gramxax
     grama(i1+1:i2,i2+1:i3)=gramxar
     grama(i1+1:i2,i3+1:i4)=gramxap
     grama(i2+1:i3,i1+1:i2)=transpose(gramxar)
     grama(i2+1:i3,i2+1:i3)=gramrar
     grama(i2+1:i3,i3+1:i4)=gramrap
     grama(i3+1:i4,i1+1:i2)=transpose(gramxap)
     grama(i3+1:i4,i2+1:i3)=transpose(gramrap)
     grama(i3+1:i4,i3+1:i4)=grampap

!gramxbp=matmul(transpose(blockvectorbx),blockvectorp)
!gramrbp=matmul(transpose(blockvectorbr),blockvectorp)
!grampbp=matmul(transpose(blockvectorbp),blockvectorp)
     call dgemm('t','n',blocksize,blocksize,vectsize,one,blockvectorbx,&
&               vectsize,blockvectorp,vectsize,zero,gramxbp,blocksize)

     call dgemm('t','n',blocksize,blocksize,vectsize,one,blockvectorbr,&
&               vectsize,blockvectorp,vectsize,zero,gramrbp,blocksize)
     call dgemm('t','n',blocksize,blocksize,vectsize,one,blockvectorbp,&
&               vectsize,blockvectorp,vectsize,zero,grampbp,blocksize)
     old_paral_level= mpi_enreg%paral_level
     mpi_enreg%paral_level=3
     call xcomm_init(mpi_enreg,spaceComm)
     transf(:,:,1)=gramxbp(:,:)
     transf(:,:,2)=gramrbp(:,:)
     transf(:,:,3)=grampbp(:,:)
     call timab(48,1,tsec)

!FB     call timab(541,1,tsec)

     call xsum_mpi(transf,spaceComm,ierr)
     call timab(48,2,tsec)

!FB     call timab(541,2,tsec)

     gramxbp(:,:)=transf(:,:,1)
     gramrbp(:,:)=transf(:,:,2)
     grampbp(:,:)=transf(:,:,3)
     mpi_enreg%paral_level= old_paral_level
     gramb(i1+1:i2,i1+1:i2)=gramxbx
     gramb(i1+1:i2,i2+1:i3)=gramxbr
     gramb(i1+1:i2,i3+1:i4)=gramxbp
     gramb(i2+1:i3,i1+1:i2)=transpose(gramxbr)
     gramb(i2+1:i3,i2+1:i3)=gramrbr
     gramb(i2+1:i3,i3+1:i4)=gramrbp
     gramb(i3+1:i4,i1+1:i2)=transpose(gramxbp)
     gramb(i3+1:i4,i2+1:i3)=transpose(gramrbp)
     gramb(i3+1:i4,i3+1:i4)=grampbp
     else
     bigorder=i3
     allocate(grama(i3,i3),gramb(i3,i3),eigen(i3),coordx(i3,blocksize))
     grama(i1+1:i2,i1+1:i2)=gramxax
     grama(i1+1:i2,i2+1:i3)=gramxar
     grama(i2+1:i3,i1+1:i2)=transpose(gramxar)
     grama(i2+1:i3,i2+1:i3)=gramrar
     gramb(i1+1:i2,i1+1:i2)=gramxbx
     gramb(i1+1:i2,i2+1:i3)=gramxbr
     gramb(i2+1:i3,i1+1:i2)=transpose(gramxbr)
     gramb(i2+1:i3,i2+1:i3)=gramrbr
    end if
   end do cond
   lwork=3*bigorder
   allocate(work(lwork))
   call dsygv(1,'V','U',bigorder,grama,bigorder,gramb,bigorder,eigen,work,lwork,info)
   deallocate(work)
   do iblocksize=1,blocksize
    lambda(iblocksize,iblocksize)=eigen(iblocksize)
   end do
!DEBUG
!   write(6,*)'eigen',eigen(1:blocksize)
!ENDDEBUG
   if (restart==0 .and. iterationnumber >1) then
!DEBUG
!    write(6,*)restart,iterationnumber
!ENDDEBUG
   end if
   coordx(1:bigorder,1:blocksize)=grama(1:bigorder,1:blocksize)
   deallocate(grama,gramb,eigen)
!DEBUG
!   write(6,*)restart,iterationnumber
!ENDDEBUG
   if (restart==0 .and. iterationnumber >1) then
!        blockvectorp=matmul(blockvectorr,coordx(i2+1:i3,:))+&
!             &matmul(blockvectorp,coordx(i3+1:i4,:))
    call dgemm('n','n',vectsize,blocksize,blocksize,one,blockvectorr,&
&               vectsize,coordx(i2+1:i3,:),blocksize,zero,blockvectordumm,vectsize)
    call dgemm('n','n',vectsize,blocksize,blocksize,one,blockvectorp,&
&               vectsize,coordx(i3+1:i4,:),blocksize,one,blockvectordumm,vectsize)
    blockvectorp=blockvectordumm
!        blockvectorap=matmul(blockvectorar,coordx(i2+1:i3,:))+&
!             &matmul(blockvectorap,coordx(i3+1:i4,:))
    call dgemm('n','n',vectsize,blocksize,blocksize,one,blockvectorar,&
&               vectsize,coordx(i2+1:i3,:),blocksize,zero,blockvectordumm,vectsize)
    call dgemm('n','n',vectsize,blocksize,blocksize,one,blockvectorap,&
&               vectsize,coordx(i3+1:i4,:),blocksize,one,blockvectordumm,vectsize)
    blockvectorap=blockvectordumm
!        blockvectorbp=matmul(blockvectorbr,coordx(i2+1:i3,:))+&
!             &matmul(blockvectorbp,coordx(i3+1:i4,:))
    call dgemm('n','n',vectsize,blocksize,blocksize,one,blockvectorbr,&
&               vectsize,coordx(i2+1:i3,:),blocksize,zero,blockvectordumm,vectsize)
    call dgemm('n','n',vectsize,blocksize,blocksize,one,blockvectorbp,&
&               vectsize,coordx(i3+1:i4,:),blocksize,one,blockvectordumm,vectsize)
    blockvectorbp=blockvectordumm
    else
!blockvectorp =matmul(blockvectorr,coordx(i2+1:i3,:))
    call dgemm('n','n',vectsize,blocksize,blocksize,one,blockvectorr,&
&               vectsize,coordx(i2+1:i3,:),blocksize,zero,blockvectorp,vectsize)
!blockvectorap=matmul(blockvectorar,coordx(i2+1:i3,:))
    call dgemm('n','n',vectsize,blocksize,blocksize,one,blockvectorar,&
&               vectsize,coordx(i2+1:i3,:),blocksize,zero,blockvectorap,vectsize)
!        blockvectorbp=matmul(blockvectorbr,coordx(i2+1:i3,:))
    call dgemm('n','n',vectsize,blocksize,blocksize,one,blockvectorbr,&
&               vectsize,coordx(i2+1:i3,:),blocksize,zero,blockvectorbp,vectsize)
   end if

!blockvectorx = matmul(blockvectorx,coordx(i1+1:i2,:))+blockvectorp
   call dgemm('n','n',vectsize,blocksize,blocksize,one,blockvectorx,&
&               vectsize,coordx(i1+1:i2,:),blocksize,zero,blockvectordumm,vectsize)
   blockvectorx = blockvectordumm+blockvectorp
!blockvectorax= matmul(blockvectorax,coordx(i1+1:i2,:))+blockvectorap
   call dgemm('n','n',vectsize,blocksize,blocksize,one,blockvectorax,&
&               vectsize,coordx(i1+1:i2,:),blocksize,zero,blockvectordumm,vectsize)
   blockvectorax = blockvectordumm+blockvectorap
!blockvectorbx= matmul(blockvectorbx,coordx(i1+1:i2,:))+blockvectorbp
   call dgemm('n','n',vectsize,blocksize,blocksize,one,blockvectorbx,&
&               vectsize,coordx(i1+1:i2,:),blocksize,zero,blockvectordumm,vectsize)
   blockvectorbx = blockvectordumm+blockvectorbp
   deallocate(coordx)
  end do iter
!epilogue
!gramxbx=matmul(transpose(blockvectorx),blockvectorx)
  if(.false.) then
!call operators(blockvectorx,blockvectorbx)
   call dgemm('t','n',blocksize,blocksize,vectsize,one,blockvectorx,&
&               vectsize,blockvectorbx,vectsize,zero,gramxbx,blocksize)
!call operatorh(blockvectorx,blockvectorax)
!gramxax=matmul(transpose(blockvectorax),blockvectorx)
   call dgemm('t','n',blocksize,blocksize,vectsize,one,blockvectorx,&
&               vectsize,blockvectorax,vectsize,zero,gramxax,blocksize)
   allocate(eigen(blocksize))
!call la_sygv(gramxax,gramxbx,eigen,itype=1,jobz='v')
   lambda=zero
   do iblocksize=1,blocksize
    lambda(iblocksize,iblocksize)=eigen(iblocksize)
   end do
!DEBUG
!   write(6,*)'eigen at the end',eigen
!ENDDEBUG
!debug
!blockvectorr=blockvectorax-matmul(blockvectorx,lambda)
!residualnorms=sqrt(sum(blockvectorr**2,dim=1))
!write(6,*)'residualnorm at the end bef orth',residualnorms
!debug
!blockvectorx=matmul(blockvectorx,gramxax)
   call dgemm('n','n',vectsize,blocksize,blocksize,one,blockvectorx,&
&               vectsize,gramxax,blocksize,zero,blockvectordumm,vectsize)
   blockvectorx=blockvectordumm
!blockvectorax=matmul(blockvectorax,gramxax)
   call dgemm('n','n',vectsize,blocksize,blocksize,one,blockvectorax,&
&               vectsize,gramxax,blocksize,zero,blockvectordumm,vectsize)
   blockvectorax=blockvectordumm
!blockvectorbx=matmul(blockvectorbx,gramxax)
   call dgemm('n','n',vectsize,blocksize,blocksize,one,blockvectorbx,&
&               vectsize,gramxax,blocksize,zero,blockvectordumm,vectsize)
   blockvectorbx=blockvectordumm
!blockvectorr=blockvectorax-matmul(blockvectorbx,lambda)
!         call dgemm('n','n',vectsize,blocksize,blocksize,one,blockvectorx,&
!&               vectsize,lambda,blocksize,zero,blockvectordumm,vectsize)
!  blockvectorr=blockvectorax-blockvectordumm
   do iblocksize=1,blocksize
    blockvectorr(:,iblocksize)=blockvectorax(:,iblocksize)-blockvectorbx(:,iblocksize)*eigen(iblocksize)
   end do
   deallocate(eigen)
  end if
  residualnorms=sum(blockvectorr**2,dim=1)
  old_paral_level= mpi_enreg%paral_level
  mpi_enreg%paral_level=3
  call xcomm_init(mpi_enreg,spaceComm)
  call timab(48,1,tsec)

!FB  call timab(541,1,tsec)

  call xsum_mpi(residualnorms,spaceComm,ierr)
  call timab(48,2,tsec)

!FB  call timab(541,2,tsec)

  mpi_enreg%paral_level= old_paral_level
  residualnorms=sqrt(residualnorms)
  do iblocksize=1,blocksize
   iband=iblocksize+(iblock-1)*maxblocksize
   if (mpi_enreg%me_g0 == 1) then
    cg(1,cgindex(iband))=blockvectorx(1,iblocksize)
    cg(2,cgindex(iband))=zero
    cg(1,cgindex(iband)+1:cgindex(iband+1)-1)=blockvectorx(2:rvectsize,iblocksize)/sq2
    cg(2,cgindex(iband)+1:cgindex(iband+1)-1)=blockvectorx(rvectsize+1:vectsize,iblocksize)/sq2
    else
    cg(1,cgindex(iband):cgindex(iband+1)-1)=blockvectorx(1:rvectsize,iblocksize)/sq2
    cg(2,cgindex(iband):cgindex(iband+1)-1)=blockvectorx(rvectsize+1:vectsize,iblocksize)/sq2
   end if
  end do
  if(gen_eigenpb) then
   do iblocksize=1,blocksize
    iband=iblocksize+(iblock-1)*maxblocksize
    if (mpi_enreg%me_g0 == 1) then
     gsc(1,gscindex(iband))=blockvectorbx(1,iblocksize)
     gsc(2,gscindex(iband))=zero
     gsc(1,gscindex(iband)+1:gscindex(iband+1)-1)=blockvectorbx(2:rvectsize,iblocksize)/sq2
     gsc(2,gscindex(iband)+1:gscindex(iband+1)-1)=blockvectorbx(rvectsize+1:vectsize,iblocksize)/sq2
     else
     gsc(1,gscindex(iband):gscindex(iband+1)-1)=blockvectorbx(1:rvectsize,iblocksize)/sq2
     gsc(2,gscindex(iband):gscindex(iband+1)-1)=blockvectorbx(rvectsize+1:vectsize,iblocksize)/sq2
    end if
   end do
  end if
!this should not exist,since this induce one too much getghc.lazy programming....
!call operatorh(blockvectorx,blockvectorax,subham,subvnl)!fill also subham, subvnl
  allocate(cwavef(2,npw_k*nspinor))
  allocate(gwavef(2,npw_k*nspinor),gvnlc(2,npw_k*nspinor))
  isubh=1+2*(iblock-1)*maxblocksize*((iblock-1)*maxblocksize+1)/2
  if (block_to_complete) then
   cwavef(:,:)=zero
  end if
  sij_opt=0
  do iblocksize=1,blocksize
   if (mpi_enreg%me_g0 == 1) then
    cwavef(1,2:npw_k*nspinor)=blockvectorx(2:npw_k*nspinor,iblocksize)/sq2
    cwavef(2,2:npw_k*nspinor)=blockvectorx(npw_k*nspinor+1:2*npw_k*nspinor-1,iblocksize)/sq2
    cwavef(1,1)=blockvectorx(1,iblocksize)
    cwavef(2,1)=zero
    gwavef(1,2:npw_k*nspinor)=blockvectorax(2:npw_k*nspinor,iblocksize)/sq2
    gwavef(2,2:npw_k*nspinor)=blockvectorax(npw_k*nspinor+1:2*npw_k*nspinor-1,iblocksize)/sq2
    gwavef(1,1)=blockvectorax(1,iblocksize)
    gwavef(2,1)=zero
   else
    cwavef(1,1:npw_k*nspinor)=blockvectorx(1:npw_k*nspinor,iblocksize)/sq2
    cwavef(2,1:npw_k*nspinor)=blockvectorx(npw_k*nspinor+1:2*npw_k*nspinor,iblocksize)/sq2
    gwavef(1,1:npw_k*nspinor)=blockvectorax(1:npw_k*nspinor,iblocksize)/sq2
    gwavef(2,1:npw_k*nspinor)=blockvectorax(npw_k*nspinor+1:2*npw_k*nspinor,iblocksize)/sq2
   end if
    choice=1 ; signs=2 ; idir=0 ; tim_nonlop=1 ; cpopt=-1 ; paw_opt=0 ; nnlout=0; nkpg=0; sij_opt=0
   call nonlop(gs_hamk%atindx1,choice,cpopt,cprj_dum,gs_hamk%dimekb1,gs_hamk%dimekb2,&
&              dimffnl,dimffnl,gs_hamk%ekb,&
&              enlout,ffnl,ffnl,gs_hamk%gmet,gs_hamk%gprimd,idir,gs_hamk%indlmn,&
&              istwf_k,kg_k,kg_k,kpg_dum,kpg_dum,gs_hamk%kpoint,gs_hamk%kpoint,dum,lmnmax,matblk,&
&              mgfft,mpi_enreg,mpsang,mpssoang,natom,gs_hamk%nattyp,gs_hamk%ngfft,nkpg,nkpg,&
&              gs_hamk%nloalg,nnlout,npw_k,npw_k,nspinor,ntypat,paw_opt,gs_hamk%phkxred,&
&              gs_hamk%phkxred,gs_hamk%ph1d,ph3d,ph3d,gs_hamk%pspso,signs,nonlop_dum,&
&              nonlop_dum,tim_nonlop,gs_hamk%ucvol,gs_hamk%useylm,cwavef,gvnlc)
   do ii=1,(iblock-1)*maxblocksize+iblocksize
    iwavef=(ii-1)*npw_k*nspinor+icg
    if (mpi_enreg%me_g0 == 1) then
     ipw1=2;chcre=0.5_dp*cg(1,1+iwavef)*gwavef(1,1)
    else
     ipw1=1;chcre=zero
    end if
    if (gs_hamk%usepaw==1) then
     do ipw=ipw1,npw_k*nspinor
      cgreipw=cg(1,ipw+iwavef);cgimipw=cg(2,ipw+iwavef)
      chcre=chcre+cgreipw*gwavef(1,ipw)+cgimipw*gwavef(2,ipw)
     end do
    else
     if (mpi_enreg%me_g0 == 1) then
      cvcre=0.5_dp*cg(1,1+iwavef)*gvnlc(1,1)
     else
      cvcre=zero
     end if
     do ipw=ipw1,npw_k*nspinor
      cgreipw=cg(1,ipw+iwavef);cgimipw=cg(2,ipw+iwavef)
      chcre=chcre+cgreipw*gwavef(1,ipw)+cgimipw*gwavef(2,ipw)
      cvcre=cvcre+cgreipw*gvnlc(1,ipw)+cgimipw*gvnlc(2,ipw)
     end do
!    Store real and imag parts in hermitian storage mode:
     subvnl(isubh)=2.0_dp*cvcre ; subvnl(isubh+1)=zero
    end if
!   Store real and imag parts in hermitian storage mode:
    subham(isubh)=2.0_dp*chcre ; subham(isubh+1)=zero
    isubh=isubh+2
   end do
  end do
!comm for subham is made in vtowfk

  deallocate(cwavef,gwavef,gvnlc)
!call operators(blockvectorx,blockvectorbx,subovl)!fill also  subovl
 if((gen_eigenpb).and.(use_subovl==1)) then
  allocate(cwavef(2,npw_k*nspinor))
  allocate(gwavef(2,npw_k*nspinor))
  isubo=1+2*(iblock-1)*maxblocksize*((iblock-1)*maxblocksize+1)/2
  do iblocksize=1,blocksize
   if (mpi_enreg%me_g0 == 1) then
    cwavef(1,2:npw_k*nspinor)=blockvectorx(2:npw_k*nspinor,iblocksize)/sq2
    cwavef(2,2:npw_k*nspinor)=blockvectorx(npw_k*nspinor+1:2*npw_k*nspinor-1,iblocksize)/sq2
    cwavef(1,1)=blockvectorx(1,iblocksize)
    cwavef(2,1)=zero
   else
    cwavef(1,1:npw_k*nspinor)=blockvectorx(1:npw_k*nspinor,iblocksize)/sq2
    cwavef(2,1:npw_k*nspinor)=blockvectorx(npw_k*nspinor+1:2*npw_k*nspinor,iblocksize)/sq2
   end if
!  Call to nonlop: compute <g|S|c>
   choice=1 ; signs=2 ; idir=0 ; tim_nonlop=1 ; cpopt=-1 ; paw_opt=3 ; nnlout=0 ; nkpg=0
   call nonlop(gs_hamk%atindx1,choice,cpopt,cprj_dum,gs_hamk%dimekb1,0,dimffnl,dimffnl,dummy2,&
&              dummy1,ffnl,ffnl,gs_hamk%gmet,gs_hamk%gprimd,idir,gs_hamk%indlmn,&
&              istwf_k,kg_k,kg_k,kpg_dum,kpg_dum,gs_hamk%kpoint,gs_hamk%kpoint,dum,lmnmax,matblk,&
&              mgfft,mpi_enreg,mpsang,mpssoang,natom,gs_hamk%nattyp,gs_hamk%ngfft,nkpg,nkpg,&
&              gs_hamk%nloalg,nnlout,npw_k,npw_k,nspinor,ntypat,paw_opt,gs_hamk%phkxred,&
&              gs_hamk%phkxred,gs_hamk%ph1d,ph3d,ph3d,gs_hamk%pspso,signs,gs_hamk%sij,&
&              gwavef,tim_nonlop,gs_hamk%ucvol,gs_hamk%useylm,cwavef,cwavef)
   if (mpi_enreg%me_g0 == 1) then
    blockvectorbx(2:npw_k*nspinor,iblocksize)=gwavef(1,2:npw_k*nspinor)*sq2
    blockvectorbx(npw_k*nspinor+1:2*npw_k*nspinor-1,iblocksize)=gwavef(2,2:npw_k*nspinor)*sq2
    blockvectorbx(1,iblocksize)=gwavef(1,1)
   else
    blockvectorbx(1:npw_k*nspinor,iblocksize)=gwavef(1,1:npw_k*nspinor)*sq2
    blockvectorbx(npw_k*nspinor+1:2*npw_k*nspinor,iblocksize)=gwavef(2,1:npw_k*nspinor)*sq2
   end if
   do ii=1,(iblock-1)*maxblocksize+iblocksize
    iwavef=(ii-1)*npw_k*nspinor+icg
    if (istwf_k==2 .and. mpi_enreg%me_g0 == 1) then
     ipw1=2;cscre=0.5_dp*cg(1,1+iwavef)*gwavef(1,1)
    else
     ipw1=1;cscre=zero
    end if
    do ipw=ipw1,npw_k*nspinor
     cscre=cscre+cg(1,ipw+iwavef)*gwavef(1,ipw)+cg(2,ipw+iwavef)*gwavef(2,ipw)
    end do
    cscre=2.0_dp*cscre
!   Store real and imag parts in hermitian storage mode:
    subovl(isubo)=cscre ; subovl(isubo+1)=zero
    isubo=isubo+2
   end do
  end do
  deallocate(cwavef,gwavef)
 end if
!DEBUG
!write(6,*)'residualnorm at the end',residualnorms
!ENDDEBUG

 deallocate(blockvectory,blockvectorby,gramyx)
 deallocate(blockvectorbx,blockvectorbr,blockvectorbp) ! added by MM
 deallocate(blockvectorx,blockvectorax)
 deallocate(blockvectorr,blockvectorar)
 deallocate(blockvectorp,blockvectorap)
 deallocate(blockvectordumm)
 deallocate(gramxax,gramxar,gramxap,gramrar,gramrap,grampap,gramxbx,gramxbr,&
& gramxbp,gramrbr,gramrbp,grampbp,transf)
 deallocate(lambda)
 deallocate(residualnorms)

!End big loop over bands inside blocks
 end do

 call timab(530,2,tsec)

!DEBUG
 write(6,*)' lobpcgwf : exit '
!ENDDEBUG

end subroutine lobpcgwf
!!***
subroutine orthonormalize(blockvectorx,blockvectorbx,blocksize,mpi_enreg,sqgram,vectsize)

 use defs_basis
 use defs_datatypes

!This section has been created automatically by the script Abilint (TD). Do not modify these by hand.
#ifdef HAVE_FORTRAN_INTERFACES
 use interfaces_01manage_mpi
 use interfaces_lib01hidempi
#else
 use defs_xfuncmpi
#endif
!End of the abilint section

 implicit none

!Arguments ------------------------------------
 integer, intent(in) :: blocksize,vectsize
 real(dp) :: blockvectorx(vectsize,blocksize)
 real(dp) :: blockvectorbx(vectsize,blocksize),sqgram(blocksize,blocksize)
 type(mpi_type) :: mpi_enreg

!Local variables-------------------------------
!  real(dp) :: zero=0.0_dp, one=1.0_dp
 integer :: iblocksize,ierr,info,jblocksize
 integer :: old_paral_level,spaceComm
 real(dp) :: tsec(2)

#ifdef VMS
!DEC$ ATTRIBUTES ALIAS:'DGEMM' :: dgemm
!DEC$ ATTRIBUTES ALIAS:'DPOTRF' :: dpotrf
!DEC$ ATTRIBUTES ALIAS:'DSYEV' :: dsyev
!DEC$ ATTRIBUTES ALIAS:'DSYGV' :: dsygv
!DEC$ ATTRIBUTES ALIAS:'DTRSM' :: dtrsm
#endif
  call dgemm('t','n',blocksize,blocksize,vectsize,one,blockvectorx,&
       &               vectsize,blockvectorbx,vectsize,zero,sqgram,blocksize)
  old_paral_level= mpi_enreg%paral_level
  mpi_enreg%paral_level=3
  call xcomm_init(mpi_enreg,spaceComm)
  call timab(48,1,tsec)

!FB  call timab(541,1,tsec)

  call xsum_mpi(sqgram,spaceComm,ierr)
  call timab(48,2,tsec)

!FB  call timab(541,2,tsec)

  mpi_enreg%paral_level= old_paral_level
  call dpotrf('u',blocksize,sqgram,blocksize,info)
  call dtrsm('r','u','n','n',vectsize,blocksize,one,sqgram,blocksize,&
       &              blockvectorx,vectsize)
end subroutine orthonormalize
!{\src2tex{textfont=tt}}
!!****f* abinit/precon2
!!
!! name
!! precon2
!!
!! function
!! precondition $<g|(h-e_{n,k})|c_{n,k}>$
!!
!! copyright
!! Copyright (C) 1998-2007 ABINIT group (dca, xg, gmr)
!! this file is distributed under the terms of the
!! gnu general public license, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! for the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! inputs
!!  $cg(2,npw)=<g|c_{n,k}>$.
!!  $eval=current band eigenvalue=<c_{n,k}|h|c_{n,k}>$.
!!  istwf_k=option parameter that describes the storage of wfs
!!  kinpw(npw)=(modified) kinetic energy for each plane wave (hartree)
!!  nspinor=number of spinorial components of the wavefunctions
!!  $vect(2,npw)=<g|h|c_{n,k}>$.
!!  npw=number of planewaves at this k point.
!!  vectsize= size of vectors
!!
!! output
!!  $vect(2,npw)=<g|(h-eval)|c_{n,k}>*(polynomial ratio)$
!!
!! parents
!!      cgwf3
!!
!! children
!!      wrtout
!!
!! source

subroutine precon2(cg,eval,istwf_k,kinpw,mpi_enreg,npw,nspinor,ghc,vect,vectsize)

 use defs_basis
 use defs_datatypes

!This section has been created automatically by the script Abilint (TD). Do not modify these by hand.
#ifdef HAVE_FORTRAN_INTERFACES
 use interfaces_01manage_mpi
 use interfaces_lib01hidempi
#else
 use defs_xfuncmpi
#endif
!End of the abilint section

 implicit none

!arguments ------------------------------------
 integer :: istwf_k,npw,nspinor,vectsize
 real(dp) :: eval
 real(dp) :: cg(vectsize),ghc(vectsize),kinpw(npw),vect(vectsize)
 type(mpi_type) :: mpi_enreg

!Local variables-------------------------------
 integer :: ierr,ig,igs,ipw1,ispinor,old_paral_level,spaceComm
 real(dp) :: ek0,ek0_inv,fac,poly,xx
 real(dp) :: tsec(2)
 character(len=500) :: message

! *************************************************************************
!
!compute mean kinetic energy of band
 if(istwf_k==1)then
  ek0=0.0_dp
  do ispinor=1,nspinor
   igs=(ispinor-1)*npw
!$omp parallel do private(ig) reduction(+:ek0) &
!$omp&shared(cg,igs,kinpw,npw)
   do ig=1+igs,npw+igs
    if(kinpw(ig-igs)<huge(0.0_dp)*1.d-11)then
     ek0=ek0+kinpw(ig-igs)*(cg(ig)**2)
    end if
   end do
!$omp end parallel do
  end do
 else if (istwf_k>=2)then
  if (istwf_k==2 .and. mpi_enreg%me_g0 == 1)then
   ek0=0.0_dp ; ipw1=2
   if(kinpw(1)<huge(0.0_dp)*1.d-11)ek0=0.5_dp*kinpw(1)*cg(1)**2
!$omp parallel do private(ig) reduction(+:ek0) &
!$omp&shared(cg,ipw1,kinpw,npw)
    do ig=ipw1,npw
     if(kinpw(ig)<huge(0.0_dp)*1.d-11)then
      ek0=ek0+kinpw(ig)*(cg(ig)**2+cg(ig+npw-1)**2)
     end if
    end do
!$omp end parallel do
    else
    ek0=0.0_dp ; ipw1=1
!$omp parallel do private(ig) reduction(+:ek0) &
!$omp&shared(cg,ipw1,kinpw,npw)
    do ig=ipw1,npw
     if(kinpw(ig)<huge(0.0_dp)*1.d-11)then
      ek0=ek0+kinpw(ig)*(cg(ig)**2+cg(ig+npw)**2)
     end if
    end do
!$omp end parallel do
  end if
 end if

!debug
!write(6,*)' precon : ek0,kinpw(1)=',ek0,kinpw(1)
!enddebug

  old_paral_level= mpi_enreg%paral_level
  mpi_enreg%paral_level=3
  call xcomm_init(mpi_enreg,spaceComm)
  call timab(48,1,tsec)

!FB  call timab(541,1,tsec)

  call xsum_mpi(ek0,spaceComm,ierr)
  call timab(48,2,tsec)

!FB  call timab(541,2,tsec)

  mpi_enreg%paral_level= old_paral_level

 if(ek0<1.0d-10)then
  write(message, '(a,a,a,a,a,a)' )ch10,&
&  ' precon : warning -',ch10,&
&  '  the mean kinetic energy of a wavefunction vanishes.',ch10,&
&  '  it is reset to 0.1ha.'
  call wrtout(6,message,'pers')
  ek0=0.1_dp
 end if

 ek0_inv=1.0_dp/ek0
!
!carry out preconditioning
 do ispinor=1,nspinor
  igs=(ispinor-1)*npw
  if (istwf_k==2 .and. mpi_enreg%me_g0 == 1) then
!$omp parallel do private(fac,ig,poly,xx) &
!$omp&shared(cg,ek0_inv,eval,kinpw,igs,npw,vect)
   do ig=1+igs,1+igs !g=0
    if(kinpw(ig-igs)<huge(0.0_dp)*1.d-11)then
     xx=kinpw(ig-igs)*ek0_inv
!    teter polynomial ratio
     poly=27._dp+xx*(18._dp+xx*(12._dp+xx*8._dp))
     fac=poly/(poly+16._dp*xx**4)
     vect(ig)=( ghc(ig)-eval*cg(ig) )*fac
    else
     vect(ig)=0.0_dp
    end if
   end do
   do ig=2+igs,npw+igs
    if(kinpw(ig-igs)<huge(0.0_dp)*1.d-11)then
     xx=kinpw(ig-igs)*ek0_inv
!    teter polynomial ratio
     poly=27._dp+xx*(18._dp+xx*(12._dp+xx*8._dp))
     fac=poly/(poly+16._dp*xx**4)
     vect(ig)=( ghc(ig)-eval*cg(ig) )*fac
     vect(ig+npw-1)=( ghc(ig+npw-1)-eval*cg(ig+npw-1) )*fac
    else
     vect(ig)=zero
     vect(ig+npw-1)=zero
    end if
   end do
   else
   do ig=1+igs,npw+igs
    if(kinpw(ig-igs)<huge(0.0_dp)*1.d-11)then
     xx=kinpw(ig-igs)*ek0_inv
!    teter polynomial ratio
     poly=27._dp+xx*(18._dp+xx*(12._dp+xx*8._dp))
     fac=poly/(poly+16._dp*xx**4)
     vect(ig)=( ghc(ig)-eval*cg(ig) )*fac
     vect(ig+npw)=( ghc(ig+npw)-eval*cg(ig+npw) )*fac
    else
     vect(ig)=zero
     vect(ig+npw)=zero
    end if
   end do
  end if
!$omp end parallel do
 end do
end subroutine precon2
!!***
