diff --git a/OverlapHelement.C b/OverlapHelement.C index 18246dc..8362c37 100644 --- a/OverlapHelement.C +++ b/OverlapHelement.C @@ -39,7 +39,7 @@ int main(int argc, char* argv []) { //readMPSFromDiskAndInitializeStaticVariables(); //initializeGlobalMPS(mpsstate); - if (rank =0) + if (rank == 0) printf("Reading file %s\n", argv[2]); test(argv[2]); //exit(0); diff --git a/StackOperators.C b/StackOperators.C index 0344332..8dc5056 100644 --- a/StackOperators.C +++ b/StackOperators.C @@ -493,14 +493,14 @@ void SpinAdapted::StackDesCre::build(StackMatrix& m, int row, int col, const Sta { const boost::shared_ptr<StackSparseMatrix> op1 = leftBlock->get_op_rep(DES, -getSpinQuantum(i), i); boost::shared_ptr<StackSparseMatrix> op2 = rightBlock->get_op_rep(CRE, getSpinQuantum(j), j); - SpinAdapted::operatorfunctions::TensorProductElement(leftBlock, *op1, *op2, &b, &(b.get_stateInfo()), *this, m, row, col, 1.0); + double parity = getCommuteParity(op1->get_deltaQuantum()[0], op2->get_deltaQuantum()[0], get_deltaQuantum()[0]); + SpinAdapted::operatorfunctions::TensorProductElement(rightBlock, *op2, *op1, &b, &(b.get_stateInfo()), *this, m, row, col, parity); } else if (rightBlock->get_op_array(DES).has(i)) { const boost::shared_ptr<StackSparseMatrix> op1 = rightBlock->get_op_rep(DES, -getSpinQuantum(i), i); boost::shared_ptr<StackSparseMatrix> op2 = leftBlock->get_op_rep(CRE, getSpinQuantum(j), j); - double parity = getCommuteParity(-op1->get_deltaQuantum()[0], op2->get_deltaQuantum()[0], get_deltaQuantum()[0]); - SpinAdapted::operatorfunctions::TensorProductElement(rightBlock, *op1, *op2, &b, &(b.get_stateInfo()), *this, m, row, col, 1.0*parity); + SpinAdapted::operatorfunctions::TensorProductElement(leftBlock, *op2, *op1, &b, &(b.get_stateInfo()), *this, m, row, col, 1.0); } else abort(); @@ -549,19 +549,19 @@ void SpinAdapted::StackDesCre::build(const StackSpinBlock& b) SpinAdapted::operatorfunctions::TensorProduct(rightBlock, *op, *Overlap, &b, &(b.get_stateInfo()), *this, 1.0); dmrginp.makeopsT -> stop(); return; - } + } if (leftBlock->get_op_array(DES).has(i)) { const boost::shared_ptr<StackSparseMatrix> op1 = leftBlock->get_op_rep(DES, -getSpinQuantum(i), i); const boost::shared_ptr<StackSparseMatrix> op2 = rightBlock->get_op_rep(CRE, getSpinQuantum(j), j); - SpinAdapted::operatorfunctions::TensorProduct(rightBlock, *op2, *op1, &b, &(b.get_stateInfo()), *this, 1.0); + double parity = getCommuteParity(op1->get_deltaQuantum()[0], op2->get_deltaQuantum()[0], get_deltaQuantum()[0]); + SpinAdapted::operatorfunctions::TensorProduct(rightBlock, *op2, *op1, &b, &(b.get_stateInfo()), *this, parity); } else if (rightBlock->get_op_array(DES).has(i)) { const boost::shared_ptr<StackSparseMatrix> op1 = rightBlock->get_op_rep(DES, -getSpinQuantum(i), i); const boost::shared_ptr<StackSparseMatrix> op2 = leftBlock->get_op_rep(CRE, getSpinQuantum(j), j); - double parity = getCommuteParity(op1->get_deltaQuantum()[0], op2->get_deltaQuantum()[0], get_deltaQuantum()[0]); - SpinAdapted::operatorfunctions::TensorProduct(rightBlock, *op1, *op2, &b, &(b.get_stateInfo()), *this, 1.0*parity); + SpinAdapted::operatorfunctions::TensorProduct(leftBlock, *op2, *op1, &b, &(b.get_stateInfo()), *this, 1.0); } else abort(); @@ -934,9 +934,9 @@ void SpinAdapted::StackCreDesComp::buildfromCreDes(StackSpinBlock& b) TensorOp CD1(i, j, 1, -1, (-deltaQuantum[0].get_s()).getirrep(), (-sym).getirrep()); std::vector<boost::shared_ptr<StackSparseMatrix> > allops1; - int numCD = 0, numDC = 0; + int numCD = 0, numDC = 0, numCC = 0; std::vector<double> scaleCD, scaleCC; - if (dmrginp.hamiltonian() == BCS) scaleCC.resize(b.get_op_array(CRE_CRE).get_size()*2, 0.0); + if (dmrginp.hamiltonian() == BCS) 
scaleCC.resize(b.get_op_array(CRE_CRE).get_size()*4, 0.0); //for each CD there is spin0, spin 1 and normal and transpose if (dmrginp.spinAdapted()) scaleCD.resize(b.get_op_array(CRE_DES).get_size()*8, 0.0); else scaleCD.resize(b.get_op_array(CRE_DES).get_size()*4, 0.0); @@ -944,51 +944,52 @@ void SpinAdapted::StackCreDesComp::buildfromCreDes(StackSpinBlock& b) for (int ji=0; jiget_deltaQuantum(0) == deltaQuantum[0] || b.get_op_array(CRE_DES).get_local_element(ii)[ji]->get_deltaQuantum(0) == -deltaQuantum[0]) { - allops1.push_back(b.get_op_array(CRE_DES).get_local_element(ii)[ji]); - const int k = allops1.back()->get_orbs()[0]; - const int l = allops1.back()->get_orbs()[1]; - - //TensorOp CK(k,1), DL(l,-1); - //TensorOp CD2 = CK.product(DL, spin, sym.getirrep()); - TensorOp CD2(k, l, 1, -1, spin, sym.getirrep()); - if (!CD2.empty) - scaleCD[2*(allops1.size()-1)] = calcCompfactor(CD1, CD2, CD,*(b.get_twoInt()), b.get_integralIndex()); - - //CK=TensorOp(l,1); DL=TensorOp(k,-1); - //CD2 = CK.product(DL, spin, sym.getirrep()); - CD2 = TensorOp(l, k, 1, -1, spin, sym.getirrep()); - if (!b.has(DES) && l!=k && !CD2.empty) { - double scaleV = calcCompfactor(CD1, CD2, CD,*(b.get_twoInt()), b.get_integralIndex()); - if (dmrginp.spinAdapted()) { - scaleV *= getCommuteParity(getSpinQuantum(l), getSpinQuantum(k), get_deltaQuantum()[0]); - } - scaleCD[2*(allops1.size()-1) + 1] = scaleV; - } + allops1.push_back(b.get_op_array(CRE_DES).get_local_element(ii)[ji]); + const int k = allops1.back()->get_orbs()[0]; + const int l = allops1.back()->get_orbs()[1]; + + //TensorOp CK(k,1), DL(l,-1); + //TensorOp CD2 = CK.product(DL, spin, sym.getirrep()); + TensorOp CD2(k, l, 1, -1, spin, sym.getirrep()); + if (!CD2.empty) + scaleCD[2*(allops1.size()-1)] = calcCompfactor(CD1, CD2, CD,*(b.get_twoInt()), b.get_integralIndex()); + + //CK=TensorOp(l,1); DL=TensorOp(k,-1); + //CD2 = CK.product(DL, spin, sym.getirrep()); + CD2 = TensorOp(l, k, 1, -1, spin, sym.getirrep()); + if (!b.has(DES) && l!=k && !CD2.empty) { + double scaleV = calcCompfactor(CD1, CD2, CD,*(b.get_twoInt()), b.get_integralIndex()); + if (dmrginp.spinAdapted()) { + scaleV *= getCommuteParity(getSpinQuantum(l), getSpinQuantum(k), get_deltaQuantum()[0]); + } + scaleCD[2*(allops1.size()-1) + 1] = scaleV; + } } numCD = allops1.size(); if(b.has(DES)) { for (int ii=0; iiget_deltaQuantum(0) == -deltaQuantum[0]) { - allops1.push_back(b.get_op_array(DES_CRE).get_local_element(ii)[ji]); - - const int l = allops1.back()->get_orbs()[0]; - const int k = allops1.back()->get_orbs()[1]; - - if (k!=l) { - //TensorOp CK(k,1), DL(l,-1); - //TensorOp CD2 = CK.product(DL, spin, sym.getirrep()); - TensorOp CD2(k, l, 1, -1, spin, sym.getirrep()); - if (!CD2.empty) { - double scaleV = calcCompfactor(CD1, CD2, CD,*(b.get_twoInt()), b.get_integralIndex()); + if (b.get_op_array(DES_CRE).get_local_element(ii)[ji]->get_deltaQuantum(0) == deltaQuantum[0]) { + allops1.push_back(b.get_op_array(DES_CRE).get_local_element(ii)[ji]); + + const int l = allops1.back()->get_orbs()[0]; + const int k = allops1.back()->get_orbs()[1]; + + if (k!=l) { + //TensorOp CK(k,1), DL(l,-1); + //TensorOp CD2 = CK.product(DL, spin, sym.getirrep()); + TensorOp CD2(k, l, 1, -1, spin, sym.getirrep()); + if (!CD2.empty) { + double scaleV = calcCompfactor(CD1, CD2, CD,*(b.get_twoInt()), b.get_integralIndex()); + // FIXME does this need to be used for nonspinadapted as well? 
if (dmrginp.spinAdapted()) { - scaleV *= getCommuteParity(getSpinQuantum(l), getSpinQuantum(k), get_deltaQuantum()[0]); + scaleV *= getCommuteParity(getSpinQuantum(l), getSpinQuantum(k), get_deltaQuantum()[0]); } - scaleCD[2*(allops1.size()-1)] = scaleV; - } - } - } + scaleCD[2*(allops1.size()-1)] = scaleV; + } + } + } } @@ -1013,7 +1014,7 @@ void SpinAdapted::StackCreDesComp::buildfromCreDes(StackSpinBlock& b) //CK = TensorOp(k,1); //TensorOp CC2_commute = CL.product(CK, spin, sym.getirrep(), k==l); TensorOp CC2_commute(l, k, 1, 1, spin, sym.getirrep(), k == l); - int parity = getCommuteParity(getSpinQuantum(l),getSpinQuantum(k),get_deltaQuantum()[0]); + int parity = getCommuteParity(getSpinQuantum(l),getSpinQuantum(k),get_deltaQuantum()[1]); scaleCC[2*(allops1.size()-numDC-1)] = calcCompfactor(CD1, CC2, CD, v_cccd[b.get_integralIndex()]); if (k != l) scaleCC[2*(allops1.size()-numDC-1)] += parity * calcCompfactor(CD1, CC2_commute, CD, v_cccd[b.get_integralIndex()]); @@ -1029,16 +1030,32 @@ void SpinAdapted::StackCreDesComp::buildfromCreDes(StackSpinBlock& b) //DL = TensorOp(l,-1); //TensorOp DD2_commute = DK.product(DL, spin, sym.getirrep(), k == l); TensorOp DD2_commute(k, l, -1, -1, spin, sym.getirrep(), k == l); - int parity = getCommuteParity(getSpinQuantum(l),getSpinQuantum(k),get_deltaQuantum()[0]); - scaleCC[2*(allops1.size()-numCD-1)+1] = calcCompfactor(CD1, DD2, CD, v_cccd[b.get_integralIndex()]); + int parity = getCommuteParity(-getSpinQuantum(l),-getSpinQuantum(k),get_deltaQuantum()[2]); + scaleCC[2*(allops1.size()-numDC-1)+1] = calcCompfactor(CD1, DD2, CD, v_cccd[b.get_integralIndex()]); if (k != l) - scaleCC[2*(allops1.size()-numCD-1)+1] += parity * calcCompfactor(CD1, DD2_commute, CD, v_cccd[b.get_integralIndex()]); + scaleCC[2*(allops1.size()-numDC-1)+1] += parity * calcCompfactor(CD1, DD2_commute, CD, v_cccd[b.get_integralIndex()]); } } - + numCC = allops1.size(); if (b.has(DES)) { - pout << "buildfromCreDes with DES in BCS not implemented" << endl; - abort(); + for (int ii = 0; ii < b.get_op_array(DES_DES).get_size(); ++ii) + for (int ji = 0; ji < b.get_op_array(DES_DES).get_local_element(ii).size(); ++ji) + if (b.get_op_array(DES_DES).get_local_element(ii)[ji]->get_deltaQuantum(0).get_s() == deltaQuantum[0].get_s() || + b.get_op_array(DES_DES).get_local_element(ii)[ji]->get_deltaQuantum(0).get_s() == -deltaQuantum[0].get_s()) { + allops1.push_back(b.get_op_array(DES_DES).get_local_element(ii)[ji]); + + const int k = allops1.back()->get_orbs()[0]; + const int l = allops1.back()->get_orbs()[1]; + TensorOp DD2(k, l, -1, -1, spin, sym.getirrep(), k == l); + + if (!DD2.empty) { + TensorOp DD2_commute(l, k, -1, -1, spin, sym.getirrep(), k == l); + int parity = getCommuteParity(-getSpinQuantum(l),-getSpinQuantum(k),get_deltaQuantum()[2]); + scaleCC[2*(allops1.size()-numDC-1)] = calcCompfactor(CD1, DD2, CD, v_cccd[b.get_integralIndex()]); + if (k != l) + scaleCC[2*(allops1.size()-numDC-1)] += parity * calcCompfactor(CD1, DD2_commute, CD, v_cccd[b.get_integralIndex()]); + } + } } } @@ -1065,33 +1082,37 @@ void SpinAdapted::StackCreDesComp::buildfromCreDes(StackSpinBlock& b) //this is a cd operator if (opindex allowed(lQ, rQ) && allowed(lQ, rQ) && fabs(scaleCD[2*opindex]) >TINY) - MatrixScaleAdd(scaleCD[2*opindex], allops1[opindex]->operator_element(lQ, rQ), this->operator_element(lQ, rQ)); + MatrixScaleAdd(scaleCD[2*opindex], allops1[opindex]->operator_element(lQ, rQ), this->operator_element(lQ, rQ)); if (!b.has(DES) && l!=k && fabs(scaleCD[2*opindex+1]) > TINY) { - if 
(allowed(lQ, rQ) && allops1[opindex]->allowed(rQ, lQ)) { - double scaling = getStandAlonescaling(-(allops1[opindex]->get_deltaQuantum(0)), b.get_braStateInfo().quanta[lQ], b.get_ketStateInfo().quanta[rQ]); - int nrows = operator_element(lQ, rQ).Nrows(); - int ncols = operator_element(lQ, rQ).Ncols(); - for (int row=0; rowoperator_element(rQ, lQ)(1, row+1)), nrows, &(this->operator_element(lQ, rQ)(row+1, 1)), 1); - } + if (allowed(lQ, rQ) && allops1[opindex]->allowed(rQ, lQ)) { + double scaling = getStandAlonescaling(-(allops1[opindex]->get_deltaQuantum(0)), b.get_braStateInfo().quanta[lQ], b.get_ketStateInfo().quanta[rQ]); + int nrows = operator_element(lQ, rQ).Nrows(); + int ncols = operator_element(lQ, rQ).Ncols(); + for (int row=0; rowoperator_element(rQ, lQ)(1, row+1)), nrows, &(this->operator_element(lQ, rQ)(row+1, 1)), 1); + } } } else if (opindex < numDC) { if (k!=l && allops1[opindex]->allowed(lQ, rQ) && allowed(lQ, rQ) && fabs(scaleCD[2*opindex]) > TINY) MatrixScaleAdd(scaleCD[2*opindex], allops1[opindex]->operator_element(lQ, rQ), this->operator_element(lQ, rQ)); - } else { // CC + } else if (opindex < numCC) { // CC if (allops1[opindex]->allowed(lQ, rQ) && allowed(lQ, rQ) && fabs(scaleCC[2*(opindex-numDC)]) > TINY) MatrixScaleAdd(scaleCC[2*(opindex-numDC)], allops1[opindex]->operator_element(lQ, rQ), this->operator_element(lQ, rQ)); - if (!b.has(DES) && fabs(scaleCC[2*(opindex-numDC)+1]) > TINY) { - if (allowed(lQ, rQ) && allops1[opindex]->allowed(rQ, lQ)) { - double scaling = getStandAlonescaling(-(allops1[opindex]->get_deltaQuantum(0)), b.get_braStateInfo().quanta[lQ], b.get_ketStateInfo().quanta[rQ]); + if (!b.has(DES)) { + if (fabs(scaleCC[2*(opindex-numDC)+1]) > TINY && allowed(lQ, rQ) && allops1[opindex]->allowed(rQ, lQ)) { + double scaling = getStandAlonescaling(-(allops1[opindex]->get_deltaQuantum(0)), b.get_braStateInfo().quanta[lQ], b.get_ketStateInfo().quanta[rQ]); int nrows = operator_element(lQ, rQ).Nrows(); int ncols = operator_element(lQ, rQ).Ncols(); for (int row=0; rowoperator_element(rQ, lQ)(1, row+1)), nrows, &(this->operator_element(lQ, rQ)(row+1, 1)), 1); } } + } else { + if (fabs(scaleCC[2*(opindex-numDC)]) > TINY && allowed(lQ, rQ) && allops1[opindex]->allowed(lQ, rQ)) { + MatrixScaleAdd(scaleCC[2*(opindex-numDC)], allops1[opindex]->operator_element(lQ, rQ), this->operator_element(lQ, rQ)); + } } } @@ -1238,7 +1259,7 @@ void SpinAdapted::StackCreDesComp::build(StackMatrix& m, int row, int col, const TensorOp DD2_commute(k, l, -1, -1, spin, sym.getirrep(), k==l); double scaleV2 = calcCompfactor(CD1, DD2_commute, CD, v_cccd[b.get_integralIndex()]); if (fabs(scaleV2)+fabs(scaleV) > dmrginp.twoindex_screen_tol()) { - if (leftBlock->has(DES)) { + if (leftBlock->has(DES) && rightBlock->has(DES)) { boost::shared_ptr op1 = rightBlock->get_op_rep(DES, -getSpinQuantum(l), l); boost::shared_ptr op2 = leftBlock->get_op_rep(DES, -getSpinQuantum(k), k); double parity = getCommuteParity(op1->get_deltaQuantum()[0], op2->get_deltaQuantum()[0], get_deltaQuantum()[2]); @@ -1523,7 +1544,7 @@ void SpinAdapted::StackCreDesComp::build(const StackSpinBlock& b) TensorOp DD2_commute(k, l, -1, -1, spin, sym.getirrep(), k==l); double scaleV2 = calcCompfactor(CD1, DD2_commute, CD, v_cccd[b.get_integralIndex()]); if (fabs(scaleV2)+fabs(scaleV) > dmrginp.twoindex_screen_tol()) { - if (leftBlock->has(DES)) { + if (leftBlock->has(DES) && rightBlock->has(DES)) { boost::shared_ptr op1 = rightBlock->get_op_rep(DES, -getSpinQuantum(l), l); boost::shared_ptr op2 = leftBlock->get_op_rep(DES, 
-getSpinQuantum(k), k); double parity = getCommuteParity(op1->get_deltaQuantum()[0], op2->get_deltaQuantum()[0], get_deltaQuantum()[2]); @@ -1861,7 +1882,6 @@ void SpinAdapted::StackDesCreComp::buildfromDesCre(StackSpinBlock& b) const int j = get_orbs()[1]; memset(data, 0, totalMemory * sizeof(double)); - //StackDesCreComp* op_array; //initiateMultiThread(this, op_array, numthrds); @@ -1870,10 +1890,10 @@ void SpinAdapted::StackDesCreComp::buildfromDesCre(StackSpinBlock& b) //TensorOp CD1 = C.product(D, (-deltaQuantum[0].get_s()).getirrep(), (-sym).getirrep()); // the operator to be complimentaried TensorOp CD1(j, i, 1, -1, (-deltaQuantum[0].get_s()).getirrep(), (-sym).getirrep()); - std::vector > allops1, allops2; + std::vector > allops1, allops2, allops3, allops4; for (int ii=0; iiget_deltaQuantum(0) == -deltaQuantum[0]) + if (b.get_op_array(CRE_DES).get_local_element(ii)[ji]->get_deltaQuantum(0) == deltaQuantum[0]) allops1.push_back(b.get_op_array(CRE_DES).get_local_element(ii)[ji]); for (int ii=0; iiget_deltaQuantum(0) == deltaQuantum[0]) allops2.push_back(b.get_op_array(DES_CRE).get_local_element(ii)[ji]); + if (dmrginp.hamiltonian() == BCS) { + for (int ii = 0; ii < b.get_op_array(CRE_CRE).get_size(); ++ii) + for (int ji = 0; ji < b.get_op_array(CRE_CRE).get_local_element(ii).size(); ++ji) + if (b.get_op_array(CRE_CRE).get_local_element(ii)[ji]->get_deltaQuantum(0) == deltaQuantum[1]) + allops3.push_back(b.get_op_array(CRE_CRE).get_local_element(ii)[ji]); + + for (int ii = 0; ii < b.get_op_array(DES_DES).get_size(); ++ii) + for (int ji = 0; ji < b.get_op_array(DES_DES).get_local_element(ii).size(); ++ji) + if (b.get_op_array(DES_DES).get_local_element(ii)[ji]->get_deltaQuantum(0) == deltaQuantum[2]) + allops4.push_back(b.get_op_array(DES_DES).get_local_element(ii)[ji]); + } + //#pragma omp parallel for schedule(dynamic) for (int ii = 0; iiget_orbs()[0]; @@ -1895,7 +1927,7 @@ void SpinAdapted::StackDesCreComp::buildfromDesCre(StackSpinBlock& b) } } - + //#pragma omp parallel for schedule(dynamic) for (int ii = 0; iiget_orbs()[0]; @@ -1907,12 +1939,50 @@ void SpinAdapted::StackDesCreComp::buildfromDesCre(StackSpinBlock& b) TensorOp CD2(k, l, 1, -1, spin, sym.getirrep()); if (!CD2.empty) { double scaleV = calcCompfactor(CD1, CD2, CD,*(b.get_twoInt()), b.get_integralIndex()); - //scaleV *= getCommuteParity(getSpinQuantum(i), getSpinQuantum(j), get_deltaQuantum()[0]); - scaleV *= getCommuteParity(getSpinQuantum(l), -getSpinQuantum(k), get_deltaQuantum()[0]); + if (dmrginp.spinAdapted()) { + scaleV *= getCommuteParity(getSpinQuantum(l), -getSpinQuantum(k), get_deltaQuantum()[0]); + } ScaleAdd(scaleV, *allops2[ii], *this); } } + if (dmrginp.hamiltonian() == BCS) { + for (int ii = 0; ii < allops3.size(); ++ii) { + const int k = allops3[ii]->get_orbs()[0]; + const int l = allops3[ii]->get_orbs()[1]; + + TensorOp CC2(k, l, 1, 1, spin, sym.getirrep(), k == l); + if (!CC2.empty) { + double scaleV = calcCompfactor(CD1, CC2, CD, v_cccd[b.get_integralIndex()]); + if (k != l) { + TensorOp CC2_commute(l, k, 1, 1, spin, sym.getirrep(), k==l); + double scaleV2 = calcCompfactor(CD1, CC2_commute, CD, v_cccd[b.get_integralIndex()]); + double parity = getCommuteParity(getSpinQuantum(l), getSpinQuantum(k), get_deltaQuantum()[1]); + scaleV += parity * scaleV2; + } + + ScaleAdd(scaleV, *allops3[ii], *this); + } + } + + for (int ii = 0; ii < allops4.size(); ++ii) { + const int k = allops4[ii]->get_orbs()[0]; + const int l = allops4[ii]->get_orbs()[1]; + + TensorOp DD2(k, l, -1, -1, spin, sym.getirrep(), k == 
l); + if (!DD2.empty) { + double scaleV = calcCompfactor(CD1, DD2, CD, v_cccd[b.get_integralIndex()]); + if (k != l) { + TensorOp DD2_commute(l, k, -1, -1, spin, sym.getirrep(), k == l); + double scaleV2 = calcCompfactor(CD1, DD2_commute, CD, v_cccd[b.get_integralIndex()]); + double parity = getCommuteParity(-getSpinQuantum(l),-getSpinQuantum(k),get_deltaQuantum()[2]); + scaleV += parity*scaleV2; + } + ScaleAdd(scaleV, *allops4[ii], *this); + } + } + } + //accumulateMultiThread(this, op_array, numthrds); } @@ -1965,7 +2035,7 @@ SpinAdapted::operatorfunctions::TensorProduct(leftBlock, *op, *Overlap, &b, &(b. SpinQuantum hq(0, SpinSpace(0), IrrepSpace(0)); const boost::shared_ptr Overlap = leftBlock->get_op_rep(OVERLAP, hq); SpinAdapted::operatorfunctions::TensorProduct(leftBlock, *Overlap, *op, &b, &(b.get_stateInfo()), *this, 1.0); - } + } // build DCcomp explicitely for (int kx = 0; kx < leftBlock->get_sites().size(); ++kx) for (int lx = 0; lx < rightBlock->get_sites().size(); ++lx) { @@ -2095,7 +2165,7 @@ double SpinAdapted::StackDesCreComp::redMatrixElement(Csf c1, vector& ladde } else { //TensorOp CK(k, 1), DL(l, -1); //TensorOp CD2 = CK.product(DL, spin, sym.getirrep()); - TensorOp CD2(k, l, 1, -1, spin, sym.getirrep(), k==l); + TensorOp CD2(k, l, 1, -1, spin, sym.getirrep()); if (!CD2.empty) { double MatElements = calcMatrixElements(c1, CD2, ladder[i], backupSlater1, backupSlater2, index) ; double factor = calcCompfactor(CD1, CD2, CD, *(b->get_twoInt()), b->get_integralIndex()); @@ -2133,16 +2203,15 @@ void SpinAdapted::StackDesDesComp::buildfromDesDes(StackSpinBlock& b) TensorOp CC1(i, j, 1, 1, (-deltaQuantum[0].get_s()).getirrep(), (-sym).getirrep(), i==j); std::vector > allops; - int numCC = 0; std::vector scaleDD, scaleCD, scaleCC; // TODO resize scaleCC and scaleCD if (dmrginp.hamiltonian() == BCS) { - scaleCC.resize(b.get_op_array(CRE_CRE).get_size(), 0.); - scaleCD.resize(b.get_op_array(CRE_DES).get_size()*2, 0.); + scaleCC.resize(b.get_op_array(CRE_CRE).get_size()*2, 0.); + scaleCD.resize(b.get_op_array(CRE_DES).get_size()*4, 0.); } if (dmrginp.spinAdapted()) scaleDD.resize(b.get_op_array(CRE_CRE).get_size()*2, 0.); else scaleDD.resize(b.get_op_array(CRE_CRE).get_size(), 0.); - + int numDD = 0, numCC = 0; if(!b.has(DES)) { for (int ii=0; iiget_deltaQuantum(0) == -deltaQuantum[0]) { const int k = b.get_op_array(CRE_CRE).get_local_element(ii)[ji]->get_orbs()[0]; - const int l = b.get_op_array(CRE_CRE).get_local_element(ii)[ji]->get_orbs()[1]; - + const int l = b.get_op_array(CRE_CRE).get_local_element(ii)[ji]->get_orbs()[1]; + //TensorOp DK(k,-1), DL(l,-1); //TensorOp DD2 = DK.product(DL, spin, sym.getirrep(), k==l); TensorOp DD2(k, l, -1, -1, spin, sym.getirrep(), k==l); @@ -2180,7 +2249,7 @@ void SpinAdapted::StackDesDesComp::buildfromDesDes(StackSpinBlock& b) //CK = TensorOp(k,1); CL = TensorOp(l,1); //CC2 = CL.product(CK, spin, sym.getirrep(), k == l); CC2 = TensorOp(l, k, 1, 1, spin, sym.getirrep(), k==l); - double parity = getCommuteParity(getSpinQuantum(k), getSpinQuantum(l), get_deltaQuantum()[0]); + double parity = getCommuteParity(getSpinQuantum(k), getSpinQuantum(l), get_deltaQuantum()[2]); if (k != l) scaleCC[allops.size()] += parity * calcCompfactor(CC1, CC2, DD, v_cccc[b.get_integralIndex()]); } @@ -2189,10 +2258,6 @@ void SpinAdapted::StackDesDesComp::buildfromDesDes(StackSpinBlock& b) if (nonzero) allops.push_back(b.get_op_array(CRE_CRE).get_local_element(ii)[ji]); } } else { - if (dmrginp.hamiltonian() == BCS) { - pout << "buildfromDesDes with DES in BCS not 
implemented" << endl; - abort(); - } for (int ii=0; iiget_deltaQuantum(0) == deltaQuantum[0]) { @@ -2212,6 +2277,25 @@ void SpinAdapted::StackDesDesComp::buildfromDesDes(StackSpinBlock& b) scaleDD[allops.size()-1] += parity * calcCompfactor(CC1, DD2, DD, *(b.get_twoInt()), b.get_integralIndex()); } } + numDD = allops.size(); + + if (dmrginp.hamiltonian() == BCS) + for (int ii=0; iiget_deltaQuantum(0).get_s() == deltaQuantum[0].get_s()) { // then we need CC + allops.push_back(b.get_op_array(CRE_CRE).get_local_element(ii)[ji]); + const int k = b.get_op_array(CRE_CRE).get_local_element(ii)[ji]->get_orbs()[0]; + const int l = b.get_op_array(CRE_CRE).get_local_element(ii)[ji]->get_orbs()[1]; + TensorOp CC2(k, l, 1, 1, spin, sym.getirrep(), k==l); + if (!CC2.empty) { + scaleCC[allops.size()-1] = calcCompfactor(CC1, CC2, DD, v_cccc[b.get_integralIndex()]); + CC2 = TensorOp(l, k, 1, 1, spin, sym.getirrep(), k==l); + double parity = getCommuteParity(getSpinQuantum(k), getSpinQuantum(l), get_deltaQuantum()[1]); + if (k != l) + scaleCC[allops.size()-1] += parity * calcCompfactor(CC1, CC2, DD, v_cccc[b.get_integralIndex()]); + } + } + } } numCC = allops.size(); @@ -2224,18 +2308,31 @@ void SpinAdapted::StackDesDesComp::buildfromDesDes(StackSpinBlock& b) const int k = allops.back()->get_orbs()[0]; const int l = allops.back()->get_orbs()[1]; - //TensorOp CK(k,1), DL(l,-1); - //TensorOp CD2 = CK.product(DL, spin, sym.getirrep()); TensorOp CD2(k, l, 1, -1, spin, sym.getirrep()); if (!CD2.empty) - scaleCD[2*(allops.size()-1)] = calcCompfactor(CC1, CD2, DD, v_cccd[b.get_integralIndex()]); + scaleCD[2*(allops.size()-numCC-1)] = calcCompfactor(CC1, CD2, DD, v_cccd[b.get_integralIndex()]); + + if (!b.has(DES)) { + CD2 = TensorOp(l, k, 1, -1, spin, sym.getirrep()); + if (k!=l && !CD2.empty) + scaleCD[2*(allops.size()-numCC-1)+1] = calcCompfactor(CC1, CD2, DD, v_cccd[b.get_integralIndex()]); + } - //TensorOp CL(l,1), DK(k,-1); - //CD2 = CL.product(DK, spin, sym.getirrep()); - CD2 = TensorOp(l, k, 1, -1, spin, sym.getirrep()); - if (k!=l && !CD2.empty) - scaleCD[2*(allops.size()-1)+1] = calcCompfactor(CC1, CD2, DD, v_cccd[b.get_integralIndex()]); } + if (b.has(DES)) { + for (int ii = 0; ii < b.get_op_array(DES_CRE).get_size(); ++ii) + for (int ji = 0; ji < b.get_op_array(DES_CRE).get_local_element(ii).size(); ++ji) + if (b.get_op_array(DES_CRE).get_local_element(ii)[ji]->get_deltaQuantum(0).get_s() == deltaQuantum[0].get_s() || + b.get_op_array(CRE_DES).get_local_element(ii)[ji]->get_deltaQuantum(0).get_s() == -deltaQuantum[0].get_s()) { + allops.push_back(b.get_op_array(DES_CRE).get_local_element(ii)[ji]); + const int l = allops.back()->get_orbs()[0]; + const int k = allops.back()->get_orbs()[1]; + + TensorOp CD2(k, l, 1, -1, spin, sym.getirrep()); + if (k!=l && !CD2.empty) // can't be spin adapted + scaleCD[2*(allops.size()-numCC-1)] = calcCompfactor(CC1, CD2, DD, v_cccd[b.get_integralIndex()]); + } + } } const int quantaSz = b.get_braStateInfo().quanta.size () * b.get_ketStateInfo().quanta.size(); @@ -2265,28 +2362,34 @@ void SpinAdapted::StackDesDesComp::buildfromDesDes(StackSpinBlock& b) for (int row=0; row < nrows; ++row) DAXPY(ncols, scaling*scaleDD[opindex], &(allops[opindex]->operator_element(rQ,lQ)(1, row+1)), nrows, &(this->operator_element(lQ, rQ)(row+1, 1)), 1); } - } else { // operator DD - if (allowed(lQ, rQ) && allops[opindex]->allowed(lQ,rQ) && fabs(scaleDD[opindex])> TINY) { - MatrixScaleAdd(scaleDD[opindex], allops[opindex]->operator_element(lQ,rQ), this->operator_element(lQ, rQ)); + if 
(dmrginp.hamiltonian() == BCS && allowed(lQ, rQ) + && allops[opindex]->allowed(lQ,rQ) && fabs(scaleCC[opindex]) > TINY) { + MatrixScaleAdd(scaleCC[opindex], allops[opindex]->operator_element(lQ,rQ), this->operator_element(lQ, rQ)); + } + } else { + // operator DD + if (opindex < numDD) { + if (allowed(lQ, rQ) && allops[opindex]->allowed(lQ,rQ) && fabs(scaleDD[opindex])> TINY) { + MatrixScaleAdd(scaleDD[opindex], allops[opindex]->operator_element(lQ,rQ), this->operator_element(lQ, rQ)); + } + } else { + if (allowed(lQ, rQ) && allops[opindex]->allowed(lQ,rQ) && fabs(scaleCC[opindex]) > TINY) { + MatrixScaleAdd(scaleCC[opindex], allops[opindex]->operator_element(lQ,rQ), this->operator_element(lQ, rQ)); + } } - } - - if (dmrginp.hamiltonian() == BCS && allowed(lQ, rQ) - && allops[opindex]->allowed(lQ,rQ) && fabs(scaleCC[opindex]) > TINY) { - MatrixScaleAdd(scaleCC[opindex], allops[opindex]->operator_element(lQ,rQ), this->operator_element(lQ, rQ)); } } else { // CkDl and ClDk const int k = allops[opindex]->get_orbs()[0]; const int l = allops[opindex]->get_orbs()[1]; - if (allowed(lQ,rQ) && allops[opindex]->allowed(lQ,rQ) && fabs(scaleCD[2*opindex]) > TINY) - MatrixScaleAdd(scaleCD[2*opindex], allops[opindex]->operator_element(lQ,rQ), this->operator_element(lQ, rQ)); + if (allowed(lQ,rQ) && allops[opindex]->allowed(lQ,rQ) && fabs(scaleCD[2*(opindex-numCC)]) > TINY) + MatrixScaleAdd(scaleCD[2*(opindex-numCC)], allops[opindex]->operator_element(lQ,rQ), this->operator_element(lQ, rQ)); - if (!b.has(DES) && k != l && allowed(lQ,rQ) && allops[opindex]->allowed(rQ,lQ) && fabs(scaleCD[2*opindex+1]) > TINY) { + if (!b.has(DES) && k != l && allowed(lQ,rQ) && allops[opindex]->allowed(rQ,lQ) && fabs(scaleCD[2*(opindex-numCC)+1]) > TINY) { double scaling = getStandAlonescaling(-(allops[opindex]->get_deltaQuantum(0)), b.get_braStateInfo().quanta[lQ], b.get_ketStateInfo().quanta[rQ]); int nrows = operator_element(lQ, rQ).Nrows(); int ncols = operator_element(lQ, rQ).Ncols(); for (int row=0; row < nrows; ++row) - DAXPY(ncols, scaling*scaleCD[2*opindex+1], &(allops[opindex]->operator_element(rQ,lQ)(1, row+1)), nrows, &(this->operator_element(lQ, rQ)(row+1, 1)), 1); + DAXPY(ncols, scaling*scaleCD[2*(opindex-numCC)+1], &(allops[opindex]->operator_element(rQ,lQ)(1, row+1)), nrows, &(this->operator_element(lQ, rQ)(row+1, 1)), 1); } } } @@ -2397,7 +2500,7 @@ void SpinAdapted::StackDesDesComp::build(StackMatrix& m, int row, int col, const double scaleV2 = calcCompfactor(CC1, DD2, DD, *(b.get_twoInt()), b.get_integralIndex()); if ((fabs(scaleV2)+fabs(scaleV)) > dmrginp.twoindex_screen_tol()) { - if (leftBlock->has(DES)) { + if (leftBlock->has(DES) && rightBlock->has(DES)) { boost::shared_ptr op1 = leftBlock->get_op_rep(DES, -getSpinQuantum(k), k); boost::shared_ptr op2 = rightBlock->get_op_rep(DES, -getSpinQuantum(l), l); @@ -2535,7 +2638,7 @@ void SpinAdapted::StackDesDesComp::build(StackMatrix& m, int row, int col, const double scaleV2 = calcCompfactor(CC1, DD2, DD, *(b.get_twoInt()), b.get_integralIndex()); if ((fabs(scaleV2)+fabs(scaleV)) > dmrginp.twoindex_screen_tol()) { - if (leftBlock->has(DES)) { + if (leftBlock->has(DES) && rightBlock->has(DES)) { boost::shared_ptr op1 = leftBlock->get_op_rep(DES, -getSpinQuantum(k), k); boost::shared_ptr op2 = rightBlock->get_op_rep(DES, -getSpinQuantum(l), l); @@ -2609,7 +2712,7 @@ void SpinAdapted::StackDesDesComp::build(StackMatrix& m, int row, int col, const ccomp -> deallocate(); } - if (leftBlock->has(DES)) { + if (rightBlock->has(DES)) { boost::shared_ptr op2 = 
rightBlock->get_op_rep(DES, -getSpinQuantum(l), l); SpinAdapted::operatorfunctions::TensorProductElement(leftBlock, *dcomp, *op2, &b, &(b.get_stateInfo()), *this, m, row, col, 1.); } else { @@ -2818,7 +2921,7 @@ void SpinAdapted::StackDesDesComp::build(const StackSpinBlock& b) double scaleV2 = calcCompfactor(CC1, DD2, DD, *(b.get_twoInt()), b.get_integralIndex()); if ((fabs(scaleV2)+fabs(scaleV)) > dmrginp.twoindex_screen_tol()) { - if (leftBlock->has(DES)) { + if (leftBlock->has(DES) && rightBlock->has(DES)) { boost::shared_ptr op1 = leftBlock->get_op_rep(DES, -getSpinQuantum(k), k); boost::shared_ptr op2 = rightBlock->get_op_rep(DES, -getSpinQuantum(l), l); @@ -2938,7 +3041,7 @@ void SpinAdapted::StackDesDesComp::build(const StackSpinBlock& b) double scaleV2 = calcCompfactor(CC1, DD2, DD, *(b.get_twoInt()), b.get_integralIndex()); if ((fabs(scaleV2)+fabs(scaleV)) > dmrginp.twoindex_screen_tol()) { - if (leftBlock->has(DES)) { + if (leftBlock->has(DES) && rightBlock->has(DES)) { boost::shared_ptr op1 = leftBlock->get_op_rep(DES, -getSpinQuantum(k), k); boost::shared_ptr op2 = rightBlock->get_op_rep(DES, -getSpinQuantum(l), l); @@ -3012,7 +3115,7 @@ void SpinAdapted::StackDesDesComp::build(const StackSpinBlock& b) ccomp -> deallocate(); } - if (leftBlock->has(DES)) { + if (rightBlock->has(DES)) { boost::shared_ptr op2 = rightBlock->get_op_rep(DES, -getSpinQuantum(l), l); SpinAdapted::operatorfunctions::TensorProduct(leftBlock, *dcomp, *op2, &b, &(b.get_stateInfo()), *this, 1.); } else { @@ -3212,6 +3315,7 @@ double SpinAdapted::StackDesDesComp::redMatrixElement(Csf c1, vector& ladde void SpinAdapted::StackCreCreComp::buildfromCreCre(StackSpinBlock& b) { + int spin = deltaQuantum[0].get_s().getirrep(); IrrepSpace sym = deltaQuantum[0].get_symm(); @@ -3226,19 +3330,36 @@ void SpinAdapted::StackCreCreComp::buildfromCreCre(StackSpinBlock& b) //TensorOp DD1 = D.product(D2, (-deltaQuantum[0].get_s()).getirrep(), (-sym).getirrep(), i==j); TensorOp DD1(i, j, -1, -1, (-deltaQuantum[0].get_s()).getirrep(), (-sym).getirrep(), i==j); - std::vector > allops; + std::vector > allops1, allops2, allops3, allops4; for (int ii=0; iiget_deltaQuantum(0) == deltaQuantum[0]) - allops.push_back(b.get_op_array(CRE_CRE).get_local_element(ii)[ji]); + allops1.push_back(b.get_op_array(CRE_CRE).get_local_element(ii)[ji]); + + if (dmrginp.hamiltonian() == BCS) { + for (int ii=0; iiget_deltaQuantum(0) == deltaQuantum[2]) + allops2.push_back(b.get_op_array(DES_DES).get_local_element(ii)[ji]); + + for (int ii=0; iiget_deltaQuantum(0) == deltaQuantum[1]) + allops3.push_back(b.get_op_array(CRE_DES).get_local_element(ii)[ji]); + + for (int ii=0; iiget_deltaQuantum(0) == deltaQuantum[1]) + allops4.push_back(b.get_op_array(DES_CRE).get_local_element(ii)[ji]); + } //#pragma omp parallel for schedule(dynamic) - for (int ii = 0; iiget_orbs()[0]; - const int l = allops[ii]->get_orbs()[1]; - - //TensorOp CK(k, 1), CL(l, 1); + for (int ii = 0; iiget_orbs()[0]; + const int l = allops1[ii]->get_orbs()[1]; + + //TensorOp CK(k, 1), CL(l, 1); //TensorOp CC2 = CK.product(CL, spin, sym.getirrep(), k==l); TensorOp CC2(k, l, 1, 1, spin, sym.getirrep(), k==l); if (!CC2.empty) { @@ -3253,11 +3374,55 @@ void SpinAdapted::StackCreCreComp::buildfromCreCre(StackSpinBlock& b) if (k != l) scaleV += parity*scaleV2; - ScaleAdd(scaleV, *allops[ii], *this); - + ScaleAdd(scaleV, *allops1[ii], *this); + } } + if (dmrginp.hamiltonian() == BCS) { + for (int ii = 0; ii < allops2.size(); ++ii) { + const int k = allops2[ii]->get_orbs()[0]; + const int l = 
allops2[ii]->get_orbs()[1]; + + TensorOp DD2(k, l, -1, -1, spin, sym.getirrep(), k == l); + if (!DD2.empty) { + double scaleV = calcCompfactor(DD2, DD1, DD, v_cccc[b.get_integralIndex()]); + TensorOp DD2_commute(l, k, -1, -1, spin, sym.getirrep(), k == l); + double scaleV2 = calcCompfactor(DD2_commute, DD1, DD, v_cccc[b.get_integralIndex()]); + double parity = getCommuteParity(-getSpinQuantum(k), -getSpinQuantum(l), get_deltaQuantum()[2]); + + ScaleAdd(scaleV+parity*scaleV2, *allops2[ii], *this); + } + } + + for (int ii = 0; ii < allops3.size(); ++ii) { + const int k = allops3[ii]->get_orbs()[0]; + const int l = allops3[ii]->get_orbs()[1]; + + TensorOp CD2(k, l, 1, -1, spin, sym.getirrep()); + if (!CD2.empty) { + double scaleV = calcCompfactor(DD1, CD2, DD, v_cccd[b.get_integralIndex()]); + ScaleAdd(scaleV, *allops3[ii], *this); + } + } + + for (int ii = 0; ii < allops4.size(); ++ii) { + const int l = allops4[ii]->get_orbs()[0]; + const int k = allops4[ii]->get_orbs()[1]; + + if (k == l) continue; + + TensorOp CD2(k, l, 1, -1, spin, sym.getirrep()); + if (!CD2.empty) { + double scaleV = calcCompfactor(DD1, CD2, DD, v_cccd[b.get_integralIndex()]); + // FIXME does this need to be used for nonspinadapted as well? + if (dmrginp.spinAdapted()) { + scaleV *= getCommuteParity(getSpinQuantum(l), getSpinQuantum(k), get_deltaQuantum()[1]); + } + ScaleAdd(scaleV, *allops4[ii], *this); + } + } + } //accumulateMultiThread(this, op_array, numthrds); } @@ -3346,12 +3511,12 @@ void SpinAdapted::StackCreCreComp::build(const StackSpinBlock& b) //TensorOp DD2 = DK.product(DL, spin, sym.getirrep(), k==l); TensorOp DD2(k, l, -1, -1, spin, sym.getirrep(), k==l); if (!DD2.empty) { - double scaleV = calcCompfactor(DD1, DD2, DD, v_cccc[b.get_integralIndex()]); + double scaleV = calcCompfactor(DD2, DD1, DD, v_cccc[b.get_integralIndex()]); //DK = TensorOp(k,-1); //DL = TensorOp(l,-1); //TensorOp DD2_commute = DL.product(DK, spin, sym.getirrep(), k==l); TensorOp DD2_commute(l, k, -1, -1, spin, sym.getirrep(), k==l); - double scaleV2 = calcCompfactor(DD1, DD2_commute, DD, v_cccc[b.get_integralIndex()]); + double scaleV2 = calcCompfactor(DD2_commute, DD1, DD, v_cccc[b.get_integralIndex()]); if (leftBlock->get_op_array(CRE).has(k) && rightBlock->get_op_array(CRE).has(l) && (fabs(scaleV2)+fabs(scaleV)) > dmrginp.twoindex_screen_tol()) { boost::shared_ptr op1 = leftBlock->get_op_rep(DES, -getSpinQuantum(k), k); boost::shared_ptr op2 = rightBlock->get_op_rep(DES, -getSpinQuantum(l), l); @@ -3521,7 +3686,7 @@ void SpinAdapted::StackCreDesDesComp::build(const StackSpinBlock& b) { FUNCTOR f = boost::bind(&stackopxop::dxcdcomp, otherBlock, _1, &b, k, this, 1.0); for_all_singlethread(loopBlock->get_op_array(DES), f); - + f = boost::bind(&stackopxop::cxddcomp, otherBlock, _1, &b, k, this, 2.0); for_all_singlethread(loopBlock->get_op_array(CRE), f); @@ -3736,6 +3901,7 @@ void SpinAdapted::StackCreCreDesComp::build(StackMatrix& m, int row, int col, co FUNCTOR f = boost::bind(&stackopxop::cxcdcompElement, otherBlock, _1, &b, k, this, m, row, col, 1.0); for_all_singlethread(loopBlock->get_op_array(CRE), f); + f = boost::bind(&stackopxop::dxcccompElement, otherBlock, _1, &b, k, this, m, row, col, 2.0); // factor of 2.0 because CCcomp_{ij} = -CCcomp_{ji} for_all_singlethread(loopBlock->get_op_array(CRE), f); @@ -3785,7 +3951,6 @@ void SpinAdapted::StackCreCreDesComp::build(const StackSpinBlock& b) SpinAdapted::operatorfunctions::TensorProduct(leftBlock, *op, *Overlap, &b, &(b.get_stateInfo()), *this, 1.0); } } - if 
(rightBlock->get_sites().size() == 0) { //this is a special case where the right block is just a dummy block to make the effective wavefunction have spin 0 return; @@ -4303,7 +4468,7 @@ void SpinAdapted::StackHam::build(const StackSpinBlock& b) const boost::shared_ptr<StackSparseMatrix> Overlap = leftBlock->get_op_rep(OVERLAP, hq); SpinAdapted::operatorfunctions::TensorProduct(leftBlock, *Overlap, *op, &b, &(b.get_stateInfo()), *this, 1.0); - // CCD_A*D_B + CCD_B*D_A + c.c. + // CCD_A*D_B + CCD_B*D_A + c.c. FUNCTOR f = boost::bind(&stackopxop::cxcddcomp, leftBlock, _1, &b, op_array); for_all_singlethread(rightBlock->get_op_array(CRE), f); diff --git a/Stack_op_components.C b/Stack_op_components.C index 093070c..cd2a068 100644 --- a/Stack_op_components.C +++ b/Stack_op_components.C @@ -528,11 +528,18 @@ namespace SpinAdapted { vec.resize(spinvec.size()); vector<int> orbs(2); orbs[0] = i; orbs[1] = j; for (int j=0; j<spinvec.size(); j++) { - vec[j]=boost::shared_ptr<StackCreDesComp>(new StackCreDesComp); - vec[j]->set_orbs() = orbs; - vec[j]->set_initialised() = true; - vec[j]->set_fermion() = false; - vec[j]->set_deltaQuantum(1, spinvec[j]); + vec[j]=boost::shared_ptr<StackCreDesComp>(new StackCreDesComp); + vec[j]->set_orbs() = orbs; + vec[j]->set_initialised() = true; + vec[j]->set_fermion() = false; + if (dmrginp.hamiltonian() == BCS) { + vec[j]->resize_deltaQuantum(3); + vec[j]->set_deltaQuantum(0) = spinvec[j]; + vec[j]->set_deltaQuantum(1) = SpinQuantum(2, spinvec[j].get_s(), spinvec[j].get_symm()); + vec[j]->set_deltaQuantum(2) = SpinQuantum(-2, spinvec[j].get_s(), spinvec[j].get_symm()); + } else { + vec[j]->set_deltaQuantum(1, spinvec[j]); + } } } @@ -628,11 +635,18 @@ namespace SpinAdapted { vec.resize(spinvec.size()); vector<int> orbs(2); orbs[0] = i; orbs[1] = j; for (int j=0; j<spinvec.size(); j++) { - vec[j]=boost::shared_ptr<StackDesCreComp>(new StackDesCreComp); - vec[j]->set_orbs() = orbs; - vec[j]->set_initialised() = true; - vec[j]->set_fermion() = false; - vec[j]->set_deltaQuantum(1, spinvec[j]); + vec[j]=boost::shared_ptr<StackDesCreComp>(new StackDesCreComp); + vec[j]->set_orbs() = orbs; + vec[j]->set_initialised() = true; + vec[j]->set_fermion() = false; + if (dmrginp.hamiltonian() == BCS) { + vec[j]->resize_deltaQuantum(3); + vec[j]->set_deltaQuantum(0) = spinvec[j]; + vec[j]->set_deltaQuantum(1) = SpinQuantum(2, spinvec[j].get_s(), spinvec[j].get_symm()); + vec[j]->set_deltaQuantum(2) = SpinQuantum(-2, spinvec[j].get_s(), spinvec[j].get_symm()); + } else { + vec[j]->set_deltaQuantum(1, spinvec[j]); + } } } @@ -729,11 +743,19 @@ namespace SpinAdapted { vec.resize(spinvec.size()); vector<int> orbs(2); orbs[0] = i; orbs[1] = j; for (int j=0; j<spinvec.size(); j++) { - vec[j]=boost::shared_ptr<StackDesDesComp>(new StackDesDesComp); - vec[j]->set_orbs() = orbs; - vec[j]->set_initialised() = true; - vec[j]->set_fermion() = false; - vec[j]->set_deltaQuantum(1, -spinvec[j]); + vec[j]=boost::shared_ptr<StackDesDesComp>(new StackDesDesComp); + vec[j]->set_orbs() = orbs; + vec[j]->set_initialised() = true; + vec[j]->set_fermion() = false; + + if (dmrginp.hamiltonian() == BCS) { + vec[j]->resize_deltaQuantum(3); + vec[j]->set_deltaQuantum(0) = -spinvec[j]; + vec[j]->set_deltaQuantum(1) = -SpinQuantum(0, spinvec[j].get_s(), spinvec[j].get_symm()); + vec[j]->set_deltaQuantum(2) = -SpinQuantum(-2, spinvec[j].get_s(), spinvec[j].get_symm()); + } else { + vec[j]->set_deltaQuantum(1, -spinvec[j]); + } } }
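// Note on the BCS branches in these Stack_op_components.C hunks: with a
// Bogoliubov (pairing) Hamiltonian, a two-index complementary operator is
// no longer block-diagonal in particle number, because the v_cccd/v_cccc
// integrals couple a CreDes string to CreCre and DesDes strings, so the
// patch gives each compound three delta-quantum sectors. A minimal sketch
// of the bookkeeping for the CreDes case (setBCSDeltaQuanta is a
// hypothetical helper, not part of this patch):
//
//   void setBCSDeltaQuanta(StackSparseMatrix& op, const SpinQuantum& q) {
//     op.resize_deltaQuantum(3);
//     op.set_deltaQuantum(0) = q;                                        // particle-conserving sector
//     op.set_deltaQuantum(1) = SpinQuantum( 2, q.get_s(), q.get_symm()); // pair-creating sector
//     op.set_deltaQuantum(2) = SpinQuantum(-2, q.get_s(), q.get_symm()); // pair-annihilating sector
//   }
//
// The DesDes and CreCre compounds shift these sectors accordingly, as the
// surrounding hunks show.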
@@ -829,11 +851,18 @@ namespace SpinAdapted { vec.resize(spinvec.size()); vector<int> orbs(2); orbs[0] = i; orbs[1] = j; for (int j=0; j<spinvec.size(); j++) { - vec[j]=boost::shared_ptr<StackCreCreComp>(new StackCreCreComp); - vec[j]->set_orbs() = orbs; - vec[j]->set_initialised() = true; - vec[j]->set_fermion() = false; - vec[j]->set_deltaQuantum(1, spinvec[j]); + vec[j]=boost::shared_ptr<StackCreCreComp>(new StackCreCreComp); + vec[j]->set_orbs() = orbs; + vec[j]->set_initialised() = true; + vec[j]->set_fermion() = false; + if (dmrginp.hamiltonian() == BCS) { + vec[j]->resize_deltaQuantum(3); + vec[j]->set_deltaQuantum(0) = spinvec[j]; + vec[j]->set_deltaQuantum(1) = SpinQuantum(0, spinvec[j].get_s(), spinvec[j].get_symm()); + vec[j]->set_deltaQuantum(2) = SpinQuantum(-2, spinvec[j].get_s(), spinvec[j].get_symm()); + } else { + vec[j]->set_deltaQuantum(1, spinvec[j]); + } } } diff --git a/Stackspinblock.C b/Stackspinblock.C index 9b51833..7635b7b 100644 --- a/Stackspinblock.C +++ b/Stackspinblock.C @@ -77,6 +77,9 @@ void StackSpinBlock::printOperatorSummary() #ifndef SERIAL mpi::communicator world; + //pout << "Printing Operators of this Block" << endl; + //pout << *this << endl; + if (mpigetrank() != 0) { for (std::map<opTypes, boost::shared_ptr<StackOp_component_base> >::const_iterator it = ops.begin(); it != ops.end(); ++it) sendobject(it->second->get_size(), 0); @@ -90,7 +93,7 @@ } else { - p2out << "\t\t\t " << it->second->size()<<" : "<<it->second->get_op_string()<<" Virtual Operators "; + p2out << "\t\t\t " << it->second->size()<<" : "<<it->second->get_op_string()<<" Virtual Operators "; } vector<int> numops(calc.size(), 0); @@ -102,6 +105,14 @@ p2out << " " << numops[proc]<<" "; } p2out << endl; + //if (it->second->get_op_string() == "STACKHAM" || it->second->get_op_string() == "STACKCRECREDES_COMP" || it->second->get_op_string() == "STACKDESDES_COMP" || it->second->get_op_string() == "STACKCREDES_COMP" || it->second->get_op_string() == "STACKCRE" || it->second->get_op_string() == "STACKCRECRE" || it->second->get_op_string() == "STACKCREDES" || it->second->get_op_string() == "STACKCRECRE" || it->second->get_op_string() == "STACKOVERLAP") { + //if (it->second->get_op_string() == "STACKCREDES_COMP" || it->second->get_op_string() == "STACKDESCRE_COMP") { + // if (it->second->is_core()) { + // for (int i = 0; i < it->second->size(); ++i) { + // pout << *it->second->get_local_element(i)[0] << endl; + // } + // } + //} } } #else @@ -964,6 +975,85 @@ int procWithMinOps(std::vector >& allops) return minproc; }
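// prebuild(), added below, walks the loop block's CRE_DES and CRE_CRE arrays,
// finds the matching CRE_DESCOMP/DES_CRECOMP and DES_DESCOMP/CRE_CRECOMP
// operators on the other block that are not yet in memory, allocates them,
// and builds them once in an OpenMP loop, so repeated H|psi> applications
// inside Davidson do not rebuild them on every call. A caller-side sketch
// (`big` is a hypothetical superblock name; MAX_THRD as used in davidson.C):
//
//   std::vector<boost::shared_ptr<StackSparseMatrix>> comps = big.prebuild(MAX_THRD);
//   big.multiplyH(c, &v, MAX_THRD);   // reuses the prebuilt operators
//
// The caller owns the returned handles and must deallocate them in reverse
// order when done, to respect the stack allocator (see the multiply_h
// destructor in davidson.C further down).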
+std::vector<boost::shared_ptr<StackSparseMatrix>> StackSpinBlock::prebuild(int num_threads) const { + std::vector<boost::shared_ptr<StackSparseMatrix>> comps; + if (dmrginp.hamiltonian() == HUBBARD) return comps; + + StackSpinBlock* loopBlock=(leftBlock->is_loopblock()) ? leftBlock : rightBlock; + StackSpinBlock* otherBlock = loopBlock == leftBlock ? rightBlock : leftBlock; + + if (otherBlock->get_rightBlock() == 0) return comps; + + for (int i = 0; i < loopBlock->get_op_array(CRE_DES).get_size(); ++i) + for (int j = 0; j < loopBlock->get_op_array(CRE_DES).get_local_element(i).size(); ++j) { + boost::shared_ptr<StackSparseMatrix> op1 = loopBlock->get_op_array(CRE_DES).get_local_element(i)[j]; + int I = op1->get_orbs(0); + int J = op1->get_orbs(1); + if (otherBlock->get_op_array(CRE_DESCOMP).has_local_index(I, J)) { + boost::shared_ptr<StackSparseMatrix> op2; + if (!dmrginp.spinAdapted() || dmrginp.hamiltonian() == BCS) + op2 = otherBlock->get_op_array(CRE_DESCOMP).get_element(I, J).at(0); + else + op2 = otherBlock->get_op_rep(CRE_DESCOMP, -op1->get_deltaQuantum()[0], I, J); + + if (!op2->memoryUsed()) comps.push_back(op2); + + if (otherBlock->has(DES_CRECOMP)) { + if (!dmrginp.spinAdapted() || dmrginp.hamiltonian() == BCS) + op2 = otherBlock->get_op_array(DES_CRECOMP).get_element(I, J).at(0); + else + op2 = otherBlock->get_op_rep(DES_CRECOMP, op1->get_deltaQuantum()[0], I, J); + + if (!op2->memoryUsed()) comps.push_back(op2); + } + } + } + + for (int i = 0; i < loopBlock->get_op_array(CRE_CRE).get_size(); ++i) + for (int j = 0; j < loopBlock->get_op_array(CRE_CRE).get_local_element(i).size(); ++j) { + boost::shared_ptr<StackSparseMatrix> op1 = loopBlock->get_op_array(CRE_CRE).get_local_element(i)[j]; + int I = op1->get_orbs(0); + int J = op1->get_orbs(1); + if (otherBlock->get_op_array(DES_DESCOMP).has_local_index(I, J)) { + boost::shared_ptr<StackSparseMatrix> op2; + if (!dmrginp.spinAdapted() || dmrginp.hamiltonian() == BCS) + op2 = otherBlock->get_op_array(DES_DESCOMP).get_element(I, J).at(0); + else + op2 = otherBlock->get_op_rep(DES_DESCOMP, -op1->get_deltaQuantum()[0], I, J); + + if (!op2->memoryUsed()) comps.push_back(op2); + + if (otherBlock->has(CRE_CRECOMP)) { + if (!dmrginp.spinAdapted() || dmrginp.hamiltonian() == BCS) + op2 = otherBlock->get_op_array(CRE_CRECOMP).get_element(I, J).at(0); + else + op2 = otherBlock->get_op_rep(CRE_CRECOMP, op1->get_deltaQuantum()[0], I, J); + + if (!op2->memoryUsed()) comps.push_back(op2); + } + } + } + + // Now allocate comps + double additionalMem = 0.; + for (int i = 0; i < comps.size(); ++i) { + comps.at(i)->allocate(otherBlock->get_braStateInfo(), otherBlock->get_ketStateInfo()); + additionalMem += comps.at(i) -> memoryUsed(); + } + p3out << "additional memory used for pre-build complementary operators " + << additionalMem*8/1.e9 << " GB" << endl; + + SplitStackmem(); + + // omp parallel build +#pragma omp parallel for schedule(dynamic) + for (int i = 0; i < comps.size(); ++i) + comps.at(i)->build(*otherBlock); + MergeStackmem(); + + return comps; +} + void StackSpinBlock::multiplyH(StackWavefunction& c, StackWavefunction* v, int num_threads) const { @@ -1159,7 +1249,7 @@ void StackSpinBlock::multiplyH(StackWavefunction& c, StackWavefunction* v, int n } } - + for (int i = 0; iget_ketStateInfo().unCollectedStateInfo, otherBlock->get_ketStateInfo()); diff --git a/Stackspinblock.h b/Stackspinblock.h index 35ba1f4..ffab087 100644 --- a/Stackspinblock.h +++ b/Stackspinblock.h @@ -212,6 +212,7 @@ class StackSpinBlock void BuildSingleSlaterBlock(std::vector<int> sts); void set_loopblock(bool p_loopblock){loopblock = p_loopblock;} friend ostream& operator<< (ostream& os, const StackSpinBlock& b); + std::vector<boost::shared_ptr<StackSparseMatrix>> prebuild(int num_threads) const; void multiplyH(StackWavefunction& c, StackWavefunction* v, int num_threads) const; void multiplyH_2index(StackWavefunction& c, StackWavefunction* v, int num_threads) const; void multiplyOverlap(StackWavefunction& c, StackWavefunction* 
v, int num_threads) const; diff --git a/blas_calls.h b/blas_calls.h index 9809f70..c03f30b 100644 --- a/blas_calls.h +++ b/blas_calls.h @@ -33,47 +33,56 @@ Sandeep Sharma and Garnet K.-L. Chan #ifdef AIX extern "C" { +#if !defined(MKL_DIRECT_CALL) && !defined(MKL_DIRECT_CALL_SEQ) void daxpy(int *ntot, double *coeff, double *copy_from, int *inc1, double *copy_to, int *inc2); - void dcopy(int *ntot,double *copy_from,int *inc1, double *copy_to,int *inc2); - double ddot(int *ntot, double *x, int *incx, double *y, int *incy); void dgemm(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *A, int *lda, double *B, int *ldb, double *beta, double *C, int *ldc); - void dgemv(char *trans, int *m, int *n, double *alpha, - double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy); - void dscal(int *size,double *coeff,double *matrix,int *inc); - + double ddot(int *ntot, double *x, int *incx, double *y, int *incy); void saxpy(int *ntot, float *coeff, float *copy_from, int *inc1, float *copy_to, int *inc2); - void scopy(int *ntot, float *copy_from, int *inc1, float *copy_to, int *inc2); - float sdot(int *ntot, float *x, int *incx, float *y, int *incy); void sgemm(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *A, int *lda, float *B, int *ldb, float *beta, float *C, int *ldc); + float sdot(int *ntot, float *x, int *incx, float *y, int *incy); +#endif + + void dcopy(int *ntot,double *copy_from,int *inc1, double *copy_to,int *inc2); + + void dgemv(char *trans, int *m, int *n, double *alpha, + double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy); + void dscal(int *size,double *coeff,double *matrix,int *inc); + + void scopy(int *ntot, float *copy_from, int *inc1, float *copy_to, int *inc2); void sscal(int *size, float *coeff, float *matrix,int *inc); } #else //SGI, Linux extern "C" { +#if !defined(MKL_DIRECT_CALL) && !defined(MKL_DIRECT_CALL_SEQ) void daxpy_(FORTINT *ntot, double *coeff, double *copy_from, FORTINT *inc1, double *copy_to, FORTINT *inc2); - void dcopy_(FORTINT *ntot,double *copy_from,FORTINT *inc1, double *copy_to,FORTINT *inc2); double ddot_(FORTINT *ntot, double *x, FORTINT *incx, double *y, FORTINT *incy); void dgemm_(char *transa, char *transb, FORTINT *m, FORTINT *n, FORTINT *k, double *alpha, double *A, FORTINT *lda, double *B, FORTINT *ldb, double *beta, double *C, FORTINT *ldc); - void dgemv_(char *trans, FORTINT *m, FORTINT *n, double *alpha, - double *a, FORTINT *lda, double *x, FORTINT *incx, double *beta, double *y, FORTINT *incy); - void dscal_(FORTINT *size, double *coeff, double *matrix,FORTINT *inc); - void saxpy_(FORTINT *ntot, float *coeff, float *copy_from, FORTINT *inc1, float *copy_to, FORTINT *inc2); - void scopy_(FORTINT *ntot, float *copy_from, FORTINT *inc1, float *copy_to, FORTINT *inc2); float sdot_(FORTINT *ntot, float *x, FORTINT *incx, float *y, FORTINT *incy); void sgemm_(char *transa, char *transb, FORTINT *m, FORTINT *n, FORTINT *k, float *alpha, float *A, FORTINT *lda, float *B, FORTINT *ldb, float *beta, float *C, FORTINT *ldc); +#endif + + void dcopy_(FORTINT *ntot,double *copy_from,FORTINT *inc1, double *copy_to,FORTINT *inc2); + + void dgemv_(char *trans, FORTINT *m, FORTINT *n, double *alpha, + double *a, FORTINT *lda, double *x, FORTINT *incx, double *beta, double *y, FORTINT *incy); + void dscal_(FORTINT *size, double *coeff, double *matrix,FORTINT *inc); + + void scopy_(FORTINT *ntot, float *copy_from, FORTINT *inc1, float *copy_to, FORTINT *inc2); + void 
sscal_(FORTINT *size, float *coeff, float *matrix,FORTINT *inc); int idamax_(FORTINT &n, double* d, FORTINT &indx); //int idamax_(int &n, double* d, int &indx); @@ -86,26 +95,29 @@ extern "C" #ifdef AIX extern "C" { - void dgesvd(char* JOBU, char* JOBVT, int* M, int* N, double* A, int* LDA, double* S, double* U, int* LDU, double* VT, int* LDVT, double* WORK, int* LWORK, - int* INFO); +#if !defined(MKL_DIRECT_CALL) && !defined(MKL_DIRECT_CALL_SEQ) void daxpy(int *ntot, double *coeff, double *copy_from, int *inc1, double *copy_to, int *inc2); - void dcopy(int *ntot,double *copy_from,int *inc1, double *copy_to,int *inc2); double ddot(int *ntot, double *x, int *incx, double *y, int *incy); void dgemm(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *A, int *lda, double *B, int *ldb, double *beta, double *C, int *ldc); - void dgemv(char *trans, int *m, int *n, double *alpha, - double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy); - void dscal(int *size,double *coeff,double *matrix,int *inc); - void saxpy(int *ntot, float *coeff, float *copy_from, int *inc1, float *copy_to, int *inc2); - void scopy(int *ntot, float *copy_from, int *inc1, float *copy_to, int *inc2); float sdot(int *ntot, float *x, int *incx, float *y, int *incy); void sgemm(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *A, int *lda, float *B, int *ldb, float *beta, float *C, int *ldc); +#endif + + void dgesvd(char* JOBU, char* JOBVT, int* M, int* N, double* A, int* LDA, double* S, double* U, int* LDU, double* VT, int* LDVT, double* WORK, int* LWORK, + int* INFO); + + void dcopy(int *ntot,double *copy_from,int *inc1, double *copy_to,int *inc2); + void dgemv(char *trans, int *m, int *n, double *alpha, + double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy); + void dscal(int *size,double *coeff,double *matrix,int *inc); + void scopy(int *ntot, float *copy_from, int *inc1, float *copy_to, int *inc2); void sscal(int *size, float *coeff, float *matrix,int *inc); void dstev(char* JOBZ,FORTINT* N,double* A,double* E, double* W, FORTINT* Wlen, double*WORK, FORTINT* INFO); void dsyev(char* JOBZ,char* UPLO,int* N,double* A,int* LDA,double* W,double*WORK,int*LWORK,int* INFO); @@ -114,26 +126,30 @@ extern "C" #else //SGI, Linux extern "C" { - void dgesvd_(char* JOBU, char* JOBVT, FORTINT* M, FORTINT* N, double* A, FORTINT* LDA, double* S, double* U, FORTINT* LDU, double* VT, FORTINT* LDVT, double* WORK, FORTINT* LWORK, - FORTINT* INFO); +#if !defined(MKL_DIRECT_CALL) && !defined(MKL_DIRECT_CALL_SEQ) void daxpy_(FORTINT *ntot, double *coeff, double *copy_from, FORTINT *inc1, double *copy_to, FORTINT *inc2); - void dcopy_(FORTINT *ntot,double *copy_from,FORTINT *inc1, double *copy_to,FORTINT *inc2); double ddot_(FORTINT *ntot, double *x, FORTINT *incx, double *y, FORTINT *incy); void dgemm_(char *transa, char *transb, FORTINT *m, FORTINT *n, FORTINT *k, double *alpha, double *A, FORTINT *lda, double *B, FORTINT *ldb, double *beta, double *C, FORTINT *ldc); - void dgemv_(char *trans, FORTINT *m, FORTINT *n, double *alpha, - double *a, FORTINT *lda, double *x, FORTINT *incx, double *beta, double *y, FORTINT *incy); - void dscal_(FORTINT *size, double *coeff, double *matrix,FORTINT *inc); - void saxpy_(FORTINT *ntot, float *coeff, float *copy_from, FORTINT *inc1, float *copy_to, FORTINT *inc2); - void scopy_(FORTINT *ntot, float *copy_from, FORTINT *inc1, float *copy_to, FORTINT *inc2); float sdot_(FORTINT *ntot, float *x, FORTINT *incx, float *y, 
FORTINT *incy); void sgemm_(char *transa, char *transb, FORTINT *m, FORTINT *n, FORTINT *k, float *alpha, float *A, FORTINT *lda, float *B, FORTINT *ldb, float *beta, float *C, FORTINT *ldc); +#endif + + void dgesvd_(char* JOBU, char* JOBVT, FORTINT* M, FORTINT* N, double* A, FORTINT* LDA, double* S, double* U, FORTINT* LDU, double* VT, FORTINT* LDVT, double* WORK, FORTINT* LWORK, + FORTINT* INFO); + + void dcopy_(FORTINT *ntot,double *copy_from,FORTINT *inc1, double *copy_to,FORTINT *inc2); + void dgemv_(char *trans, FORTINT *m, FORTINT *n, double *alpha, + double *a, FORTINT *lda, double *x, FORTINT *incx, double *beta, double *y, FORTINT *incy); + void dscal_(FORTINT *size, double *coeff, double *matrix,FORTINT *inc); + + void scopy_(FORTINT *ntot, float *copy_from, FORTINT *inc1, float *copy_to, FORTINT *inc2); void sscal_(FORTINT *size, float *coeff, float *matrix,FORTINT *inc); void dstev_(char* JOBZ,FORTINT* N,double* A,double* E, double* W, FORTINT* Wlen, double*WORK, FORTINT* INFO); void dsyev_(char* JOBZ,char* UPLO,FORTINT* N,double* A,FORTINT* LDA,double* W,double*WORK,FORTINT*LWORK,FORTINT* INFO); @@ -153,6 +169,10 @@ extern "C" ** int ntot: length of data ** int inc1,inc2: increments for copy_from, copy_to */ +#if defined(MKL_DIRECT_CALL) || defined(MKL_DIRECT_CALL_SEQ) +#undef DAXPY +#undef SAXPY +#endif inline void DAXPY(FORTINT ntot, double coeff, double *copy_from, FORTINT inc1, double *copy_to, FORTINT inc2) { @@ -171,7 +191,6 @@ inline void SAXPY(FORTINT ntot, float coeff, float *copy_from, FORTINT inc1, saxpy_(&ntot,&coeff,copy_from,&inc1,copy_to,&inc2); #endif } - /* ** ** void DCOPY(int ntot, double *x, int incx, double *y, int *incy); @@ -208,6 +227,10 @@ inline void SCOPY(FORTINT ntot,float *copy_from,FORTINT inc1,float *copy_to,FORT ** int ntot: length of x,y ** int incx,incy: increments for x,y */ +#if defined(MKL_DIRECT_CALL) || defined(MKL_DIRECT_CALL_SEQ) +#undef DDOT +#undef SDOT +#endif inline double DDOT(FORTINT ntot, double *x, FORTINT incx, double *y, FORTINT incy) { #ifdef AIX @@ -224,7 +247,6 @@ inline float SDOT(FORTINT ntot, float *x, FORTINT incx, float *y, FORTINT incy) return sdot_(&ntot,x,&incx,y,&incy); #endif } - /* ** ** void DGEMM(char transa, char transb, int m, int n, int k, double alpha, @@ -283,6 +305,9 @@ inline float SDOT(FORTINT ntot, float *x, FORTINT incx, float *y, FORTINT incy) ** On exit, ldc is unchanged. ** */ +#if defined(MKL_DIRECT_CALL) || defined(MKL_DIRECT_CALL_SEQ) +#undef DGEMM +#endif inline void DGEMM(char transa, char transb, FORTINT m, FORTINT n, FORTINT k, double alpha, double *A, FORTINT lda, double *B, FORTINT ldb, double beta, double *C, FORTINT ldc) @@ -331,7 +356,9 @@ inline void GESV(FORTINT n, FORTINT nrhs, double* a, FORTINT lda, FORTINT* ipiv, #endif } - +#if defined(MKL_DIRECT_CALL) || defined(MKL_DIRECT_CALL_SEQ) +#undef SGEMM +#endif inline void SGEMM(char transa, char transb, FORTINT m, FORTINT n, FORTINT k, float alpha, float *A, FORTINT lda, float *B, FORTINT ldb, float beta, float *C, FORTINT ldc)
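// The davidson.C changes that follow tie the prebuilt two-index
// complementary operators to the lifetime of the Davidson functor:
// constructed once when multiply_h is created, freed in reverse (LIFO)
// order when it is destroyed, matching the stack memory discipline of
// SplitStackmem/MergeStackmem. A condensed sketch of the pattern (`big`
// and `onedot` are hypothetical caller names):
//
//   {
//     multiply_h hmult(big, onedot);   // ctor: twoidxcomps = big.prebuild(MAX_THRD)
//     // ... Davidson iterations call hmult(c, v) repeatedly ...
//   }                                  // dtor: deallocates twoidxcomps back-to-front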
diff --git a/davidson.C b/davidson.C index f7af55c..abb0853 100644 --- a/davidson.C +++ b/davidson.C @@ -12,33 +12,40 @@ Sandeep Sharma and Garnet K.-L. Chan #endif #include "Stackwavefunction.h" #include "Stackspinblock.h" -#include "StackBaseOperator.h" #include "MatrixBLAS.h" - -SpinAdapted::multiply_h::multiply_h(const StackSpinBlock& b, const bool &onedot_) : block(b){} - +SpinAdapted::multiply_h::multiply_h(const StackSpinBlock& b, const bool &onedot_) : block(b){ + if (dmrginp.prebuild() && dmrginp.doimplicitTranspose()) + twoidxcomps = b.prebuild(MAX_THRD); +} void SpinAdapted::multiply_h::operator()(StackWavefunction& c, StackWavefunction& v) { block.multiplyH( c, &v, MAX_THRD); - //block.multiplyH_2index( c, &v, MAX_THRD); - //pout << "c" << endl; - //pout << c << endl; - //pout << "v" << endl; - //pout << v << endl; } -SpinAdapted::multiply_h_2Index::multiply_h_2Index(const StackSpinBlock& b, const bool &onedot_) : block(b){} +SpinAdapted::multiply_h::~multiply_h() { + if (dmrginp.prebuild() && dmrginp.doimplicitTranspose()) { + for (int i = twoidxcomps.size()-1; i >= 0; --i) + twoidxcomps.at(i)->deallocate(); + } +} + +SpinAdapted::multiply_h_2Index::multiply_h_2Index(const StackSpinBlock& b, const bool &onedot_) : block(b){ + if (dmrginp.prebuild() && dmrginp.doimplicitTranspose()) + twoidxcomps = b.prebuild(MAX_THRD); +} void SpinAdapted::multiply_h_2Index::operator()(StackWavefunction& c, StackWavefunction& v) { block.multiplyH_2index( c, &v, MAX_THRD); - //block.multiplyH3index( c, &v, MAX_THRD); } - - - +SpinAdapted::multiply_h_2Index::~multiply_h_2Index() { + if (dmrginp.prebuild() && dmrginp.doimplicitTranspose()) { + for (int i = twoidxcomps.size()-1; i >= 0; --i) + twoidxcomps.at(i)->deallocate(); + } +} diff --git a/davidson.h b/davidson.h index d34b403..adc5e95 100644 --- a/davidson.h +++ b/davidson.h @@ -8,6 +8,8 @@ Sandeep Sharma and Garnet K.-L. Chan #ifndef SPIN_DAVIDSON_HEADER #define SPIN_DAVIDSON_HEADER +#include <vector> +#include "StackBaseOperator.h" namespace SpinAdapted{ class StackWavefunction; @@ -17,26 +19,31 @@ struct Davidson_functor { virtual void operator()(StackWavefunction& c, StackWavefunction& v) = 0; virtual const StackSpinBlock& get_block() = 0; + virtual ~Davidson_functor() {} }; class multiply_h : public Davidson_functor { private: const StackSpinBlock& block; + std::vector<boost::shared_ptr<StackSparseMatrix>> twoidxcomps; public: multiply_h(const StackSpinBlock& b, const bool &onedot_); void operator()(StackWavefunction& c, StackWavefunction& v); const StackSpinBlock& get_block() {return block;} + ~multiply_h(); }; class multiply_h_2Index : public Davidson_functor { private: const StackSpinBlock& block; + std::vector<boost::shared_ptr<StackSparseMatrix>> twoidxcomps; public: multiply_h_2Index(const StackSpinBlock& b, const bool &onedot_); void operator()(StackWavefunction& c, StackWavefunction& v); const StackSpinBlock& get_block() {return block;} + ~multiply_h_2Index(); }; } diff --git a/dmrg.C b/dmrg.C index 6e4334e..204539f 100644 --- a/dmrg.C +++ b/dmrg.C @@ -200,7 +200,7 @@ int calldmrg(char* input, char* output) cpu_set_t *oldmask, newmask; int oldset, newset; - //if only a subset of processors are being used, then unset processor affinity + // if only a subset of processors are being used, then unset processor affinity if (m_calc_procs.size() < world.size()) { sched_getaffinity(0, oldset, oldmask); CPU_ZERO( &newmask); @@ -293,20 +293,17 @@ int calldmrg(char* input, char* output) dmrginp.set_algorithm_method() = ONEDOT; //initialize state info and canonicalize wavefunction is always done using onedot algorithm if (mpigetrank()==0) { - Sweep::InitializeStateInfo(sweepParams, direction, baseState); - Sweep::InitializeStateInfo(sweepParams, !direction, baseState); - 
diff --git a/dmrg.C b/dmrg.C
index 6e4334e..204539f 100644
--- a/dmrg.C
+++ b/dmrg.C
@@ -200,7 +200,7 @@ int calldmrg(char* input, char* output)
   cpu_set_t *oldmask, newmask;
   int oldset, newset;
 
-  //if only a subset of processors are being used, then unset processor affinity
+  // if only a subset of processors are being used, then unset processor affinity
   if (m_calc_procs.size() < world.size()) {
     sched_getaffinity(0, oldset, oldmask);
     CPU_ZERO( &newmask);
@@ -293,20 +293,17 @@ int calldmrg(char* input, char* output)
     dmrginp.set_algorithm_method() = ONEDOT; //initialize state info and canonicalize wavefunction is always done using onedot algorithm
     if (mpigetrank()==0) {
-     Sweep::InitializeStateInfo(sweepParams, direction, baseState);
-     Sweep::InitializeStateInfo(sweepParams, !direction, baseState);
-     Sweep::CanonicalizeWavefunction(sweepParams, direction, baseState);
-     Sweep::CanonicalizeWavefunction(sweepParams, !direction, baseState);
-     Sweep::CanonicalizeWavefunction(sweepParams, direction, baseState);
+      Sweep::InitializeStateInfo(sweepParams, direction, baseState);
+      Sweep::InitializeStateInfo(sweepParams, !direction, baseState);
+      Sweep::CanonicalizeWavefunction(sweepParams, direction, baseState);
+      Sweep::CanonicalizeWavefunction(sweepParams, !direction, baseState);
+      Sweep::CanonicalizeWavefunction(sweepParams, direction, baseState);
     }
     dmrginp.set_algorithm_method() = atype;
   }
-  //this genblock is required to generate all the nontranspose operators
   dmrginp.setimplicitTranspose() = false;
   SweepGenblock::do_one(sweepParams, false, false, false, restartsize, baseState, baseState);
-
-
   compress(sweep_tol, targetState, baseState);
 }
 else if (dmrginp.calc_type() == RESPONSEBW)
@@ -490,6 +487,9 @@ int calldmrg(char* input, char* output)
   else if (dmrginp.calc_type() == TINYCALC) {
     Sweep::tiny(sweep_tol);
   }
+  else if (dmrginp.calc_type() == RESTART_ONEPDM) {
+    Npdm::npdm(NPDM_ONEPDM);
+  }
   else if (dmrginp.calc_type() == RESTART_THREEPDM) {
     Npdm::npdm(NPDM_THREEPDM);
   }
@@ -572,8 +572,7 @@ int calldmrg(char* input, char* output)
 #ifndef SERIAL
   }
-  //world.barrier();
-  sleepBarrier(world, 0, 10);
+  sleepBarrier(world, 0, 1);
   MPI_Comm_free(&Calc);
   sched_setaffinity(0, sizeof(oldmask), oldmask);
 #endif
@@ -1136,7 +1135,11 @@ void responsepartialSweep(double sweep_tol, int targetState, vector& projec
   }
   w2.CollectQuantaAlongColumns(*state.leftStateInfo, const_cast<StateInfo&>(*newSystem.get_stateInfo().unCollectedStateInfo));
   StateInfo bigstate;
-  TensorProduct(*state.leftStateInfo, const_cast<StateInfo&>(newSystem.get_stateInfo()), bigstate, PARTICLE_SPIN_NUMBER_CONSTRAINT);
+  if (dmrginp.hamiltonian() == BCS) {
+    TensorProduct(*state.leftStateInfo, const_cast<StateInfo&>(newSystem.get_stateInfo()), bigstate, SPIN_NUMBER_CONSTRAINT);
+  } else {
+    TensorProduct(*state.leftStateInfo, const_cast<StateInfo&>(newSystem.get_stateInfo()), bigstate, PARTICLE_SPIN_NUMBER_CONSTRAINT);
+  }
   w2.SaveWavefunctionInfo(bigstate, sites, baseStates[0]);
   w2.deallocate();
   w.deallocate();
@@ -1374,7 +1377,6 @@ void compress(double sweep_tol, int targetState, int baseState)
   sweepParams.current_root() = -1;
   //this is the warmup sweep, the baseState is used as the initial guess for the targetState
   last_fe = SweepCompress::do_one(sweepParams, true, true, false, 0, targetState, baseState);
-  direction = false;
 
   while ( true)
   {
diff --git a/enumerator.h b/enumerator.h
index 86e9b9f..5a866f3 100644
--- a/enumerator.h
+++ b/enumerator.h
@@ -34,7 +34,7 @@ namespace SpinAdapted{
  enum keywords{ORBS, LASTM, STARTM, MAXM, REORDER, HF_OCC, SCHEDULE, SYM, NELECS, SPIN, IRREP, MAXJ, PREFIX, NROOTS, DOCD, DEFLATION_MAX_SIZE, MAXITER, BASENERGY, SCREEN_TOL, ODOT, SWEEP_TOL, OUTPUTLEVEL, NONSPINADAPTED, BOGOLIUBOV, TWODOT_NOISE, WARMUP,
-               NPDM_INTERMEDIATE, NPDM_NO_INTERMEDIATE, NPDM_MULTINODE, NPDM_NO_MULTINODE, NUMKEYWORDS};
+               NPDM_INTERMEDIATE, NPDM_NO_INTERMEDIATE, NPDM_MULTINODE, NPDM_NO_MULTINODE, PREBUILD, NUMKEYWORDS};
 
 enum {
   NO_PARTICLE_SPIN_NUMBER_CONSTRAINT,
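
A pattern that recurs throughout this patch: wherever a StateInfo tensor product is formed, PARTICLE_SPIN_NUMBER_CONSTRAINT is replaced by SPIN_NUMBER_CONSTRAINT when the Hamiltonian is BCS. A Bogoliubov pairing Hamiltonian does not conserve particle number, so the product quanta can only be restricted by spin. A sketch of the choice, factored into a helper (the helper is hypothetical; the patch inlines the branch at each call site, and the enum values mirror enumerator.h):

    enum Constraint { NO_PARTICLE_SPIN_NUMBER_CONSTRAINT,
                      PARTICLE_SPIN_NUMBER_CONSTRAINT,
                      SPIN_NUMBER_CONSTRAINT };
    enum HamType { QUANTUM_CHEMISTRY, BCS };

    // BCS breaks particle-number conservation: constrain by spin only.
    inline Constraint tensorConstraint(HamType ham) {
      return ham == BCS ? SPIN_NUMBER_CONSTRAINT
                        : PARTICLE_SPIN_NUMBER_CONSTRAINT;
    }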
diff --git a/fciqmchelper.C b/fciqmchelper.C
index f31b6ad..606c670 100644
--- a/fciqmchelper.C
+++ b/fciqmchelper.C
@@ -4,6 +4,7 @@
 #include "Stackspinblock.h"
 #include "initblocks.h"
 #ifndef SERIAL
+#include "mpi.h"
 #include
 #endif
@@ -172,14 +173,14 @@ namespace SpinAdapted{
   //this is a helper function to make a MPS from a occupation number representation of determinant
   //this representation is slightly different than the usual occupation, here each integer
   //element is a spatial orbital which can have a value 0, -1, 1, or 2.
-  MPS::MPS(ulong *occnum, int length)
+  MPS::MPS(unsigned long *occnum, int length)
   {
     assert(length*64 >= dmrginp.last_site());
     //convert the int array into a vector
     std::vector<int> occ(dmrginp.last_site(), 0);
-    ulong temp = 1;
+    unsigned long temp = 1;
     int index = 0;
     for (int i=0; i
@@ ... @@ namespace SpinAdapted{
+    if (dmrginp.hamiltonian() == BCS) {
+      TensorProduct(currentState, const_cast<StateInfo&> (MPS::siteBlocks[MPS::sweepIters+1].get_stateInfo()), bigState, SPIN_NUMBER_CONSTRAINT);
+    } else {
       TensorProduct(currentState, const_cast<StateInfo&> (MPS::siteBlocks[MPS::sweepIters+1].get_stateInfo()), bigState, PARTICLE_SPIN_NUMBER_CONSTRAINT);
+    }
 
     if (!dmrginp.spinAdapted())
       rotSites[1] -= 2;
@@ -393,25 +398,26 @@ namespace SpinAdapted{
     InitBlocks::InitStartingBlock(system, forward, leftState, statebindex, forward_starting_size, backward_starting_size, restartSize, restart, warmUp, integralIndex);
     SpinQuantum hq(0, SpinSpace(0), IrrepSpace(0));
-    p2out << system<
     std::vector<Matrix> Rotationa, Rotationb;
     int sys_add = true; bool direct = true;
     std::vector<int> rotSites(2,0);
     int sweepIters = dmrginp.spinAdapted() ? dmrginp.last_site() -2 : dmrginp.last_site()/2-2;
+    int normToComp = sweepIters/2;
+    for (int i=0; i
+      if (i>=normToComp && !system.has(CRE_DESCOMP))
+        system.addAllCompOps();
       system.addAdditionalOps();
       StackSpinBlock dotsite(i+1, i+1, integralIndex, false);
       if (mpigetrank() == 0) {
-       rotSites[1] = i+1;
-       LoadRotationMatrix(rotSites, Rotationa, statea);
-       LoadRotationMatrix(rotSites, Rotationb, stateb);
-       //Rotationa = statea.getSiteTensors(i+1);
-       //Rotationb = stateb.getSiteTensors(i+1);
+        rotSites[1] = dmrginp.spinAdapted() ? i+1 : i*2+3;
+        LoadRotationMatrix(rotSites, Rotationa, statea);
+        LoadRotationMatrix(rotSites, Rotationb, stateb);
+        //Rotationa = statea.getSiteTensors(i+1);
+        //Rotationb = stateb.getSiteTensors(i+1);
       }
 #ifndef SERIAL
@@ -419,7 +425,11 @@ namespace SpinAdapted{
       mpi::broadcast(calc, Rotationa, 0);
       mpi::broadcast(calc, Rotationb, 0);
 #endif
-      InitBlocks::InitNewSystemBlock(system, dotsite, newSystem, 0, statebindex, sys_add, direct, integralIndex, DISTRIBUTED_STORAGE, false, true);
+      if (i < normToComp) {
+        InitBlocks::InitNewSystemBlock(system, dotsite, newSystem, 0, statebindex, sys_add, direct, integralIndex, DISTRIBUTED_STORAGE, true, false);
+      } else {
+        InitBlocks::InitNewSystemBlock(system, dotsite, newSystem, 0, statebindex, sys_add, direct, integralIndex, DISTRIBUTED_STORAGE, false, true);
+      }
       newSystem.transform_operators(const_cast<std::vector<Matrix>&>(Rotationa), const_cast<std::vector<Matrix>&>(Rotationb));
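
The MPS(unsigned long*, int) constructor above decodes a determinant packed into 64-bit words (the assert requires length*64 >= dmrginp.last_site()); the body of its loop was lost in extraction. A sketch of the indexing that packing implies, assuming one bit per spin orbital, with the bit order inside each word chosen arbitrarily (LSB-first here):

    #include <cassert>
    #include <vector>

    // Assumed semantics only: bit i of the determinant lives in word i/64
    // at position i%64; the real constructor may scan in a different order.
    std::vector<int> unpackOccupation(const unsigned long* occnum,
                                      int nwords, int norbs) {
      assert(nwords * 64 >= norbs);
      std::vector<int> occ(norbs, 0);
      for (int i = 0; i < norbs; ++i) {
        unsigned long word = occnum[i / 64];
        occ[i] = (word >> (i % 64)) & 1ul;  // 1 if spin orbital i is occupied
      }
      return occ;
    }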
@@ -433,32 +443,47 @@ namespace SpinAdapted{
     }
 
     StackSpinBlock newSystem, big;
-    StackSpinBlock dotsite1(sweepIters, sweepIters, integralIndex, false);
-    StackSpinBlock dotsite2(sweepIters+1, sweepIters+1, integralIndex, false);
-
+    StackSpinBlock dotsite1(sweepIters, sweepIters, integralIndex, sameStates);
+    StackSpinBlock dotsite2(sweepIters+1, sweepIters+1, integralIndex, sameStates);
     system.addAdditionalOps();
     InitBlocks::InitNewSystemBlock(system, dotsite1, newSystem, 0, statebindex, sys_add, direct, integralIndex, DISTRIBUTED_STORAGE, false, true);
 
     newSystem.set_loopblock(false); system.set_loopblock(false);
-    InitBlocks::InitBigBlock(newSystem, dotsite2, big);
+    InitBlocks::InitBigBlock(newSystem, dotsite2, big);
 
     StackWavefunction stateaw, statebw;
     StateInfo s;
-    rotSites[1] += 1;
+    rotSites[1] += dmrginp.spinAdapted() ? 1 : 2;
     stateaw.LoadWavefunctionInfo(s, rotSites, statea, true);
     statebw.LoadWavefunctionInfo(s, rotSites, stateb, true);
+#ifndef SERIAL
+    mpi::broadcast(calc, stateaw, 0);
+    mpi::broadcast(calc, statebw, 0);
+    if (mpigetrank() != 0) {
+      double* dataa = Stackmem[omprank].allocate(stateaw.memoryUsed());
+      stateaw.set_data(dataa);
+      stateaw.allocateOperatorMatrix();
+      double* datab = Stackmem[omprank].allocate(statebw.memoryUsed());
+      statebw.set_data(datab);
+      statebw.allocateOperatorMatrix();
+    }
+    calc.barrier();
+    MPI_Bcast(stateaw.get_data(), stateaw.memoryUsed(), MPI_DOUBLE, 0, Calc);
+    MPI_Bcast(statebw.get_data(), statebw.memoryUsed(), MPI_DOUBLE, 0, Calc);
+#endif
+
     StackWavefunction temp; temp.initialise(stateaw);
     temp.Clear();
-
-
+    calc.barrier();
     big.multiplyH_2index(statebw, &temp, 1);
-
+    calc.barrier();
     if (mpigetrank() == 0)
       h = DotProduct(stateaw, temp);
+    calc.barrier();
 
     temp.Clear();
     big.multiplyOverlap(statebw, &temp, 1);
@@ -466,7 +491,6 @@ namespace SpinAdapted{
       o = DotProduct(stateaw, temp);
 
 #ifndef SERIAL
-    mpi::communicator world;
     mpi::broadcast(calc, h, 0);
     mpi::broadcast(calc, o, 0);
 #endif
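
The #ifndef SERIAL block above broadcasts each wavefunction in two phases: boost::mpi ships the serializable skeleton, every non-root rank then allocates a matching raw buffer, and one flat MPI_Bcast moves the numeric payload instead of re-serializing every double. A stripped-down sketch of the same idea (Wavefunction is a stand-in type, not the StackWavefunction API):

    #include <mpi.h>
    #include <vector>

    struct Wavefunction { std::vector<double> data; };

    void broadcastWavefunction(Wavefunction& w, MPI_Comm comm) {
      int rank; MPI_Comm_rank(comm, &rank);
      unsigned long n = w.data.size();                 // phase 1: shape/metadata
      MPI_Bcast(&n, 1, MPI_UNSIGNED_LONG, 0, comm);
      if (rank != 0) w.data.resize(n);                 // allocate before receiving
      MPI_Bcast(w.data.data(), (int)n, MPI_DOUBLE, 0, comm);  // phase 2: payload
    }

The barriers around multiplyH_2index in the hunk serve the same purpose as the calc.barrier() before the raw MPI_Bcast: the payload transfer assumes all ranks have finished allocating.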
diff --git a/fciqmchelper.h b/fciqmchelper.h
index 969566b..fba7041 100644
--- a/fciqmchelper.h
+++ b/fciqmchelper.h
@@ -39,7 +39,7 @@ class MPS{
   MPS() {};
   MPS(int stateindex);
   MPS(std::vector<int>& occ);
-  MPS(ulong* occnum, int length);
+  MPS(unsigned long* occnum, int length);
   //void buildMPSrep();
   std::vector<Matrix>& getSiteTensors(int i) {return SiteTensors[i];}
   const std::vector<Matrix>& getSiteTensors(int i) const {return SiteTensors[i];}
diff --git a/include/multiarray.h b/include/multiarray.h
index e069c26..061b685 100644
--- a/include/multiarray.h
+++ b/include/multiarray.h
@@ -12,6 +12,7 @@ Sandeep Sharma and Garnet K.-L. Chan
 #include
 #include
 #include
+#include
 #include
 #include
diff --git a/input.C b/input.C
index 4ed5e7b..355cb6b 100644
--- a/input.C
+++ b/input.C
@@ -130,6 +130,7 @@ void SpinAdapted::Input::initialize_defaults()
   m_guess_permutations = 10;
 
   m_direct = true;
+  m_prebuild = false;
   m_nroots = 1;
   m_weights.resize(m_nroots);
   m_weights[0] = 1.;
@@ -184,6 +185,7 @@ void SpinAdapted::Input::initialize_defaults()
   m_maxM = 0;
   m_lastM = 500;
   m_startM = 250;
+  m_bra_M = 0;
 
   m_calc_ri_4pdm=false;
   m_store_ripdm_readable=false;
@@ -303,8 +305,17 @@ SpinAdapted::Input::Input(const string& config_name) {
       }
       m_Bogoliubov = true;
       m_ham_type = BCS;
-    }
-    else if (boost::iequals(keyword, "warmup")) {
+    } else if (boost::iequals(keyword, "prebuild")) {
+      if (usedkey[PREBUILD] == 0)
+        usedkey_error(keyword, msg);
+      usedkey[PREBUILD] = 0;
+      if (tok.size() != 1) {
+        pout << "keyword prebuild is a stand alone keyword" << endl;
+        pout << msg << endl;
+        abort();
+      }
+      m_prebuild = true;
+    } else if (boost::iequals(keyword, "warmup")) {
       if (usedkey[WARMUP] == 0)
        usedkey_error(keyword, msg);
       usedkey[WARMUP] = 0;
@@ -967,9 +978,18 @@ SpinAdapted::Input::Input(const string& config_name) {
     {
       m_npdm_multinode = false;
     }
-
-
-
+    else if (boost::iequals(keyword, "specificpdm"))
+    {
+      if(tok.size() == 2) {
+        m_specificpdm.push_back(atoi(tok[1].c_str()));
+      } else if (tok.size() == 3) {
+        m_specificpdm.push_back(atoi(tok[1].c_str()));
+        m_specificpdm.push_back(atoi(tok[2].c_str()));
+      } else {
+        pout << "keyword "<
 1 ) {
-      pout << "Currently the response code does not work with non-particle number conserving hamiltonians!!";
-      abort();
-    }
-    //if (sym != "c1")
     // must be initialized even if c1 sym.
     Symmetry::InitialiseTable(sym);
@@ -1311,7 +1337,7 @@ SpinAdapted::Input::Input(const string& config_name) {
       v_cccc[integral].rhf = true;
       v_cccd[integral].rhf = true;
     }
-
+    // Kij-based ordering by GA opt.
 #ifndef SERIAL
     mpi::broadcast(world,m_reorderType,0);
@@ -1376,7 +1402,7 @@ SpinAdapted::Input::Input(const string& config_name) {
         }
       }
       else
-       readorbitalsfile(orbitalfile[integral], v_1[integral], v_2[integral], coreEnergy[integral], integral);
+        readorbitalsfile(orbitalfile[integral], v_1[integral], v_2[integral], coreEnergy[integral], integral);
     }
   }
@@ -1445,30 +1471,35 @@ SpinAdapted::Input::Input(const string& config_name) {
   mpi::broadcast(world, PROPBITLEN, 0);
 #endif
   //if (mpigetrank() == 0) {
+  // int s = 0;
   //cout << "v_1" << endl;
   //for (int i = 0; i < m_norbs; ++i)
   //  for (int j = 0; j < m_norbs; ++j)
-  //    cout << fixed << setprecision(12) << v_1[0](i,j) << " " << i << " " << j << endl;
+  //    if (fabs(v_1[s](i,j)) > 1e-12)
+  //      cout << fixed << setprecision(12) << v_1[s](i,j) << " " << i << " " << j << endl;
 
   //cout << "v_2" << endl;
   //for (int i = 0; i < m_norbs; ++i)
   //  for (int j = 0; j < m_norbs; ++j)
   //    for (int k = 0; k < m_norbs; ++k)
   //      for (int l = 0; l < m_norbs; ++l)
-  //        cout << fixed << setprecision(12) << v_2[0](i,j,k,l)
+  //        if (fabs(v_2[s](i,j,k,l)) > 1e-12)
+  //          cout << fixed << setprecision(12) << v_2[s](i,j,k,l)
   //             << " " << i << " " << j << " " << k << " " << l << endl;
 
   //cout << "v_cc" << endl;
   //for (int i = 0; i < m_norbs; ++i)
   //  for (int j = 0; j < m_norbs; ++j)
-  //    cout << fixed << setprecision(12) << v_cc[0](i,j) << " " << i << " " << j << endl;
+  //    if (fabs(v_cc[s](i,j)) > 1e-12)
+  //      cout << fixed << setprecision(12) << v_cc[s](i,j) << " " << i << " " << j << endl;
 
   //cout << "v_cccc" << endl;
   //for (int i = 0; i < m_norbs; ++i)
   //  for (int j = 0; j < m_norbs; ++j)
   //    for (int k = 0; k < m_norbs; ++k)
   //      for (int l = 0; l < m_norbs; ++l)
-  //        cout << fixed << setprecision(12) << v_cccc[0](i,j,k,l)
+  //        if (fabs(v_cccc[s](i,j,k,l)) > 1e-12)
+  //          cout << fixed << setprecision(12) << v_cccc[s](i,j,k,l)
   //             << " " << i << " " << j << " " << k << " " << l << endl;
 
   //cout << "v_cccd" << endl;
@@ -1476,11 +1507,11 @@ SpinAdapted::Input::Input(const string& config_name) {
   //  for (int j = 0; j < m_norbs; ++j)
   //    for (int k = 0; k < m_norbs; ++k)
   //      for (int l = 0; l < m_norbs; ++l)
-  //        cout << fixed << setprecision(12) << v_cccd[0](i,j,k,l)
+  //        if (fabs(v_cccd[s](i,j,k,l))>1e-12)
+  //          cout << fixed << setprecision(12) << v_cccd[s](i,j,k,l)
   //             << " " << i << " " << j << " " << k << " " << l << endl;
   //}
   //world.barrier();
-  //exit(0);
 }
 
 void SpinAdapted::Input::readreorderfile(ifstream& dumpFile, std::vector<int>& oldtonew) {
@@ -1515,7 +1546,6 @@ void SpinAdapted::Input::readreorderfile(ifstream& dumpFile, std::vector<int>& o
   pout << "Numbers or orbitals in reorder file should be equal to "<
 O_reorder(3, 1)
   bool RHF = true;
   int AOrbOffset = 0, BOrbOffset = 0;
-
+  if (rank == 0) {
+    reorder.resize(m_norbs/2);
+    for (int i=0; i
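
Given the parsing above, specificpdm takes either one integer (compute only the PDM of that state) or two integers (compute only the transition PDM between two states); any other token count aborts. A hypothetical input-file fragment, assuming Block's usual one-keyword-per-line format:

    specificpdm 2

or, for a transition density matrix,

    specificpdm 0 1

The values land in m_specificpdm and are consumed by the npdm driver further down in this patch.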
diff --git a/input.h b/input.h
--- a/input.h
+++ b/input.h
@@ ... @@ class Input {
+  std::vector<int> m_specificpdm;
   bool m_npdm_multinode;
   bool m_set_Sz;
   int m_maxiter;
@@ -135,6 +137,7 @@ class Input {
   std::string m_save_prefix;
   std::string m_load_prefix;
   bool m_direct;
+  bool m_prebuild;
   std::vector<double> m_orbenergies;
   int m_maxj;
@@ -185,16 +188,16 @@ class Input {
     ar & m_sweep_iter_schedule & m_sweep_state_schedule & m_sweep_qstate_schedule & m_sweep_tol_schedule & m_sweep_noise_schedule &m_sweep_additional_noise_schedule & m_reorder;
     ar & m_molecule_quantum & m_total_symmetry_number & m_total_spin & m_orbenergies & m_add_noninteracting_orbs;
     ar & m_bra_symmetry_number & m_permSymm & m_activeorbs & m_excitation & m_openorbs & m_closedorbs;
-    ar & m_save_prefix & m_load_prefix & m_direct & m_max_lanczos_dimension & m_performResponseSolution;
+    ar & m_save_prefix & m_load_prefix & m_direct & m_prebuild & m_max_lanczos_dimension & m_performResponseSolution;
     ar & m_deflation_min_size & m_deflation_max_size & m_outputlevel & m_reorderfile;
     ar & m_algorithm_type & m_twodot_to_onedot_iter & m_orbformat & m_calc_procs;
     ar & m_nquanta & m_sys_add & m_env_add & m_do_fci & m_no_transform ;
-    ar & m_do_pdm & m_do_npdm_ops & m_do_npdm_in_core & m_npdm_generate & m_new_npdm_code & m_transition_diff_spatial_irrep & m_occupied_orbitals;
+    ar & m_do_pdm & m_do_npdm_ops & m_do_npdm_in_core & m_npdm_generate & m_new_npdm_code & m_specificpdm & m_transition_diff_spatial_irrep & m_occupied_orbitals;
     ar & m_store_spinpdm &m_spatpdm_disk_dump & m_pdm_unsorted & m_npdm_intermediate & m_npdm_multinode;
     ar & m_maxj & m_ninej & m_maxiter & m_do_deriv & m_oneindex_screen_tol & m_twoindex_screen_tol & m_quantaToKeep & m_noise_type;
     ar & m_sweep_tol & m_restart & m_backward & m_fullrestart & m_restart_warm & m_reset_iterations & m_calc_type & m_ham_type & m_warmup;
     ar & m_do_diis & m_diis_error & m_start_diis_iter & m_diis_keep_states & m_diis_error_tol & m_num_spatial_orbs;
-    ar & m_spatial_to_spin & m_spin_to_spatial & m_maxM & m_schedule_type_backward & m_schedule_type_default &m_integral_disk_storage_thresh;
+    ar & m_spatial_to_spin & m_spin_to_spatial & m_maxM & m_bra_M & m_schedule_type_backward & m_schedule_type_default &m_integral_disk_storage_thresh;
     ar & n_twodot_noise & m_twodot_noise & m_twodot_gamma & m_guessState & m_useSharedMemory ;
     ar & m_calc_ri_4pdm & m_store_ripdm_readable & m_nevpt2 & m_conventional_nevpt2 & m_kept_nevpt2_states & NevPrint;
   }
@@ -443,6 +446,7 @@ class Input {
   const bool &no_transform() const { return m_no_transform; }
   const int &deflation_min_size() const { return m_deflation_min_size; }
   const bool &direct() const { return m_direct; }
+  const bool &prebuild() const {return m_prebuild; }
   const int &deflation_max_size() const { return m_deflation_max_size; }
   const IrrepSpace &total_symmetry_number() const { return m_total_symmetry_number; }
   const IrrepSpace &bra_symmetry_number() const { return m_bra_symmetry_number; }
@@ -524,6 +528,8 @@ class Input {
   bool &spatpdm_disk_dump() {return m_spatpdm_disk_dump;}
   const bool &pdm_unsorted() const {return m_pdm_unsorted;}
   bool &pdm_unsorted(){return m_pdm_unsorted;}
+  const std::vector<int> &specificpdm() const {return m_specificpdm;}
+  std::vector<int> &specificpdm() {return m_specificpdm;}
   const bool &store_nonredundant_pdm() const { return m_store_nonredundant_pdm;}
   bool &store_nonredundant_pdm() { return m_store_nonredundant_pdm;}
   int slater_size() const {return m_norbs;}
@@ -533,6 +539,7 @@ class Input {
   const bool &npdm_intermediate() const { return m_npdm_intermediate; }
   bool &npdm_multinode() { return m_npdm_multinode; }
   const bool &npdm_multinode() const { return m_npdm_multinode; }
+  int bra_M() const {return m_bra_M;}
 };
 }
 #endif
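
The new members are spliced into Input's existing boost::serialization `ar &` chain, which means archives written before this change will not load with the new code: the stream order must match field for field between save and load. A minimal self-contained model of the pattern (Settings is a toy stand-in for Input):

    #include <sstream>
    #include <boost/archive/text_iarchive.hpp>
    #include <boost/archive/text_oarchive.hpp>

    struct Settings {
      bool prebuild = false;
      int braM = 0;
      template <class Archive>
      void serialize(Archive& ar, const unsigned /*version*/) {
        ar & prebuild & braM;   // order must match between save and load
      }
    };

    int main() {
      std::stringstream ss;
      Settings out; out.prebuild = true; out.braM = 500;
      { boost::archive::text_oarchive oa(ss); oa << out; }
      Settings in;
      { boost::archive::text_iarchive ia(ss); ia >> in; }
      return (in.prebuild && in.braM == 500) ? 0 : 1;
    }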
diff --git a/modules/ResponseTheory/sweepCompress.C b/modules/ResponseTheory/sweepCompress.C
index 20cf4ec..5358953 100644
--- a/modules/ResponseTheory/sweepCompress.C
+++ b/modules/ResponseTheory/sweepCompress.C
@@ -182,7 +182,12 @@ void SpinAdapted::SweepCompress::BlockDecimateAndCompress (SweepParams &sweepPar
   kettracedMatrix.makedensitymatrix(solution, newbig, dmrginp.weights(sweepiter), 0.0, 0.0, true);
   double braerror, keterror;
   if (!mpigetrank()) {
-    keterror = makeRotateMatrix(kettracedMatrix, ketrotateMatrix, newbig.get_rightBlock()->get_ketStateInfo().totalStates, 0);
+    if (dmrginp.hamiltonian() == BCS) {
+      int largeNumber = 1000000;
+      keterror = makeRotateMatrix(kettracedMatrix, ketrotateMatrix, largeNumber, 0);
+    } else {
+      keterror = makeRotateMatrix(kettracedMatrix, ketrotateMatrix, newbig.get_rightBlock()->get_ketStateInfo().totalStates, 0);
+    }
     braerror = makeRotateMatrix(bratracedMatrix, brarotateMatrix, sweepParams.get_keep_states(), sweepParams.get_keep_qstates());
   }
   kettracedMatrix.deallocate();
@@ -484,8 +489,14 @@ void SpinAdapted::SweepCompress::Startup (SweepParams &sweepParams, StackSpinBlo
   double keterror, braerror;
   if (!mpigetrank()) {
-    keterror = makeRotateMatrix(kettracedMatrix, ketrotateMatrix, newbig.get_rightBlock()->get_ketStateInfo().totalStates, 0);
-    braerror = makeRotateMatrix(bratracedMatrix, brarotateMatrix, newbig.get_rightBlock()->get_braStateInfo().totalStates, 0);
+    if (dmrginp.hamiltonian() == BCS) {
+      int largeNumber = 1000000;
+      keterror = makeRotateMatrix(kettracedMatrix, ketrotateMatrix, largeNumber, 0);
+      braerror = makeRotateMatrix(bratracedMatrix, brarotateMatrix, largeNumber, 0);
+    } else {
+      keterror = makeRotateMatrix(kettracedMatrix, ketrotateMatrix, newbig.get_rightBlock()->get_ketStateInfo().totalStates, 0);
+      braerror = makeRotateMatrix(bratracedMatrix, brarotateMatrix, newbig.get_rightBlock()->get_braStateInfo().totalStates, 0);
+    }
   }
 #ifndef SERIAL
   mpi::communicator world;
@@ -636,9 +647,6 @@ void SpinAdapted::SweepCompress::WavefunctionCanonicalize (SweepParams &sweepPar
   //***********
   davidson_f(solution[0], outputState[0]);
-  pout << solution[0]<
   std::vector<Matrix> ketrotateMatrix, brarotateMatrix;
   StackDensityMatrix bratracedMatrix(newSystem.get_braStateInfo()), kettracedMatrix(newSystem.get_ketStateInfo());
   bratracedMatrix.allocate(newSystem.get_braStateInfo());
diff --git a/modules/ResponseTheory/sweepResponse.C b/modules/ResponseTheory/sweepResponse.C
index 1ef9870..6265d84 100644
--- a/modules/ResponseTheory/sweepResponse.C
+++ b/modules/ResponseTheory/sweepResponse.C
@@ -563,29 +563,30 @@ double SpinAdapted::SweepResponse::do_one(SweepParams &sweepParams, const bool &
   system.set_twoInt(activeSpaceIntegral);
-  for (int l=0; l
diff --git a/modules/npdm/npdm.C b/modules/npdm/npdm.C
--- a/modules/npdm/npdm.C
+++ b/modules/npdm/npdm.C
@@ ... @@ void npdm(NpdmOrder npdm_order, bool transitionpdm)
+      npdm_driver->clear();
+      if (dmrginp.specificpdm().size() == 1)
+        npdm_do_one_sweep(*npdm_driver, sweepParams, false, direction, false, 0, dmrginp.specificpdm()[0], dmrginp.specificpdm()[0]);
+      else if (dmrginp.specificpdm().size() == 2)
+        npdm_do_one_sweep(*npdm_driver, sweepParams, false, direction, false, 0, dmrginp.specificpdm()[0], dmrginp.specificpdm()[1]);
+      else
+        abort();
+      cputime = timerX.elapsedcputime();
+      p3out << "\t\t\t NPDM sweep time " << timerX.elapsedwalltime() << " " << cputime << endl;
+
+      if (dmrginp.hamiltonian() == BCS && npdm_order == NPDM_ONEPDM) {
+        Timer timerX1;
+        boost::shared_ptr<Npdm_driver_base> npdm_driver1 = boost::shared_ptr<Npdm_driver_base>( new Pairpdm_driver( dmrginp.last_site() ) );
+        npdm_driver1->clear();
+
+        if (dmrginp.specificpdm().size() == 1)
+          npdm_do_one_sweep(*npdm_driver1, sweepParams, false, direction, false, 0, dmrginp.specificpdm()[0], dmrginp.specificpdm()[0]);
+        else if (dmrginp.specificpdm().size() == 2)
+          npdm_do_one_sweep(*npdm_driver1, sweepParams, false, direction, false, 0, dmrginp.specificpdm()[0], dmrginp.specificpdm()[1]);
+        else
+          abort();
+        cputime = timerX1.elapsedcputime();
+        p3out << "\t\t\t NPDM sweep time " << timerX1.elapsedwalltime() << " " << cputime << endl;
+      }
+      return;
+    }
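
In both sweepCompress.C hunks above, the BCS branch replaces the ket-side cap newbig.get_rightBlock()->get_ketStateInfo().totalStates with largeNumber = 1000000, i.e. an effectively unbounded cap so no ket quanta are discarded during decimation; presumably totalStates is no longer a safe bound once particle-number sectors mix. The choice reduces to (assumed rationale, mirroring the patch's branch):

    inline int ketKeepStates(bool isBCS, int ketTotalStates) {
      const int largeNumber = 1000000;   // effectively "keep everything"
      return isBCS ? largeNumber : ketTotalStates;
    }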
 
    if(dmrginp.transition_diff_irrep()){ // It is used when bra and ket has different spatial irrep
@@ -474,9 +524,9 @@ void npdm(NpdmOrder npdm_order, bool transitionpdm)
      if (dmrginp.hamiltonian() == BCS && npdm_order == NPDM_ONEPDM) {
        Timer timerX1;
-        npdm_driver = boost::shared_ptr<Npdm_driver_base>( new Pairpdm_driver( dmrginp.last_site() ) );
-        npdm_driver->clear();
-        npdm_do_one_sweep(*npdm_driver, sweepParams, false, !direction, false, 0, state,stateB);
+        boost::shared_ptr<Npdm_driver_base> npdm_driver1 = boost::shared_ptr<Npdm_driver_base>( new Pairpdm_driver( dmrginp.last_site() ) );
+        npdm_driver1->clear();
+        npdm_do_one_sweep(*npdm_driver1, sweepParams, false, direction, false, 0, state,stateB);
        cputime = timerX1.elapsedcputime();
        p3out << "\t\t\t NPDM sweep time " << timerX1.elapsedwalltime() << " " << cputime << endl;
      }
@@ -495,9 +545,9 @@ void npdm(NpdmOrder npdm_order, bool transitionpdm)
      p3out << "\t\t\t NPDM sweep time " << timerX.elapsedwalltime() << " " << cputime << endl;
      if (dmrginp.hamiltonian() == BCS && npdm_order == NPDM_ONEPDM) {
        Timer timerX1;
-        npdm_driver = boost::shared_ptr<Npdm_driver_base>( new Pairpdm_driver( dmrginp.last_site() ) );
-        npdm_driver->clear();
-        npdm_do_one_sweep(*npdm_driver, sweepParams, false, !direction, false, 0, state,state);
+        boost::shared_ptr<Npdm_driver_base> npdm_driver1 = boost::shared_ptr<Npdm_driver_base>( new Pairpdm_driver( dmrginp.last_site() ) );
+        npdm_driver1->clear();
+        npdm_do_one_sweep(*npdm_driver1, sweepParams, false, direction, false, 0, state,state);
        cputime = timerX1.elapsedcputime();
        p3out << "\t\t\t NPDM sweep time " << timerX1.elapsedwalltime() << " " << cputime << endl;
      }
@@ -556,6 +606,15 @@ void npdm(NpdmOrder npdm_order, bool transitionpdm)
      npdm_do_one_sweep(*npdm_driver, sweepParams, false, direction, false, 0, state,stateB);
      cputime = timerX.elapsedcputime();
      p3out << "\t\t\t NPDM sweep time " << timerX.elapsedwalltime() << " " << cputime << endl;
+
+      if (dmrginp.hamiltonian() == BCS && npdm_order == NPDM_ONEPDM) {
+        Timer timerX1;
+        boost::shared_ptr<Npdm_driver_base> npdm_driver1 = boost::shared_ptr<Npdm_driver_base>( new Pairpdm_driver( dmrginp.last_site() ) );
+        npdm_driver1->clear();
+        npdm_do_one_sweep(*npdm_driver1, sweepParams, false, direction, false, 0, state, stateB);
+        cputime = timerX1.elapsedcputime();
+        p3out << "\t\t\t NPDM sweep time " << timerX1.elapsedwalltime() << " " << cputime << endl;
+      }
     }
   }
 }
@@ -579,6 +638,12 @@ void npdm(NpdmOrder npdm_order, bool transitionpdm)
      // Do NPDM sweep
      npdm_driver->clear();
      npdm_do_one_sweep(*npdm_driver, sweepParams, false, direction, false, 0, state,state);
+
+      if (dmrginp.hamiltonian() == BCS && npdm_order == NPDM_ONEPDM) {
+        boost::shared_ptr<Npdm_driver_base> npdm_driver1 = boost::shared_ptr<Npdm_driver_base>( new Pairpdm_driver( dmrginp.last_site() ) );
+        npdm_driver1->clear();
+        npdm_do_one_sweep(*npdm_driver1, sweepParams, false, direction, false, 0, state, state);
+      }
     }
   }
 }
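
The saveBlock.C hunks that follow replace the ad-hoc MPI tag optype+i*10+1000*J+100000*I with the mixed-radix encoding ((100*optype+i)*nsites+J)*nsites+I. Under the implied bounds the new formula is collision-free, whereas the old one could alias distinct tuples. A small sketch (makeTag is a hypothetical helper; the patch inlines the expression):

    #include <cassert>

    // Mixed-radix encoding: with 0 <= I,J < nsites and 0 <= i < 100 per
    // optype, distinct (optype, i, J, I) tuples map to distinct tags.
    inline int makeTag(int optype, int i, int J, int I, int nsites) {
      assert(I < nsites && J < nsites && i < 100);
      return ((100 * optype + i) * nsites + J) * nsites + I;
    }

For contrast, the old scheme could collide whenever optype reached 10: (optype=10, i=0) and (optype=0, i=1) both produced tag 10 for I=J=0, so messages from different operator arrays could be matched to the wrong receive.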
diff --git a/saveBlock.C b/saveBlock.C
index c3a3b2c..6396c03 100644
--- a/saveBlock.C
+++ b/saveBlock.C
@@ -337,12 +337,13 @@ void StackSpinBlock::sendcompOps(StackOp_component_base& opcomp, int I, int J, i
 #ifndef SERIAL
   boost::mpi::communicator world;
   std::vector<boost::shared_ptr<StackSparseMatrix> > oparray = opcomp.get_element(I,J);
+  int nsites = dmrginp.last_site();
   for(int i=0; i
 get_data(), oparray[i]->memoryUsed(), MPI_DOUBLE, trimap_2d(I, J, dmrginp.last_site()));
-    MPI_Send(oparray[i]->get_data(), oparray[i]->memoryUsed(), MPI_DOUBLE, processorindex(compsite), optype+i*10+1000*J+100000*I, Calc);
+    MPI_Send(oparray[i]->get_data(), oparray[i]->memoryUsed(), MPI_DOUBLE, processorindex(compsite), ((100*optype+i)*nsites+J)*nsites+I, Calc);
   }
 #endif
 }
@@ -352,8 +353,9 @@ void StackSpinBlock::recvcompOps(StackOp_component_base& opcomp, int I, int J, i
 #ifndef SERIAL
   boost::mpi::communicator world;
   std::vector<boost::shared_ptr<StackSparseMatrix> > oparray = opcomp.get_element(I,J);
+  int nsites = dmrginp.last_site();
   for(int i=0; i
 memoryUsed());
     if (additionalMemory == 0)
@@ -364,7 +366,7 @@ void StackSpinBlock::recvcompOps(StackOp_component_base& opcomp, int I, int J, i
     oparray[i]->allocateOperatorMatrix();
     //now broadcast the data
-    MPI_Recv(oparray[i]->get_data(), oparray[i]->memoryUsed(), MPI_DOUBLE, processorindex(trimap_2d(I, J, dmrginp.last_site())), optype+i*10+1000*J+100000*I, Calc,MPI_STATUS_IGNORE);
+    MPI_Recv(oparray[i]->get_data(), oparray[i]->memoryUsed(), MPI_DOUBLE, processorindex(trimap_2d(I, J, dmrginp.last_site())),((100*optype+i)*nsites+J)*nsites+I, Calc,MPI_STATUS_IGNORE);
   }
 #endif
 }
diff --git a/solver.C b/solver.C
index 0906415..2e47142 100644
--- a/solver.C
+++ b/solver.C
@@ -126,26 +126,25 @@ void SpinAdapted::Solver::solve_wavefunction(vector<StackWavefunction>& solution
   guessWaveTypes guesstype = guesswavetype;
   if (guesswavetype == TRANSPOSE && big.get_leftBlock()->get_rightBlock() == 0)
-   guesstype = BASIC;
+    guesstype = BASIC;
   GuessWave::guess_wavefunctions(solution, e, big, guesstype, onedot, dot_with_sys, nroots, additional_noise, currentRoot);
   dmrginp.guesswf->stop();
 
   for (int istate=0; istate
 dmrginp.blockdavid->start();
   Linear::block_davidson(solution, e, tol, warmUp, *davidson_f, useprecond, currentRoot, lowerStates);
   dmrginp.blockdavid->stop();
-
+  delete davidson_f;
   if (mpigetrank() == 0) {
-   for (int i=solution.size()-1; i>=nroots; i--)
-     solution[i].deallocate();
+    for (int i=solution.size()-1; i>=nroots; i--)
+      solution[i].deallocate();
   }
-  delete davidson_f;
 }
 else if (dmrginp.solve_method() == CONJUGATE_GRADIENT)
 {
@@ -168,18 +167,14 @@ void SpinAdapted::Solver::solve_wavefunction(vector<StackWavefunction>& solution
   solution[0].Clear();
 
   double functional = Linear::MinResMethod(solution[0], tol, *davidson_f, lowerStates);
+  delete davidson_f;
   //double functional = Linear::ConjugateGradient(solution[0], tol, davidson_f, lowerStates);
 
   if (mpigetrank() == 0)
-   e(1) = functional;
-  delete davidson_f;
+    e(1) = functional;
 }
 else {
   pout << "Lanczos is no longer supported"<
diff --git a/guess_wavefunction.C b/guess_wavefunction.C
--- a/guess_wavefunction.C
+++ b/guess_wavefunction.C
@@ ... @@
 rightStateInfo, intermediate1, NO_PARTICLE_SPIN_NUMBER_CONSTRAINT);
   intermediate1.CollectQuanta();
-  TensorProduct(intermediate1, *stateInfo.leftStateInfo->leftStateInfo, intermediate2, PARTICLE_SPIN_NUMBER_CONSTRAINT);
+  if (dmrginp.hamiltonian() == BCS) {
+    TensorProduct(intermediate1, *stateInfo.leftStateInfo->leftStateInfo, intermediate2, SPIN_NUMBER_CONSTRAINT);
+  } else {
+    TensorProduct(intermediate1, *stateInfo.leftStateInfo->leftStateInfo, intermediate2, PARTICLE_SPIN_NUMBER_CONSTRAINT);
+  }
   onedot_transpose_wavefunction(intermediate2, stateInfo, oldWave, trial);
   oldStateInfo.Free();
   oldWave.deallocate();
@@ -198,7 +202,7 @@ void GuessWave::onedot_transpose_wavefunction(const StateInfo& guessstateinfo, c
     const StackWavefunction& guesswf, StackWavefunction& transposewf)
 {
   ObjectMatrix3D< vector<Matrix> > threewave;
-  // first convert into three index wavefunction [s.][e] -> [s].[e]
+  // first convert into three index wavefunction [s.][e] -> [s].[e]
   onedot_twoindex_to_threeindex_wavefunction(guessstateinfo, guesswf, threewave);
 
   ObjectMatrix3D< vector<Matrix> > threewavetranspose(threewave.NDim2(), threewave.NDim1(), threewave.NDim0());
@@ -543,7 +547,7 @@ void GuessWave::onedot_twoindex_to_threeindex_shufflesysdot(const StateInfo& sta
 void GuessWave::transform_previous_wavefunction(StackWavefunction& trial, const StateInfo& stateInfo, const std::vector<int> &leftsites, const std::vector<int> &rightsites, const int state, const bool &onedot, const bool& transpose_guess_wave)
 {
   p2out << "\t\t\t Transforming previous wavefunction " << endl;
-  
+
   ObjectMatrix3D< vector<Matrix> > oldTrialWavefunction;
   ObjectMatrix3D< vector<Matrix> > newTrialWavefunction;
   StateInfo oldStateInfo;
@@ -580,7 +584,7 @@ void GuessWave::transform_previous_wavefunction(StackWavefunction& trial, const StackSpinBlock &big, const int state, const bool &onedot, const bool& transpose_guess_wave)
 {
   p2out << "\t\t\t Transforming previous wavefunction " << endl;
-  
+
   ObjectMatrix3D< vector<Matrix> > oldTrialWavefunction;
   ObjectMatrix3D< vector<Matrix> > newTrialWavefunction;
   StateInfo oldStateInfo;
@@ -597,7 +601,6 @@ void GuessWave::transform_previous_wavefunction(StackWavefunction& trial, const
     oldWave.LoadWavefunctionInfo (oldStateInfo, big.get_leftBlock()->get_sites(), state, true);
     LoadRotationMatrix (big.get_leftBlock()->get_sites(), leftRotationMatrix, state);
   }
-
   for (int q = 0; q < leftRotationMatrix.size (); ++q)
   {
     if (leftRotationMatrix [q].Nrows () > 0)
@@ -615,7 +618,6 @@ void GuessWave::transform_previous_wavefunction(StackWavefunction& trial, const
   tempoldWave.Clear();
 
   TransformLeftBlock(oldWave, big.get_stateInfo(), leftRotationMatrix, tempoldWave);
-
   StateInfo tempoldStateInfo;
   if (dmrginp.hamiltonian() == BCS)
     TensorProduct (*(big.get_stateInfo().leftStateInfo->leftStateInfo), *oldStateInfo.rightStateInfo, tempoldStateInfo,
@@ -640,7 +642,6 @@ void GuessWave::transform_previous_wavefunction(StackWavefunction& trial, const
   tempnewStateInfo.CollectQuanta();
 
   onedot_shufflesysdot(tempoldStateInfo, tempnewStateInfo, tempoldWave, tempnewWave);
-
   LoadRotationMatrix (big.get_rightBlock()->get_sites(), rightRotationMatrix, state);
 
   TransformRightBlock(tempnewWave, oldStateInfo, rightRotationMatrix, trial);
@@ -664,6 +665,7 @@ void GuessWave::transform_previous_wavefunction(StackWavefunction& trial, const
     onedot_transform_wavefunction(oldStateInfo, big.get_stateInfo(), oldWave, leftRotationMatrix, rightRotationMatrix, trial, transpose_guess_wave);
   }
+
   oldStateInfo.Free();
   oldWave.deallocate();
   double norm = DotProduct(trial, trial);
@@ -720,7 +722,11 @@ void GuessWave::transform_previous_twodot_to_onedot_wavefunction(StackWavefuncti
   trial.set_fermion() = false;
   trial.set_onedot(true);
 
-  TensorProduct( *big.get_stateInfo().leftStateInfo->leftStateInfo, *oldStateInfo.rightStateInfo, tempoldStateInfo, PARTICLE_SPIN_NUMBER_CONSTRAINT);
+  if (dmrginp.hamiltonian() == BCS) {
+    TensorProduct( *big.get_stateInfo().leftStateInfo->leftStateInfo, *oldStateInfo.rightStateInfo, tempoldStateInfo, SPIN_NUMBER_CONSTRAINT);
+  } else {
+    TensorProduct( *big.get_stateInfo().leftStateInfo->leftStateInfo, *oldStateInfo.rightStateInfo, tempoldStateInfo, PARTICLE_SPIN_NUMBER_CONSTRAINT);
+  }
   onedot_shufflesysdot( tempoldStateInfo, big.get_stateInfo(), tmpwavefunction, trial);
 
   tmpwavefunction.deallocate();
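
The reindented block in the next hunk implements the right-block rotation spelled out in its comment. In index form (notation mine, not the code's): with rotation matrix R_{c'c} taking old right-block states c to renormalized states c', the wavefunction coefficients transform with the plain transpose,

    t_{a c'} = \sum_{c} d_{a c} \, (R^{\mathsf T})_{c c'} = \sum_{c} d_{a c} \, R_{c' c},

which is what MatrixMultiply(oM, 'n', rM, 'n', tM, parity) computes after rM = rM.t(). The comment insists on the transpose rather than the Hermitian conjugate; for the real matrices used here the two coincide numerically, so the distinction only records the intended transformation law.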
@@ -735,7 +741,6 @@ void GuessWave::transform_previous_wavefunction(StackWavefunction& trial, const
 // ket determines use braSateinfo or ketStateinfo to guess_wavefunctions.
 {
   p2out << "\t\t\t Transforming previous wavefunction " << endl;
-
   ObjectMatrix3D< vector<Matrix> > oldTrialWavefunction;
   ObjectMatrix3D< vector<Matrix> > newTrialWavefunction;
   StateInfo oldStateInfo;
@@ -877,28 +882,27 @@ void GuessWave::onedot_transform_wavefunction(const StateInfo& oldstateinfo, con
     aSz = newstateinfo.leftStateInfo->quanta.size ();
     cSz = newstateinfo.rightStateInfo->quanta.size();
   }
-
   ObjectMatrix<Matrix> tmp(oldASz, cSz); //cSz
   for (int a = 0; a < oldASz; ++a)
     for (int c = 0; c < oldCSz; ++c) // oldCSz <= cSz
     {
       if (oldwavefunction.allowed(a,c))
      {
-       const StackMatrix& oM = oldwavefunction.operator_element(a, c);
-
-       int transC = oldstateinfo.rightStateInfo->newQuantaMap [c];
-
-       Matrix& tM = tmp (a, transC);
-       Matrix rM = rightRotationMatrix[transC];
-       tM.ReSize (oM.Nrows (), rM.Nrows ());
-
-       double parity = 1.;
-
-       SpinAdapted::Clear (tM);
-       // we have the rotation matrices C_r'r, C_l'l, and we are transforming the wavefunction
-       // coefficient d_lr -> consequently, we need to multiply by the transpose, but NOT hermitian
-       // conjugate, of rM
-       rM = rM.t();
-       MatrixMultiply (oM, 'n', rM, 'n', tM, parity); // phi -> c psi : phi d -> psi c c^t d : consequently compute c^t d
+        const StackMatrix& oM = oldwavefunction.operator_element(a, c);
+
+        int transC = oldstateinfo.rightStateInfo->newQuantaMap [c];
+
+        Matrix& tM = tmp (a, transC);
+        Matrix rM = rightRotationMatrix[transC];
+        tM.ReSize (oM.Nrows (), rM.Nrows ());
+
+        double parity = 1.;
+
+        SpinAdapted::Clear (tM);
+        // we have the rotation matrices C_r'r, C_l'l, and we are transforming the wavefunction
+        // coefficient d_lr -> consequently, we need to multiply by the transpose, but NOT hermitian
+        // conjugate, of rM
+        rM = rM.t();
+        MatrixMultiply (oM, 'n', rM, 'n', tM, parity); // phi -> c psi : phi d -> psi c c^t d : consequently compute c^t d
      }
     }
diff --git a/stackopxop.C b/stackopxop.C
index 7fff2c3..b46bd8f 100644
--- a/stackopxop.C
+++ b/stackopxop.C
@@ -38,7 +38,7 @@ void SpinAdapted::stackopxop::cdxcdcomp(const StackSpinBlock* otherblock, std::v
   if (otherblock->has(DES_CRECOMP)) {
     op1 = loopblock->get_op_array(DES_CRE).get_element(i,j).at(opind);
     double parity = 1.0;
-    if (dmrginp.spinAdapted() && dmrginp.hamiltonian() != BCS)
+    if (dmrginp.spinAdapted())
      parity = getCommuteParity(-getSpinQuantum(i), getSpinQuantum(j), op1->get_deltaQuantum()[0]);
     op2 = otherblock->get_op_array(DES_CRECOMP).get_element(i,j).at(opind);
     SpinAdapted::operatorfunctions::TensorProduct(otherblock, *op2, *op1, b, &(b->get_stateInfo()), o[ilock], factor*parity, numthreads);
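
In the stackopxop.C hunk above, the commutation parity is now applied to BCS Hamiltonians as well: dropping the `!= BCS` guard means exchanging the DES_CRE and DES_CRECOMP factors always picks up the fermionic sign. The underlying rule, in a stripped-down scalar form (commuteParity is a hypothetical standalone version; the real getCommuteParity works on SpinQuantum objects):

    // Exchanging two operator blocks that carry n1 and n2 fermions multiplies
    // the matrix element by (-1)^(n1*n2): a minus sign only when both are odd.
    inline double commuteParity(int n1, int n2) {
      return (n1 % 2 != 0 && n2 % 2 != 0) ? -1.0 : 1.0;
    }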
diff --git a/sweep_mps.C b/sweep_mps.C
index 9ccb4dc..81b0fdd 100644
--- a/sweep_mps.C
+++ b/sweep_mps.C
@@ -45,7 +45,7 @@ void SpinAdapted::Sweep::CanonicalizeWavefunction(SweepParams &sweepParams, cons
   sweepParams.set_block_iter() = 0;
 
   std::vector<int> sites;
-  int new_site, wave_site;
+  int new_site;
   if (forward) {
     pout << "\t\t\t Starting sweep "<< sweepParams.set_sweep_iter()<<" in forwards direction"<
 std::vector<int> complementarySites, spindotsites(1, new_site), oldsites = sites, oldcomplement;
@@ -114,34 +112,39 @@ void SpinAdapted::Sweep::CanonicalizeWavefunction(SweepParams &sweepParams, cons
     else
      StateInfo::restore(!forward, complementarySites, envstate, currentstate);
 
-    TensorProduct(newState1, envstate, bigstate, PARTICLE_SPIN_NUMBER_CONSTRAINT);
+    if (dmrginp.hamiltonian() == BCS) {
+      TensorProduct(newState1, envstate, bigstate, SPIN_NUMBER_CONSTRAINT);
+    } else {
+      TensorProduct(newState1, envstate, bigstate, PARTICLE_SPIN_NUMBER_CONSTRAINT);
+    }
 
     w.initialise(dmrginp.effective_molecule_quantum_vec(), newState1, envstate, true);
     w.Clear();
+
     if (sweepParams.get_block_iter() == 0)
      GuessWave::transpose_previous_wavefunction(w, bigstate, complementarySites, spindotsites, currentstate, true, true);
     else
      GuessWave::transform_previous_wavefunction(w, bigstate, oldsites, oldcomplement, currentstate, true, true);
-
     w.SaveWavefunctionInfo(bigstate, sites, currentstate);
-
+
     //make the newstate
     std::vector<Matrix> rotation1;
-
-
+
     StackDensityMatrix tracedMatrix(*bigstate.leftStateInfo);
     tracedMatrix.allocate(*bigstate.leftStateInfo);
-    operatorfunctions::MultiplyWithOwnTranspose (w, tracedMatrix, 1.0);
+    operatorfunctions::MultiplyWithOwnTranspose (w, tracedMatrix, 1.0);
     int largeNumber = 1000000;
     if (!mpigetrank())
      double error = makeRotateMatrix(tracedMatrix, rotation1, largeNumber, sweepParams.get_keep_qstates());
     SaveRotationMatrix (sites, rotation1, currentstate);
+    //SaveRotationMatrix (sites, rotation1, -1);
 
     StateInfo renormState1;
     SpinAdapted::StateInfo::transform_state(rotation1, newState1, renormState1);
     StateInfo::store(forward, sites, renormState1, currentstate);
+    //StateInfo::store(forward, sites, renormState1, -1);
     stateInfo1 = renormState1;
 
     ++sweepParams.set_block_iter();
@@ -236,7 +239,11 @@ void SpinAdapted::Sweep::CanonicalizeWavefunctionPartialSweep(SweepParams &sweep
     else
      StateInfo::restore(!forward, complementarySites, envstate, currentstate);
 
-    TensorProduct(newState1, envstate, bigstate, PARTICLE_SPIN_NUMBER_CONSTRAINT);
+    if (dmrginp.hamiltonian() == BCS) {
+      TensorProduct(newState1, envstate, bigstate, SPIN_NUMBER_CONSTRAINT);
+    } else {
+      TensorProduct(newState1, envstate, bigstate, PARTICLE_SPIN_NUMBER_CONSTRAINT);
+    }
 
     w.initialise(dmrginp.effective_molecule_quantum_vec(), newState1, envstate, true);
     w.Clear();
diff --git a/wrapper.C b/wrapper.C
index 4dc14d2..c6b96b6 100644
--- a/wrapper.C
+++ b/wrapper.C
@@ -245,16 +245,16 @@ void test(char* infile)
   std::vector< std::vector<double> > Overlap(nstates, std::vector<double>(nstates, 0.0));
-  for (int i=1; i