diff --git a/AMDiS/src/parallel/PetscHelper.cc b/AMDiS/src/parallel/PetscHelper.cc
index ad3c35c00a62130111536b2b6c92a67a3e18c301..eaf175b8f2c8a347164a5a4389db92828821acec 100644
--- a/AMDiS/src/parallel/PetscHelper.cc
+++ b/AMDiS/src/parallel/PetscHelper.cc
@@ -11,6 +11,7 @@
 
 
 #include "parallel/PetscHelper.h"
+#include "Global.h"
 
 namespace AMDiS {
 
@@ -70,6 +71,104 @@ namespace AMDiS {
       VecAssemblyBegin(vec);
       VecAssemblyEnd(vec);
     }
+
+
+    void blockMatMatSolve(KSP ksp, Mat mat0, Mat &mat1)
+    {
+      // === We have to calculate mat1 = inv(A) mat0 (A given by ksp):    ===
+      // ===    - get all local column vectors from mat0                  ===
+      // ===    - solve with ksp for this column vector as the rhs vector ===
+      // ===    - set the result to the corresponding column vector of    ===
+      // ===      matrix mat1                                             ===
+      
+      PetscInt localRows, localCols, globalRows, globalCols;
+      MatGetLocalSize(mat0, &localRows, &localCols);
+      MatGetSize(mat0, &globalRows, &globalCols);
+
+      MatCreateAIJ(PETSC_COMM_WORLD,
+		   localRows, localCols, globalRows, globalCols,
+		   150, PETSC_NULL, 150, PETSC_NULL, &mat1);
+      MatSetOption(mat1, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE);
+      
+      // Transform matrix mat0 into a sparse column format.
+      PetscMatCol mat0_cols;
+      getMatLocalColumn(mat0, mat0_cols);
+      
+      Vec tmpVec;
+      VecCreateSeq(PETSC_COMM_SELF, localRows, &tmpVec);
+      
+      // Solve for all local column vectors of mat0 and set the results as
+      // the corresponding columns of mat1.
+      for (PetscMatCol::iterator it = mat0_cols.begin(); 
+	   it != mat0_cols.end(); ++it) {
+	getColumnVec(it->second, tmpVec);
+	KSPSolve(ksp, tmpVec, tmpVec);
+	setMatLocalColumn(mat1, it->first, tmpVec);
+      }
+      
+      VecDestroy(&tmpVec);
+      
+      MatAssemblyBegin(mat1, MAT_FINAL_ASSEMBLY);
+      MatAssemblyEnd(mat1, MAT_FINAL_ASSEMBLY);
+    }
+
+
+    void matNestConvert(Mat matNest, Mat &mat)  
+    {
+      FUNCNAME("matNestConvert()");
+
+      PetscInt nestRows, nestCols;
+      MatNestGetSize(matNest, &nestRows, &nestCols);
+
+      TEST_EXIT(nestRows == 2 && nestCols == 2)
+	("This function is only implemented for 2x2 nested matrices!\n");
+
+      Mat mat00, mat01, mat10, mat11;
+      MatNestGetSubMat(matNest, 0, 0, &mat00);
+      MatNestGetSubMat(matNest, 0, 1, &mat01);
+      MatNestGetSubMat(matNest, 1, 0, &mat10);
+      MatNestGetSubMat(matNest, 1, 1, &mat11);
+
+      int nRankRows0, nOverallRows0;
+      MatGetLocalSize(mat00, &nRankRows0, PETSC_NULL);
+      MatGetSize(mat00, &nOverallRows0, PETSC_NULL);
+
+      int nRankRows1, nOverallRows1;
+      MatGetLocalSize(mat11, &nRankRows1, PETSC_NULL);
+      MatGetSize(mat11, &nOverallRows1, PETSC_NULL);
+
+      int firstRow0;
+      MatGetOwnershipRange(mat00, &firstRow0, PETSC_NULL);
+
+      int firstRow1;
+      MatGetOwnershipRange(mat11, &firstRow1, PETSC_NULL);
+
+      int nRankRows = nRankRows0 + nRankRows1;
+      int nOverallRows = nOverallRows0 + nOverallRows1;
+      int firstRow = firstRow0 + firstRow1;
+
+      MatCreateAIJ(PETSC_COMM_WORLD, 
+		   nRankRows, nRankRows,
+		   nOverallRows, nOverallRows,
+		   10, PETSC_NULL, 10, PETSC_NULL, &mat);
+
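+      // Copy the local rows of the (0,0) block into the non-nested matrix.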
+      for (int i = 0; i < nRankRows0; i++) {
+	PetscInt nCols;
+	const PetscInt *cols;
+	const PetscScalar *vals;
+
+	MatGetRow(mat00, i + firstRow0, &nCols, &cols, &vals);
+
+	for (int j = 0; j < nCols; j++) {
+	  MatSetValue(mat, firstRow + i, cols[j], vals[j], INSERT_VALUES);
+	}
+
+	MatRestoreRow(mat00, i + firstRow0, &nCols, &cols, &vals);
+      }
+
+      MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY);
+      MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY);
+    }
+
   }
-  
+
 }
diff --git a/AMDiS/src/parallel/PetscHelper.h b/AMDiS/src/parallel/PetscHelper.h
index 01edbc1daa046603a98ee2bc626f9b62c09821b4..9362aad873e912491ac7dce4ee25e7e6aaa3c0c0 100644
--- a/AMDiS/src/parallel/PetscHelper.h
+++ b/AMDiS/src/parallel/PetscHelper.h
@@ -69,6 +69,26 @@ namespace AMDiS {
      * \param[out]  vec      Vector representing one column of the matrix.
      */
     void getColumnVec(const SparseCol &matCol, Vec vec);
+
+    /** \brief
+     * Computes the matrix-matrix product C = inv(A) B. Matrices B and C
+     * are distributed matrices. Matrix A is a local matrix on each MPI
+     * task. The overall number of rows of the local matrices A must equal
+     * the number of distributed rows of B. A short usage sketch is given
+     * below the declaration.
+     *
+     * \param[in]   ksp   PETSc solver (KSP) object that represents inv(A)
+     * \param[in]   mat0  matrix B
+     * \param[out]  mat1  resulting matrix C, created inside the function
+     */
+    void blockMatMatSolve(KSP ksp, Mat mat0, Mat &mat1);
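+
+    // Minimal usage sketch for blockMatMatSolve; kspA, matB and matC are
+    // placeholder names and not part of this interface. kspA must be set up
+    // to solve with the local blocks of A, matB is a distributed matrix:
+    //
+    //   Mat matC;
+    //   petsc_helper::blockMatMatSolve(kspA, matB, matC);
+    //   // matC now holds inv(A) B
+    //   MatDestroy(&matC);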
+
+    /** \brief
+     * Converts a 2x2 nested matrix into a non-nested matrix of type MATAIJ.
+     * A short usage sketch is given below the declaration.
+     *
+     * \param[in]  matNest  nested input matrix
+     * \param[out] mat      matrix of type MATAIJ, created inside this function.
+     */
+    void matNestConvert(Mat matNest, Mat &mat);
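+
+    // Minimal usage sketch for matNestConvert; mat00, mat01, mat10, mat11 and
+    // matAij are placeholder names and not part of this interface:
+    //
+    //   Mat nestMat[4] = {mat00, mat01, mat10, mat11};
+    //   Mat matNest, matAij;
+    //   MatCreateNest(PETSC_COMM_WORLD, 2, PETSC_NULL, 2, PETSC_NULL, nestMat, &matNest);
+    //   petsc_helper::matNestConvert(matNest, matAij);
+    //   MatDestroy(&matNest);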
   }
 
 }
diff --git a/AMDiS/src/parallel/PetscSolverFeti.cc b/AMDiS/src/parallel/PetscSolverFeti.cc
index 95bb9eb8e6f81d881a80841d802ef73be837c45e..37c8fed40aa7a8ac666cd593bdb75743bd184859 100644
--- a/AMDiS/src/parallel/PetscSolverFeti.cc
+++ b/AMDiS/src/parallel/PetscSolverFeti.cc
@@ -794,6 +794,7 @@ namespace AMDiS {
 			     (void(*)(void))petscMultMatSchurPrimal);	
       } else {
 	schurPrimalAugmentedData.subSolver = subdomain;
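+	// The iterative Schur primal solver applies this shell matrix to
+	// nested PETSc vectors.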
+	schurPrimalAugmentedData.nestedVec = true;
 
 	localDofMap.createVec(schurPrimalAugmentedData.tmp_vec_b0, nGlobalOverallInterior);
 	localDofMap.createVec(schurPrimalAugmentedData.tmp_vec_b1, nGlobalOverallInterior);
@@ -824,63 +825,16 @@ namespace AMDiS {
 
       double wtime = MPI::Wtime();
 
-      TEST_EXIT(!augmentedLagrange)("Not yet supported!\n");
       TEST_EXIT_DBG(meshLevel == 0)
 	("Does not support for multilevel, check usage of localDofMap.\n");
 
 
-      // === First, calculate matK = inv(A_BB) A_BPi:                      ===
-      // ===    - get all local column vectors from A_BPi                  ===
-      // ===    - solve with A_BB for this column vector as the rhs vector ===
-      // ===    - set the result to the corresponding column vector of     ===
-      // ===      matrix matK                                              ===
+      // === Create explicit matrix representation of the Schur primal system. ===
 
-      int nRowsRankPrimal = primalDofMap.getRankDofs();
-      int nRowsOverallPrimal = primalDofMap.getOverallDofs();
-      int nRowsRankB = localDofMap.getRankDofs();
-
-      // Transform matrix A_BPi into a sparse column format.
-      petsc_helper::PetscMatCol mat_b_primal_cols;
-      petsc_helper::getMatLocalColumn(subdomain->getMatInteriorCoarse(), 
-				      mat_b_primal_cols);
-
-      TEST_EXIT(static_cast<int>(mat_b_primal_cols.size()) == 
-		primalDofMap.getLocalDofs())
-	("Should not happen!\n");
-
-      Vec tmpVec;
-      VecCreateSeq(PETSC_COMM_SELF, nRowsRankB, &tmpVec);
-
-      Mat matK;
-      MatCreateAIJ(mpiCommGlobal,
-		   nRowsRankB, nRowsRankPrimal, 
-		   nGlobalOverallInterior, nRowsOverallPrimal,
-		   150, PETSC_NULL, 150, PETSC_NULL, &matK);
-      MatSetOption(matK, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE);
-
-      // Solve for all column vectors of mat A_BPi and create matrix matK
-      for (petsc_helper::PetscMatCol::iterator it = mat_b_primal_cols.begin();
-	   it != mat_b_primal_cols.end(); ++it) {
-	petsc_helper::getColumnVec(it->second, tmpVec);
-	subdomain->solve(tmpVec, tmpVec);
-	petsc_helper::setMatLocalColumn(matK, it->first, tmpVec);
-      }
-
-      VecDestroy(&tmpVec);
-
-      MatAssemblyBegin(matK, MAT_FINAL_ASSEMBLY);
-      MatAssemblyEnd(matK, MAT_FINAL_ASSEMBLY);
-
-      // Calculate: mat_schur_primal = A_PiPi - A_PiB inv(A_BB) ABPi
-      //                             = A_PiPi - A_PiB matK
-      MatDuplicate(subdomain->getMatCoarse(), MAT_COPY_VALUES, 
-		   &mat_schur_primal);
-      Mat matPrimal;
-      MatMatMult(subdomain->getMatCoarseInterior(), matK, MAT_INITIAL_MATRIX, 
-		 PETSC_DEFAULT, &matPrimal);
-      MatAXPY(mat_schur_primal, -1.0, matPrimal, DIFFERENT_NONZERO_PATTERN);
-      MatDestroy(&matPrimal);
-      MatDestroy(&matK);
+      if (!augmentedLagrange)
+	createMatExplicitSchurPrimal();
+      else
+	createMatExplicitAugmentedSchurPrimal();
 
 
       // === Create KSP solver object and set appropriate solver options. ====
@@ -919,6 +873,160 @@ namespace AMDiS {
   }
 
 
+  void PetscSolverFeti::createMatExplicitSchurPrimal()
+  {
+    FUNCNAME("PetscSolverFeti::createMatExplicitSchurPrimal()");
+
+    int creationMode = 0;
+    Parameters::get("parallel->feti->schur primal creation mode", creationMode);
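+    // Creation mode 0 assembles the Schur primal matrix explicitly from
+    // sparse matrix-matrix products; creation mode 1 wraps its action in a
+    // shell matrix and lets PETSc assemble the explicit operator column by
+    // column via MatComputeExplicitOperator.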
+    if (creationMode == 0) {
+      // matK = inv(A_BB) A_BPi
+      Mat matK;
+      petsc_helper::blockMatMatSolve(subdomain->getSolver(), 
+				     subdomain->getMatInteriorCoarse(),
+				     matK);
+      
+      
+      // mat_schur_primal = A_PiPi - A_PiB inv(A_BB) A_BPi
+      //                  = A_PiPi - A_PiB matK
+      MatMatMult(subdomain->getMatCoarseInterior(), matK, MAT_INITIAL_MATRIX, 
+		 PETSC_DEFAULT, &mat_schur_primal);
+      MatAYPX(mat_schur_primal, -1.0, subdomain->getMatCoarse(), DIFFERENT_NONZERO_PATTERN);
+      
+      MatDestroy(&matK);
+    } else {
+      Mat tmp;
+      
+      schurPrimalData.subSolver = subdomain;
+      
+      localDofMap.createVec(schurPrimalData.tmp_vec_b, nGlobalOverallInterior);
+      primalDofMap.createVec(schurPrimalData.tmp_vec_primal);
+      
+      MatCreateShell(mpiCommGlobal,
+		     primalDofMap.getRankDofs(), 
+		     primalDofMap.getRankDofs(), 
+		     primalDofMap.getOverallDofs(), 
+		     primalDofMap.getOverallDofs(),
+		     &schurPrimalData, 
+		     &tmp);
+      MatShellSetOperation(tmp, MATOP_MULT, 
+			   (void(*)(void))petscMultMatSchurPrimal);
+      MatComputeExplicitOperator(tmp, &mat_schur_primal);
+
+      MatDestroy(&tmp);
+      schurPrimalData.subSolver = NULL;
+      VecDestroy(&schurPrimalData.tmp_vec_b);
+      VecDestroy(&schurPrimalData.tmp_vec_primal);
+    }
+  }
+  
+  
+  void PetscSolverFeti::createMatExplicitAugmentedSchurPrimal()
+  {
+    FUNCNAME("PetscSolverFeti::createMatExplicitAugmentedSchurPrimal()");
+
+    int creationMode = 1;
+    Parameters::get("parallel->feti->schur primal creation mode", creationMode);
+    if (creationMode == 0) {
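+      // The matrices mat00, mat01, mat10 and mat11 assembled below form the
+      // 2x2 block system (with J = mat_lagrange, Q = mat_augmented_lagrange)
+      //
+      //   [ A_PiPi - A_PiB inv(A_BB) A_BPi     -A_PiB inv(A_BB) trans(J) trans(Q) ]
+      //   [ -Q J inv(A_BB) A_BPi               -Q J inv(A_BB) trans(J) trans(Q)   ]
+      //
+      // which is finally converted into the non-nested matrix mat_schur_primal.
+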
+      // qj = -Q J
+      Mat qj;
+      MatMatMult(mat_augmented_lagrange, mat_lagrange, MAT_INITIAL_MATRIX,
+		 PETSC_DEFAULT, &qj);
+      MatScale(qj, -1.0);
+      
+      
+      // matTmp = inv(A_BB) A_BPi
+      Mat matTmp;
+      petsc_helper::blockMatMatSolve(subdomain->getSolver(), 
+				     subdomain->getMatInteriorCoarse(),
+				     matTmp);
+      
+      
+      // mat00 = A_PiPi - A_PiB inv(A_BB) A_BPi
+      //       = A_PiPi - A_PiB matTmp
+      Mat mat00;
+      MatMatMult(subdomain->getMatCoarseInterior(), matTmp, MAT_INITIAL_MATRIX, 
+		 PETSC_DEFAULT, &mat00);
+      MatAYPX(mat00, -1.0, subdomain->getMatCoarse(), DIFFERENT_NONZERO_PATTERN);
+      
+      
+      // mat10 = -Q J inv(A_BB) A_BPi
+      //       = qj matTmp
+      Mat mat10;
+      MatMatMult(qj, matTmp, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &mat10); 
+      
+      
+      // matTmp = inv(A_BB) trans(J) trans(Q)
+      Mat qT, jTqT;
+      MatTranspose(mat_augmented_lagrange, MAT_INITIAL_MATRIX, &qT);
+      MatTransposeMatMult(mat_lagrange, qT, MAT_INITIAL_MATRIX, PETSC_DEFAULT, 
+			  &jTqT);
+      petsc_helper::blockMatMatSolve(subdomain->getSolver(), jTqT, matTmp);
+      MatDestroy(&qT);
+      MatDestroy(&jTqT);
+      
+      // mat01 = -A_PiB inv(A_BB) trans(J) trans(Q)
+      //       = -A_PiB matTmp
+      Mat mat01;
+      MatMatMult(subdomain->getMatCoarseInterior(), matTmp, MAT_INITIAL_MATRIX, 
+		 PETSC_DEFAULT, &mat01);
+      MatScale(mat01, -1.0);
+      
+      // mat11 = -Q J inv(A_BB) trans(J) trans(Q)
+      //       = qj matTmp
+      Mat mat11;
+      MatMatMult(qj, matTmp, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &mat11);
+      
+      MatDestroy(&matTmp);
+      MatDestroy(&qj);
+      
+      Mat nestMat[4] = {mat00, mat01, mat10, mat11};
+      MatCreateNest(PETSC_COMM_WORLD, 2, PETSC_NULL, 2, PETSC_NULL, nestMat, &matTmp);
+      petsc_helper::matNestConvert(matTmp, mat_schur_primal);
+      
+      MatDestroy(&mat00);
+      MatDestroy(&mat01);
+      MatDestroy(&mat10);
+      MatDestroy(&mat11);
+      MatDestroy(&matTmp);
+    } else {
+      Mat tmp;
+
+      schurPrimalAugmentedData.subSolver = subdomain;
+      schurPrimalAugmentedData.nestedVec = false;
+
+      localDofMap.createVec(schurPrimalAugmentedData.tmp_vec_b0, nGlobalOverallInterior);
+      localDofMap.createVec(schurPrimalAugmentedData.tmp_vec_b1, nGlobalOverallInterior);
+      primalDofMap.createVec(schurPrimalAugmentedData.tmp_vec_primal);
+      lagrangeMap.createVec(schurPrimalAugmentedData.tmp_vec_lagrange);
+
+      schurPrimalAugmentedData.mat_lagrange = &mat_lagrange;
+      schurPrimalAugmentedData.mat_augmented_lagrange = &mat_augmented_lagrange;
+      
+      MatCreateShell(mpiCommGlobal,
+		     primalDofMap.getRankDofs() + nRankEdges, 
+		     primalDofMap.getRankDofs() + nRankEdges, 
+		     primalDofMap.getOverallDofs() + nOverallEdges, 
+		     primalDofMap.getOverallDofs() + nOverallEdges,
+		     &schurPrimalAugmentedData, 
+		     &tmp);
+      MatShellSetOperation(tmp, MATOP_MULT, 
+			   (void(*)(void))petscMultMatSchurPrimalAugmented);
+
+      MatComputeExplicitOperator(tmp, &mat_schur_primal);
+
+      MatDestroy(&tmp);
+      schurPrimalAugmentedData.subSolver = NULL;
+      schurPrimalAugmentedData.mat_lagrange = NULL;
+      schurPrimalAugmentedData.mat_augmented_lagrange = NULL;
+      VecDestroy(&schurPrimalAugmentedData.tmp_vec_b0);
+      VecDestroy(&schurPrimalAugmentedData.tmp_vec_b1);
+      VecDestroy(&schurPrimalAugmentedData.tmp_vec_primal);
+      VecDestroy(&schurPrimalAugmentedData.tmp_vec_lagrange);
+    }
+  }
+
+
   void PetscSolverFeti::destroySchurPrimalKsp()
   {
     FUNCNAME("PetscSolverFeti::destroySchurPrimal()");
diff --git a/AMDiS/src/parallel/PetscSolverFeti.h b/AMDiS/src/parallel/PetscSolverFeti.h
index d30f5d5d41445950985a1b952f51c139d0535f28..0d73abb4d0ee10c10f3830d53140229db2765b72 100644
--- a/AMDiS/src/parallel/PetscSolverFeti.h
+++ b/AMDiS/src/parallel/PetscSolverFeti.h
@@ -140,6 +140,12 @@ namespace AMDiS {
     /// system on the primal variables, \ref ksp_schur_primal
     void createSchurPrimalKsp(vector<const FiniteElemSpace*> &feSpaces);
 
+    /// Creates an explicit matrix representation of the Schur primal system
+    /// and stores it in \ref mat_schur_primal.
+    void createMatExplicitSchurPrimal();
+    
+    /// Creates an explicit matrix representation of the augmented Schur
+    /// primal system and stores it in \ref mat_schur_primal.
+    void createMatExplicitAugmentedSchurPrimal();
+
     /// Destroys PETSc KSP solver object \ref ksp_schur_primal
     void destroySchurPrimalKsp();
 
diff --git a/AMDiS/src/parallel/PetscSolverFetiOperators.cc b/AMDiS/src/parallel/PetscSolverFetiOperators.cc
index 88275a08577c5cd3e1327730b9e4e5242087c162..cd9c94c38a204d6490149ad095f30b4a3ae6af33 100644
--- a/AMDiS/src/parallel/PetscSolverFetiOperators.cc
+++ b/AMDiS/src/parallel/PetscSolverFetiOperators.cc
@@ -41,11 +41,50 @@ namespace AMDiS {
     SchurPrimalAugmentedData* data = 
       static_cast<SchurPrimalAugmentedData*>(ctx);
 
-    Vec x_primal, x_mu, y_primal, y_mu;    
-    VecNestGetSubVec(x, 0, &x_primal);
-    VecNestGetSubVec(x, 1, &x_mu);
-    VecNestGetSubVec(y, 0, &y_primal);
-    VecNestGetSubVec(y, 1, &y_mu);
+    Vec x_primal, x_mu, y_primal, y_mu;
+    
+    if (data->nestedVec) {
+      VecNestGetSubVec(x, 0, &x_primal);
+      VecNestGetSubVec(x, 1, &x_mu);
+      VecNestGetSubVec(y, 0, &y_primal);
+      VecNestGetSubVec(y, 1, &y_mu);
+    } else {
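+      // Here x and y are plain (non-nested) vectors, as passed, e.g., by
+      // MatComputeExplicitOperator: on each rank the first primalLocalSize
+      // local entries belong to the primal part, the remaining entries to
+      // the mu part. Split x accordingly and create matching result vectors.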
+      VecDuplicate(data->tmp_vec_primal, &x_primal);
+      VecDuplicate(data->tmp_vec_primal, &y_primal);
+
+      PetscInt allLocalSize, allSize;
+      VecGetLocalSize(x, &allLocalSize);
+      VecGetSize(x, &allSize);
+
+      PetscInt primalLocalSize, primalSize;
+      VecGetLocalSize(x_primal, &primalLocalSize);
+      VecGetSize(x_primal, &primalSize);
+
+      TEST_EXIT_DBG(allSize > primalSize)("Should not happen!\n");
+      TEST_EXIT_DBG(allLocalSize >= primalLocalSize)("Should not happen!\n");
+
+      PetscInt muLocalSize = allLocalSize - primalLocalSize;
+      PetscInt muSize = allSize - primalSize;
+
+      VecCreateMPI(PETSC_COMM_WORLD, muLocalSize, muSize, &x_mu);
+      VecCreateMPI(PETSC_COMM_WORLD, muLocalSize, muSize, &y_mu);
+
+      PetscScalar *allValue;
+      PetscScalar *primalValue;
+      PetscScalar *muValue;
+      VecGetArray(x, &allValue);
+      VecGetArray(x_primal, &primalValue);
+      VecGetArray(x_mu, &muValue);
+
+      for (int i = 0; i < primalLocalSize; i++)
+	primalValue[i] = allValue[i];
+      for (int i = 0; i < muLocalSize; i++)
+	muValue[i] = allValue[primalLocalSize + i];
+
+      VecRestoreArray(x, &allValue);
+      VecRestoreArray(x_primal, &primalValue);
+      VecRestoreArray(x_mu, &muValue);
+    }
 
     // inv(K_BB) K_BPi x_Pi
     MatMult(data->subSolver->getMatInteriorCoarse(), x_primal, data->tmp_vec_b0);
@@ -74,6 +113,36 @@ namespace AMDiS {
     MatMult(*(data->mat_augmented_lagrange), data->tmp_vec_lagrange, y_mu);
     VecScale(y_mu, -1.0);
 
+    if (!data->nestedVec) {
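+      // Copy the primal and mu parts of the result back into the plain
+      // output vector y and destroy the temporary vectors.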
+      PetscInt allLocalSize;
+      VecGetLocalSize(x, &allLocalSize);
+      PetscInt primalLocalSize;
+      VecGetLocalSize(x_primal, &primalLocalSize);
+      PetscInt muLocalSize = allLocalSize - primalLocalSize;
+
+      PetscScalar *allValue;
+      PetscScalar *primalValue;
+      PetscScalar *muValue;
+      VecGetArray(y, &allValue);
+      VecGetArray(y_primal, &primalValue);
+      VecGetArray(y_mu, &muValue);
+
+      for (int i = 0; i < primalLocalSize; i++)
+	allValue[i] = primalValue[i];
+
+      for (int i = 0; i < muLocalSize; i++)
+	allValue[primalLocalSize + i] = muValue[i];
+
+      VecRestoreArray(y, &allValue);
+      VecRestoreArray(y_primal, &primalValue);
+      VecRestoreArray(y_mu, &muValue);
+
+      VecDestroy(&x_primal);
+      VecDestroy(&y_primal);
+      VecDestroy(&x_mu);
+      VecDestroy(&y_mu);
+    }
+
     return 0;
   }
 
diff --git a/AMDiS/src/parallel/PetscSolverFetiStructs.h b/AMDiS/src/parallel/PetscSolverFetiStructs.h
index 11eafe4e5a219c2deabb6650b1760538385ca938..8fa97340ebbf310d156b691c365cbba3e1659183 100644
--- a/AMDiS/src/parallel/PetscSolverFetiStructs.h
+++ b/AMDiS/src/parallel/PetscSolverFetiStructs.h
@@ -63,6 +63,8 @@ namespace AMDiS {
     Mat *mat_augmented_lagrange;
 
     PetscSolver* subSolver;
+
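+    /// If true, the shell matrix operates on nested PETSc vectors (VecNest);
+    /// otherwise the primal and mu parts are packed into one plain vector,
+    /// as required, e.g., by MatComputeExplicitOperator.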
+    bool nestedVec;
   };