From ef52255eebd62f941135f5ed9c6b5ac8a182c02f Mon Sep 17 00:00:00 2001
From: Thomas Witkowski <thomas.witkowski@gmx.de>
Date: Wed, 1 Oct 2008 09:12:34 +0000
Subject: [PATCH] * Bugfixes for muli-threaded runs. Valgrind and Intel thread
 checker run now with 0 Errors!

---
 AMDiS/src/BasisFunction.cc             | 6 +++---
 AMDiS/src/BoundaryManager.cc           | 2 +-
 AMDiS/src/DOFVector.hh                 | 8 ++++----
 AMDiS/src/DirichletBC.cc               | 4 ++--
 AMDiS/src/OpenMP.h                     | 4 ++++
 AMDiS/src/Operator.cc                  | 2 +-
 AMDiS/src/Operator.hh                  | 6 +++---
 AMDiS/src/ProblemVec.cc                | 8 ++++++++
 AMDiS/src/ResidualParallelEstimator.cc | 4 ++--
 AMDiS/src/SecondOrderAssembler.cc      | 6 +++---
 AMDiS/src/SubAssembler.cc              | 2 +-
 AMDiS/src/TraverseParallel.cc          | 2 +-
 AMDiS/src/ZeroOrderAssembler.cc        | 6 +++---
 13 files changed, 36 insertions(+), 24 deletions(-)

diff --git a/AMDiS/src/BasisFunction.cc b/AMDiS/src/BasisFunction.cc
index f64a0478..b71b642c 100644
--- a/AMDiS/src/BasisFunction.cc
+++ b/AMDiS/src/BasisFunction.cc
@@ -25,10 +25,10 @@ namespace AMDiS {
     nDOF = NEW DimVec<int>(dim, DEFAULT_VALUE, -1);
     dow = Global::getGeo(WORLD);
 
-    grdTmpVec1.resize(omp_get_max_threads());
-    grdTmpVec2.resize(omp_get_max_threads());
+    grdTmpVec1.resize(omp_get_num_procs());
+    grdTmpVec2.resize(omp_get_num_procs());
 
-    for (int i = 0; i < omp_get_max_threads(); i++) {
+    for (int i = 0; i < omp_get_num_procs(); i++) {
       grdTmpVec1[i] = NEW DimVec<double>(dim, DEFAULT_VALUE, 0.0);
       grdTmpVec2[i] = NEW DimVec<double>(dim, DEFAULT_VALUE, 0.0);
     }
diff --git a/AMDiS/src/BoundaryManager.cc b/AMDiS/src/BoundaryManager.cc
index 8ae905e8..416c08de 100644
--- a/AMDiS/src/BoundaryManager.cc
+++ b/AMDiS/src/BoundaryManager.cc
@@ -11,7 +11,7 @@ namespace AMDiS {
 
   BoundaryManager::BoundaryManager(const FiniteElemSpace *feSpace)
   {
-    localBounds.resize(omp_get_max_threads());
+    localBounds.resize(omp_get_num_procs());
     allocatedMemoryLocalBounds = feSpace->getBasisFcts()->getNumber();
     for (int i = 0; i < static_cast<int>(localBounds.size()); i++) {
       localBounds[i] = GET_MEMORY(BoundaryType, allocatedMemoryLocalBounds);
diff --git a/AMDiS/src/DOFVector.hh b/AMDiS/src/DOFVector.hh
index a8b4aa2c..f408e1e1 100644
--- a/AMDiS/src/DOFVector.hh
+++ b/AMDiS/src/DOFVector.hh
@@ -31,11 +31,11 @@ namespace AMDiS {
     nBasFcts = feSpace->getBasisFcts()->getNumber();
     int dim = feSpace->getMesh()->getDim();
 
-    localIndices.resize(omp_get_max_threads());
-    grdPhis.resize(omp_get_max_threads());
-    D2Phis.resize(omp_get_max_threads());
+    localIndices.resize(omp_get_num_procs());
+    grdPhis.resize(omp_get_num_procs());
+    D2Phis.resize(omp_get_num_procs());
 
-    for (int i = 0; i < omp_get_max_threads(); i++) {
+    for (int i = 0; i < omp_get_num_procs(); i++) {
       localIndices[i] = GET_MEMORY(DegreeOfFreedom, this->nBasFcts);      
       grdPhis[i] = NEW DimVec<double>(dim, DEFAULT_VALUE, 0.0);
       D2Phis[i] = NEW DimMat<double>(dim, NO_INIT);
diff --git a/AMDiS/src/DirichletBC.cc b/AMDiS/src/DirichletBC.cc
index 0cbd6ff4..cd67b70d 100644
--- a/AMDiS/src/DirichletBC.cc
+++ b/AMDiS/src/DirichletBC.cc
@@ -14,7 +14,7 @@ namespace AMDiS {
       f(fct), 
       dofVec(NULL)
   {
-    worldCoords.resize(omp_get_max_threads());
+    worldCoords.resize(omp_get_num_procs());
   };
 
   DirichletBC::DirichletBC(BoundaryType type,
@@ -23,7 +23,7 @@ namespace AMDiS {
       f(NULL), 
       dofVec(vec)
   {
-    worldCoords.resize(omp_get_max_threads());
+    worldCoords.resize(omp_get_num_procs());
   }
 
   void DirichletBC::fillBoundaryCondition(DOFMatrix* matrix,
diff --git a/AMDiS/src/OpenMP.h b/AMDiS/src/OpenMP.h
index 63cd1473..37d351d2 100644
--- a/AMDiS/src/OpenMP.h
+++ b/AMDiS/src/OpenMP.h
@@ -15,6 +15,10 @@ inline int omp_get_max_threads() {
   return 1;
 }
 
+inline int omp_get_num_procs() {
+  return 1;
+}
+
 inline int omp_get_num_threads() {
   return 1;
 }
diff --git a/AMDiS/src/Operator.cc b/AMDiS/src/Operator.cc
index 5141e7eb..cedec770 100644
--- a/AMDiS/src/Operator.cc
+++ b/AMDiS/src/Operator.cc
@@ -180,7 +180,7 @@ namespace AMDiS {
       uhOld(NULL),
       optimized(true)
   {
-    int maxThreads = omp_get_max_threads();
+    int maxThreads = omp_get_num_procs();
 
     assembler.resize(maxThreads);
     secondOrder.resize(maxThreads);
diff --git a/AMDiS/src/Operator.hh b/AMDiS/src/Operator.hh
index 7e9c5b11..d0508f8c 100644
--- a/AMDiS/src/Operator.hh
+++ b/AMDiS/src/Operator.hh
@@ -6,7 +6,7 @@ namespace AMDiS {
     secondOrder[0].push_back(term);
     term->operat = this;
 
-    for (int i = 1; i < omp_get_max_threads(); i++) {
+    for (int i = 1; i < omp_get_num_procs(); i++) {
       T *newTerm = NEW T(static_cast<const T>(*term));
       secondOrder[i].push_back(newTerm);
     }
@@ -23,7 +23,7 @@ namespace AMDiS {
     }
     term->operat = this;
 
-    for (int i = 1; i < omp_get_max_threads(); i++) {
+    for (int i = 1; i < omp_get_num_procs(); i++) {
       T *newTerm = NEW T(static_cast<const T>(*term));
       if (type == GRD_PSI) {
 	firstOrderGrdPsi[i].push_back(newTerm);
@@ -39,7 +39,7 @@ namespace AMDiS {
     zeroOrder[0].push_back(term);
     term->operat = this;
 
-    for (int i = 1; i < omp_get_max_threads(); i++) {
+    for (int i = 1; i < omp_get_num_procs(); i++) {
       T *newTerm = NEW T(static_cast<const T>(*term));
       zeroOrder[i].push_back(newTerm);
     }
diff --git a/AMDiS/src/ProblemVec.cc b/AMDiS/src/ProblemVec.cc
index 76d4b385..532c6590 100644
--- a/AMDiS/src/ProblemVec.cc
+++ b/AMDiS/src/ProblemVec.cc
@@ -943,6 +943,14 @@ namespace AMDiS {
 	tmpVector->set(0.0);
       }
 
+      // After creating privat copies of the DOFMatrix and the DOFVector, all threads
+      // have to wait at this barrier. Especially for small problems this is required,
+      // because otherwise one thread may be finished with assembling, before another
+      // has made his private copy.
+#ifdef _OPENMP
+#pragma omp barrier
+#endif
+
       // Because we are using the parallel traverse stack, each thread will
       // traverse only a part of the mesh.
       ElInfo *elInfo = stack.traverseFirst(mesh, -1, assembleFlag);
diff --git a/AMDiS/src/ResidualParallelEstimator.cc b/AMDiS/src/ResidualParallelEstimator.cc
index a6b285d8..5176a789 100644
--- a/AMDiS/src/ResidualParallelEstimator.cc
+++ b/AMDiS/src/ResidualParallelEstimator.cc
@@ -14,9 +14,9 @@ namespace AMDiS {
     GET_PARAMETER(0, name + "->C3", "%f", &C3);
     C3 = C3 > 1.e-25 ? sqr(C3) : 0.0;
 
-    seqEstimators_.resize(omp_get_max_threads());
+    seqEstimators_.resize(omp_get_num_procs());
 
-    for (int i = 0; i < omp_get_max_threads(); i++) {
+    for (int i = 0; i < omp_get_num_procs(); i++) {
       seqEstimators_[i] = NEW ResidualEstimator(name, r);
     }
   }
diff --git a/AMDiS/src/SecondOrderAssembler.cc b/AMDiS/src/SecondOrderAssembler.cc
index bd1dc76e..181ef0ca 100644
--- a/AMDiS/src/SecondOrderAssembler.cc
+++ b/AMDiS/src/SecondOrderAssembler.cc
@@ -88,8 +88,8 @@ namespace AMDiS {
     q11 = Q11PsiPhi::provideQ11PsiPhi(owner->getRowFESpace()->getBasisFcts(), 
 				      owner->getColFESpace()->getBasisFcts(), 
 				      quadrature);
-    tmpLALt.resize(omp_get_max_threads());
-    for (int i = 0; i < omp_get_max_threads(); i++) {
+    tmpLALt.resize(omp_get_num_procs());
+    for (int i = 0; i < omp_get_num_procs(); i++) {
       tmpLALt[i] = NEW DimMat<double>*;
       *(tmpLALt[i]) = NEW DimMat<double>(dim, NO_INIT);
     }
@@ -163,7 +163,7 @@ namespace AMDiS {
   Quad2::Quad2(Operator *op, Assembler *assembler, Quadrature *quad) 
     : SecondOrderAssembler(op, assembler, quad, true)
   {
-    tmpLALt.resize(omp_get_max_threads());
+    tmpLALt.resize(omp_get_num_procs());
   }
 
   Quad2::~Quad2()
diff --git a/AMDiS/src/SubAssembler.cc b/AMDiS/src/SubAssembler.cc
index 0414f82c..3f8e7fd7 100644
--- a/AMDiS/src/SubAssembler.cc
+++ b/AMDiS/src/SubAssembler.cc
@@ -34,7 +34,7 @@ namespace AMDiS {
     nRow = psi->getNumber();
     nCol = phi->getNumber();
 
-    int maxThreads = omp_get_max_threads();
+    int maxThreads = omp_get_num_procs();
     terms.resize(maxThreads);
 
     switch (order) {
diff --git a/AMDiS/src/TraverseParallel.cc b/AMDiS/src/TraverseParallel.cc
index bd770b46..60a627da 100644
--- a/AMDiS/src/TraverseParallel.cc
+++ b/AMDiS/src/TraverseParallel.cc
@@ -9,7 +9,7 @@ namespace AMDiS {
   TraverseParallelStack::TraverseParallelStack(int nThreads)
   {
     if (nThreads == 0) {
-      nThreads_ = omp_get_max_threads();
+      nThreads_ = omp_get_num_procs();
     } else {
       nThreads_ = nThreads;
     }
diff --git a/AMDiS/src/ZeroOrderAssembler.cc b/AMDiS/src/ZeroOrderAssembler.cc
index d9df629e..5e2b1535 100644
--- a/AMDiS/src/ZeroOrderAssembler.cc
+++ b/AMDiS/src/ZeroOrderAssembler.cc
@@ -39,7 +39,7 @@ namespace AMDiS {
       optimized ? &optimizedSubAssemblers : &standardSubAssemblers;
 
     int myRank = omp_get_thread_num();
-    std::vector<OperatorTerm*> opTerms  = op->zeroOrder[myRank];
+    std::vector<OperatorTerm*> opTerms = op->zeroOrder[myRank];
 
     sort(opTerms.begin(), opTerms.end());
 
@@ -227,12 +227,12 @@ namespace AMDiS {
   FastQuadZOA::FastQuadZOA(Operator *op, Assembler *assembler, Quadrature *quad)
     : ZeroOrderAssembler(op, assembler, quad, true)
   {
-    cPtrs.resize(omp_get_max_threads());
+    cPtrs.resize(omp_get_num_procs());
   }
 
   FastQuadZOA::~FastQuadZOA()
   {
-    for (int i = 0; i < omp_get_max_threads(); i++) {
+    for (int i = 0; i < omp_get_num_procs(); i++) {
       FREE_MEMORY(cPtrs[i], double, quadrature->getNumPoints());
     }
   }
-- 
GitLab