Skip to content

Commit

Permalink
Intrepid2: Support Full Exact Sequence in Structured Integration (tri…
Browse files Browse the repository at this point in the history
…linos#10419)

This PR extends previous work in "Structured Integration", a term that generalizes sum factorization algorithms for finite element assembly. The main thing that was missing for full support of the exact sequence was pullbacks in FunctionSpaceTools for spaces other than H(grad). This PR adds those, as well as implementations of some sample formulations -- corresponding to the natural norms in each function space -- in both unit tests and performance tests. There are some changes to the new data structures as well as to the new integration kernels that were required for this full support.

StructuredIntegrationTests has gotten an upgrade; for the assembly comparison tests (of which there are now five distinct formulations assembled: Poisson plus the natural norms for each space), we now use test templates. This simplifies declarations, makes it easier to see what cases are covered, and eliminates a fair amount of code redundancy.
  • Loading branch information
CamelliaDPG authored Apr 23, 2022
1 parent 909f797 commit 13b95f6
Show file tree
Hide file tree
Showing 36 changed files with 4,497 additions and 1,481 deletions.
151 changes: 151 additions & 0 deletions packages/intrepid2/assembly-examples/GRADGRADStandardAssembly.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
//
// GRADGRADStandardAssembly.hpp
// Trilinos
//
// Created by Roberts, Nathan V on 7/2/21.
//

#ifndef Intrepid2_GRADGRADStandardAssembly_hpp
#define Intrepid2_GRADGRADStandardAssembly_hpp

#include "JacobianFlopEstimate.hpp"

/** \file GRADGRADStandardAssembly.hpp
\brief Locally assembles a Poisson matrix -- an array of shape (C,F,F), with formulation (grad e_i, grad e_j), using standard Intrepid2 methods; these do not algorithmically exploit geometric structure.
*/

//! Version that uses the classic, generic Intrepid2 paths.
template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadratureGRADGRAD(Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry,
const int &polyOrder, int worksetSize,
double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
{
using ExecutionSpace = typename DeviceType::execution_space;
int numVertices = 1;
for (int d=0; d<spaceDim; d++)
{
numVertices *= 2;
}

auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()");
auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup");
initialSetupTimer->start();

using CellTools = Intrepid2::CellTools<DeviceType>;
using FunctionSpaceTools = Intrepid2::FunctionSpaceTools<DeviceType>;

using namespace Intrepid2;

using namespace std;
// dimensions of the returned view are (C,F,F)
auto fs = FUNCTION_SPACE_HGRAD;

shards::CellTopology cellTopo = geometry.cellTopology();

auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder);

int numFields = basis->getCardinality();
int numCells = geometry.numCells();

if (worksetSize > numCells) worksetSize = numCells;

// local stiffness matrices:
ScalarView<Scalar,DeviceType> cellStiffness("cell stiffness matrices",numCells,numFields,numFields);

auto cubature = DefaultCubatureFactory::create<DeviceType>(cellTopo,polyOrder*2);
int numPoints = cubature->getNumPoints();
ScalarView<PointScalar,DeviceType> cubaturePoints("cubature points",numPoints,spaceDim);
ScalarView<double,DeviceType> cubatureWeights("cubature weights", numPoints);

cubature->getCubature(cubaturePoints, cubatureWeights);

const double flopsPerJacobianPerCell = flopsPerJacobian(spaceDim, numPoints, numVertices);
const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints);
const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints);

// Allocate some intermediate containers
ScalarView<Scalar,DeviceType> basisValues ("basis values", numFields, numPoints );
ScalarView<Scalar,DeviceType> basisGradValues("basis grad values", numFields, numPoints, spaceDim);

ScalarView<Scalar,DeviceType> transformedGradValues("transformed grad values", worksetSize, numFields, numPoints, spaceDim);
ScalarView<Scalar,DeviceType> transformedWeightedGradValues("transformed weighted grad values", worksetSize, numFields, numPoints, spaceDim);

basis->getValues(basisValues, cubaturePoints, OPERATOR_VALUE );
basis->getValues(basisGradValues, cubaturePoints, OPERATOR_GRAD );

const int numNodesPerCell = geometry.numNodesPerCell();
ScalarView<PointScalar,DeviceType> expandedCellNodes("expanded cell nodes",numCells,numNodesPerCell,spaceDim);
Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace>(0,numCells),
KOKKOS_LAMBDA (const int &cellOrdinal) {
for (int nodeOrdinal=0; nodeOrdinal<numNodesPerCell; nodeOrdinal++)
{
for (int d=0; d<spaceDim; d++)
{
expandedCellNodes(cellOrdinal,nodeOrdinal,d) = geometry(cellOrdinal,nodeOrdinal,d);
}
}
});

ScalarView<Scalar,DeviceType> cellMeasures("cell measures", worksetSize, numPoints);
ScalarView<Scalar,DeviceType> jacobianDeterminant("jacobian determinant", worksetSize, numPoints);
ScalarView<Scalar,DeviceType> jacobian("jacobian", worksetSize, numPoints, spaceDim, spaceDim);
ScalarView<Scalar,DeviceType> jacobianInverse("jacobian inverse", worksetSize, numPoints, spaceDim, spaceDim);

initialSetupTimer->stop();

transformIntegrateFlopCount = 0;
jacobianCellMeasureFlopCount = numCells * flopsPerJacobianPerCell; // jacobian itself
jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse
jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant
jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P)

int cellOffset = 0;
while (cellOffset < numCells)
{
int startCell = cellOffset;
int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell;

std::pair<int,int> cellRange = {startCell, startCell+numCellsInWorkset};
auto cellWorkset = Kokkos::subview(expandedCellNodes, cellRange, Kokkos::ALL(), Kokkos::ALL());

if (numCellsInWorkset != worksetSize)
{
Kokkos::resize(jacobian, numCellsInWorkset, numPoints, spaceDim, spaceDim);
Kokkos::resize(jacobianInverse, numCellsInWorkset, numPoints, spaceDim, spaceDim);
Kokkos::resize(jacobianDeterminant, numCellsInWorkset, numPoints);
Kokkos::resize(cellMeasures, numCellsInWorkset, numPoints);
Kokkos::resize(transformedGradValues, numCellsInWorkset, numFields, numPoints, spaceDim);
Kokkos::resize(transformedWeightedGradValues, numCellsInWorkset, numFields, numPoints, spaceDim);
}
jacobianAndCellMeasureTimer->start();
CellTools::setJacobian(jacobian, cubaturePoints, cellWorkset, cellTopo); // accounted for outside loop, as numCells * flopsPerJacobianPerCell.
CellTools::setJacobianInv(jacobianInverse, jacobian);
CellTools::setJacobianDet(jacobianDeterminant, jacobian);

FunctionSpaceTools::computeCellMeasure(cellMeasures, jacobianDeterminant, cubatureWeights);
ExecutionSpace().fence();
jacobianAndCellMeasureTimer->stop();

// because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls.
fstIntegrateCall->start();
FunctionSpaceTools::HGRADtransformGRAD(transformedGradValues, jacobianInverse, basisGradValues);
transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim) * (spaceDim - 1) * 2.0; // 2: one multiply, one add per (P,D) entry in the contraction.
FunctionSpaceTools::multiplyMeasure(transformedWeightedGradValues, cellMeasures, transformedGradValues);
transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim); // multiply each entry of transformedGradValues: one flop for each.

auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());

FunctionSpaceTools::integrate(cellStiffnessSubview, transformedGradValues, transformedWeightedGradValues);
ExecutionSpace().fence();
fstIntegrateCall->stop();

transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * spaceDim * 2 - 1); // 2: one multiply, one add per (P,D) entry in the contraction.

cellOffset += worksetSize;
}
// std::cout << "standard integration, approximateFlopCount: " << approximateFlopCount << std::endl;
return cellStiffness;
}

#endif /* GRADGRADStandardAssembly_h */
153 changes: 153 additions & 0 deletions packages/intrepid2/assembly-examples/GRADGRADStructuredAssembly.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
//
// GRADGRADStructuredAssembly.hpp
// Trilinos
//
// Created by Roberts, Nathan V on 7/2/21.
//

#ifndef GRADGRADStructuredAssembly_h
#define GRADGRADStructuredAssembly_h

#include "JacobianFlopEstimate.hpp"

/** \file GRADGRADStructuredAssembly.hpp
\brief Locally assembles a Poisson matrix -- an array of shape (C,F,F), with formulation (grad e_i, grad e_j), using "structured" Intrepid2 methods; these algorithmically exploit geometric structure as expressed in the provided CellGeometry.
*/

//! Version that takes advantage of new structured integration support, including sum factorization.
template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
Intrepid2::ScalarView<Scalar,DeviceType> performStructuredQuadratureGRADGRAD(Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &polyOrder, const int &worksetSize,
double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
{
using namespace Intrepid2;

using ExecutionSpace = typename DeviceType::execution_space;

int numVertices = 1;
for (int d=0; d<spaceDim; d++)
{
numVertices *= 2;
}

auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup");
initialSetupTimer->start();
using namespace std;
using FunctionSpaceTools = FunctionSpaceTools<DeviceType>;
using IntegrationTools = IntegrationTools<DeviceType>;
// dimensions of the returned view are (C,F,F)
auto fs = FUNCTION_SPACE_HGRAD;

shards::CellTopology cellTopo = geometry.cellTopology();

auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder);

int numFields = basis->getCardinality();
int numCells = geometry.numCells();

// local stiffness matrix:
ScalarView<Scalar,DeviceType> cellStiffness("cell stiffness matrices",numCells,numFields,numFields);

auto cubature = DefaultCubatureFactory::create<DeviceType>(cellTopo,polyOrder*2);
auto tensorCubatureWeights = cubature->allocateCubatureWeights();
TensorPoints<PointScalar,DeviceType> tensorCubaturePoints = cubature->allocateCubaturePoints();

cubature->getCubature(tensorCubaturePoints, tensorCubatureWeights);

EOperator op = OPERATOR_GRAD;
BasisValues<Scalar,DeviceType> gradientValues = basis->allocateBasisValues(tensorCubaturePoints, op);
basis->getValues(gradientValues, tensorCubaturePoints, op);

// goal here is to do a weighted Poisson; i.e. (f grad u, grad v) on each cell

int cellOffset = 0;

auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()");

Data<PointScalar,DeviceType> jacobian = geometry.allocateJacobianData(tensorCubaturePoints, 0, worksetSize);
Data<PointScalar,DeviceType> jacobianDet = CellTools<DeviceType>::allocateJacobianDet(jacobian);
Data<PointScalar,DeviceType> jacobianInv = CellTools<DeviceType>::allocateJacobianInv(jacobian);
TensorData<PointScalar,DeviceType> cellMeasures = geometry.allocateCellMeasure(jacobianDet, tensorCubatureWeights);

// lazily-evaluated transformed gradient values (temporary to allow integralData allocation)
auto transformedGradientValuesTemp = FunctionSpaceTools::getHGRADtransformGRAD(jacobianInv, gradientValues);
auto integralData = IntegrationTools::allocateIntegralData(transformedGradientValuesTemp, cellMeasures, transformedGradientValuesTemp);

const int numPoints = jacobian.getDataExtent(1); // data extent will be 1 for affine, numPoints for other cases

// TODO: make the below determination accurate for diagonal/block-diagonal cases… (right now, will overcount)
const double flopsPerJacobianPerCell = flopsPerJacobian(spaceDim, numPoints, numVertices);
const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints);
const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints);

transformIntegrateFlopCount = 0;
jacobianCellMeasureFlopCount = numCells * flopsPerJacobianPerCell; // jacobian itself
jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse
jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant
jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P)

auto refData = geometry.getJacobianRefData(tensorCubaturePoints);

initialSetupTimer->stop();
while (cellOffset < numCells)
{
int startCell = cellOffset;
int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell;
int endCell = numCellsInWorkset + startCell;

jacobianAndCellMeasureTimer->start();
if (numCellsInWorkset != worksetSize)
{
const int CELL_DIM = 0; // first dimension corresponds to cell
jacobian.setExtent( CELL_DIM, numCellsInWorkset);
jacobianDet.setExtent( CELL_DIM, numCellsInWorkset);
jacobianInv.setExtent( CELL_DIM, numCellsInWorkset);
integralData.setExtent(CELL_DIM, numCellsInWorkset);

// cellMeasures is a TensorData object with separateFirstComponent_ = true; the below sets the cell dimension…
cellMeasures.setFirstComponentExtentInDimension0(numCellsInWorkset);
}

geometry.setJacobian(jacobian, tensorCubaturePoints, refData, startCell, endCell);
CellTools<DeviceType>::setJacobianDet(jacobianDet, jacobian);
CellTools<DeviceType>::setJacobianInv(jacobianInv, jacobian);

// lazily-evaluated transformed gradient values:
auto transformedGradientValues = FunctionSpaceTools::getHGRADtransformGRAD(jacobianInv, gradientValues);

geometry.computeCellMeasure(cellMeasures, jacobianDet, tensorCubatureWeights);
ExecutionSpace().fence();
jacobianAndCellMeasureTimer->stop();

bool sumInto = false;
double approximateFlopCountIntegrateWorkset = 0;
fstIntegrateCall->start();
IntegrationTools::integrate(integralData, transformedGradientValues, cellMeasures, transformedGradientValues, sumInto, &approximateFlopCountIntegrateWorkset);
ExecutionSpace().fence();
fstIntegrateCall->stop();

// copy into cellStiffness container. (Alternately, do something like allocateIntegralData, but outside this loop, and take a subview to construct the workset integralData.)
if (integralData.getUnderlyingViewRank() == 3)
{
std::pair<int,int> cellRange = {startCell, endCell};
auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());
Kokkos::deep_copy(cellStiffnessSubview, integralData.getUnderlyingView3());
}
else // underlying view rank is 2; copy to each cell in destination stiffness matrix
{
auto integralView2 = integralData.getUnderlyingView2();
auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>({0,0,0},{numCellsInWorkset,numFields,numFields});
Kokkos::parallel_for("copy uniform data to expanded container", policy,
KOKKOS_LAMBDA (const int &cellOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) {
cellStiffness(startCell + cellOrdinal, leftFieldOrdinal, rightFieldOrdinal) = integralView2(leftFieldOrdinal,rightFieldOrdinal);
});
}

transformIntegrateFlopCount += approximateFlopCountIntegrateWorkset;

cellOffset += worksetSize;
}
return cellStiffness;
}

#endif /* GRADGRADStructuredAssembly_h */
Loading

0 comments on commit 13b95f6

Please sign in to comment.