diff --git a/Project.toml b/Project.toml index e3c5eb3fbd..5b1a20788b 100644 --- a/Project.toml +++ b/Project.toml @@ -16,11 +16,14 @@ Tensors = "48a634ad-e948-5137-8d70-aa71f2a747f4" WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192" [weakdeps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b" [extensions] FerriteBlockArrays = "BlockArrays" +FerriteCuda = ["CUDA", "Adapt"] FerriteMetis = "Metis" [compat] @@ -34,6 +37,7 @@ Preferences = "1" Reexport = "1" StaticArrays = "1" Tensors = "1.14" +TimerOutputs = "0.5.25" WriteVTK = "1.13" julia = "1.10" diff --git a/docs/Manifest.toml b/docs/Manifest.toml index d627674895..1c748a4051 100644 --- a/docs/Manifest.toml +++ b/docs/Manifest.toml @@ -2,7 +2,7 @@ julia_version = "1.11.1" manifest_format = "2.0" -project_hash = "4e52f4aa4cee9f66ec4f633f0ae538fbd227ac5e" +project_hash = "f5e9904007b2fd3f4bda9fc8dd33eb629bc31d18" [[deps.ADTypes]] git-tree-sha1 = "eea5d80188827b35333801ef97a40c2ed653b081" @@ -19,6 +19,17 @@ git-tree-sha1 = "574baf8110975760d391c710b6341da1afa48d8c" uuid = "a4c015fc-c6ff-483c-b24f-f7ea428134e9" version = "0.0.1" +[[deps.AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = "1.5.0" +weakdeps = ["ChainRulesCore", "Test"] + + [deps.AbstractFFTs.extensions] + AbstractFFTsChainRulesCoreExt = "ChainRulesCore" + AbstractFFTsTestExt = "Test" + [[deps.AbstractTrees]] git-tree-sha1 = "2d9c9a55f9c93e8887ad391fbae72f8ef55e1177" uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" @@ -113,6 +124,18 @@ weakdeps = ["SparseArrays"] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" version = "1.11.0" +[[deps.Atomix]] +deps = ["UnsafeAtomics"] +git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" +version = "0.1.0" + +[[deps.BFloat16s]] +deps = ["LinearAlgebra", "Printf", "Random", "Test"] +git-tree-sha1 = "2c7cc21e8678eff479978a0a2ef5ce2f51b63dff" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.5.0" + [[deps.BangBang]] deps = ["Accessors", "ConstructionBase", "InitialValues", "LinearAlgebra", "Requires"] git-tree-sha1 = "e2144b631226d9eeab2d746ca8880b7ccff504ae" @@ -185,12 +208,47 @@ git-tree-sha1 = "9e2a6b69137e6969bab0152632dcb3bc108c8bdd" uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" version = "1.0.8+1" +[[deps.CEnum]] +git-tree-sha1 = "389ad5c84de1ae7cf0e28e381131c98ea87d54fc" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.5.0" + [[deps.CPUSummary]] deps = ["CpuId", "IfElse", "PrecompileTools", "Static"] git-tree-sha1 = "5a97e67919535d6841172016c9530fd69494e5ec" uuid = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9" version = "0.2.6" +[[deps.CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "Crayons", "DataFrames", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LLVMLoopInfo", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "NVTX", "Preferences", "PrettyTables", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "StaticArrays", "Statistics", "demumble_jll"] +git-tree-sha1 = "e0725a467822697171af4dae15cec10b4fc19053" +uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" +version = "5.5.2" +weakdeps = ["ChainRulesCore", "EnzymeCore", "SpecialFunctions"] + + [deps.CUDA.extensions] + ChainRulesCoreExt 
= "ChainRulesCore" + EnzymeCoreExt = "EnzymeCore" + SpecialFunctionsExt = "SpecialFunctions" + +[[deps.CUDA_Driver_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "ccd1e54610c222fadfd4737dac66bff786f63656" +uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc" +version = "0.10.3+0" + +[[deps.CUDA_Runtime_Discovery]] +deps = ["Libdl"] +git-tree-sha1 = "33576c7c1b2500f8e7e6baa082e04563203b3a45" +uuid = "1af6417a-86b4-443c-805f-a4643ffb695f" +version = "0.3.5" + +[[deps.CUDA_Runtime_jll]] +deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "e43727b237b2879a34391eeb81887699a26f8f2f" +uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" +version = "0.15.3+0" + [[deps.Cairo_jll]] deps = ["Artifacts", "Bzip2_jll", "CompilerSupportLibraries_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "JLLWrappers", "LZO_jll", "Libdl", "Pixman_jll", "Xorg_libXext_jll", "Xorg_libXrender_jll", "Zlib_jll", "libpng_jll"] git-tree-sha1 = "009060c9a6168704143100f36ab08f06c2af4642" @@ -223,6 +281,12 @@ git-tree-sha1 = "05ba0d07cd4fd8b7a39541e31a7b0254704ea581" uuid = "fb6a15b2-703c-40df-9091-08a04967cfa9" version = "0.1.13" +[[deps.CodeTracking]] +deps = ["InteractiveUtils", "UUIDs"] +git-tree-sha1 = "7eee164f122511d3e4e1ebadb7956939ea7e1c77" +uuid = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2" +version = "1.3.6" + [[deps.CodecZlib]] deps = ["TranscodingStreams", "Zlib_jll"] git-tree-sha1 = "bce6804e5e6044c6daab27bb533d1295e4a2e759" @@ -334,11 +398,28 @@ git-tree-sha1 = "fcbb72b032692610bfbdb15018ac16a36cf2e406" uuid = "adafc99b-e345-5852-983c-f28acb93d879" version = "0.3.1" +[[deps.Crayons]] +git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15" +uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" +version = "4.1.1" + +[[deps.Cthulhu]] +deps = ["CodeTracking", "FoldingTrees", "InteractiveUtils", "JuliaSyntax", "PrecompileTools", "Preferences", "REPL", "TypedSyntax", "UUIDs", "Unicode", "WidthLimitedIO"] +git-tree-sha1 = "9f167682b994c3d6f841d2d7cda444bf3b918e37" +uuid = "f68482b8-f384-11e8-15f7-abe071a5a75f" +version = "2.15.2" + [[deps.DataAPI]] git-tree-sha1 = "abe83f3a2f1b857aac70ef8b269080af17764bbe" uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" version = "1.16.0" +[[deps.DataFrames]] +deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] +git-tree-sha1 = "fb61b4812c49343d7ef0b533ba982c46021938a6" +uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +version = "1.7.0" + [[deps.DataStructures]] deps = ["Compat", "InteractiveUtils", "OrderedCollections"] git-tree-sha1 = "1d0a14036acb104d9e89698bd408f63ab58cdc82" @@ -368,10 +449,10 @@ uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" version = "1.9.1" [[deps.DiffEqBase]] -deps = ["ArrayInterface", "ConcreteStructs", "DataStructures", "DocStringExtensions", "EnumX", "EnzymeCore", "FastBroadcast", "FastClosures", "ForwardDiff", "FunctionWrappers", "FunctionWrappersWrappers", "LinearAlgebra", "Logging", "Markdown", "MuladdMacro", "Parameters", "PreallocationTools", "PrecompileTools", "Printf", "RecursiveArrayTools", "Reexport", "SciMLBase", "SciMLOperators", "SciMLStructures", "Setfield", "Static", "StaticArraysCore", "Statistics", "Tricks", "TruncatedStacktraces"] -git-tree-sha1 = "8977ef8249b602e4cb46ddbaf3c51e6adc2958c7" +deps = 
["ArrayInterface", "ConcreteStructs", "DataStructures", "DocStringExtensions", "EnumX", "EnzymeCore", "FastBroadcast", "FastClosures", "ForwardDiff", "FunctionWrappers", "FunctionWrappersWrappers", "LinearAlgebra", "Logging", "Markdown", "MuladdMacro", "Parameters", "PreallocationTools", "PrecompileTools", "Printf", "RecursiveArrayTools", "Reexport", "SciMLBase", "SciMLOperators", "SciMLStructures", "Setfield", "Static", "StaticArraysCore", "Statistics", "TruncatedStacktraces"] +git-tree-sha1 = "9c36ac68cf6c59a1c2569aadd7a62c47ed5c8eb5" uuid = "2b5f629d-d688-5b77-993f-72d75c75574e" -version = "6.157.0" +version = "6.158.1" [deps.DiffEqBase.extensions] DiffEqBaseCUDAExt = "CUDA" @@ -414,10 +495,10 @@ uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" version = "1.15.1" [[deps.DifferentiationInterface]] -deps = ["ADTypes", "Compat", "LinearAlgebra", "PackageExtensionCompat"] -git-tree-sha1 = "75d1716eb46e1b77304f7d79ec9ac6a4e6185d02" +deps = ["ADTypes", "LinearAlgebra"] +git-tree-sha1 = "76ea82755a5c93b7b26d4fca990854fd2fc74b6e" uuid = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" -version = "0.6.7" +version = "0.6.11" [deps.DifferentiationInterface.extensions] DifferentiationInterfaceChainRulesCoreExt = "ChainRulesCore" @@ -432,6 +513,7 @@ version = "0.6.7" DifferentiationInterfaceReverseDiffExt = "ReverseDiff" DifferentiationInterfaceSparseArraysExt = "SparseArrays" DifferentiationInterfaceSparseMatrixColoringsExt = "SparseMatrixColorings" + DifferentiationInterfaceStaticArraysExt = "StaticArrays" DifferentiationInterfaceSymbolicsExt = "Symbolics" DifferentiationInterfaceTrackerExt = "Tracker" DifferentiationInterfaceZygoteExt = ["Zygote", "ForwardDiff"] @@ -449,6 +531,7 @@ version = "0.6.7" ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SparseMatrixColorings = "0a514795-09f3-496d-8182-132a7b665d35" + StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Symbolics = "0c5d862f-8b57-4792-8d23-62f2024744c7" Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" @@ -577,17 +660,21 @@ uuid = "29a986be-02c6-4525-aec4-84b980013641" version = "2.0.4" [[deps.Ferrite]] -deps = ["EnumX", "ForwardDiff", "LinearAlgebra", "NearestNeighbors", "OrderedCollections", "Preferences", "Reexport", "SparseArrays", "StaticArrays", "Tensors", "WriteVTK"] +deps = ["Cthulhu", "EnumX", "ForwardDiff", "KernelAbstractions", "LinearAlgebra", "NearestNeighbors", "OrderedCollections", "Preferences", "Reexport", "SparseArrays", "StaticArrays", "Tensors", "WriteVTK"] path = ".." 
uuid = "c061ca5d-56c9-439f-9c0e-210fe06d3992" version = "1.0.0" [deps.Ferrite.extensions] FerriteBlockArrays = "BlockArrays" + FerriteCuda = ["CUDA", "Adapt"] FerriteMetis = "Metis" [deps.Ferrite.weakdeps] + Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" + BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e" + CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b" [[deps.FerriteGmsh]] @@ -652,6 +739,12 @@ git-tree-sha1 = "05882d6995ae5c12bb5f36dd2ed3f61c98cbb172" uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" version = "0.8.5" +[[deps.FoldingTrees]] +deps = ["AbstractTrees", "REPL"] +git-tree-sha1 = "d94efd85f2fe192cdf664aa8b7c431592faed59e" +uuid = "1eca21be-9b9b-4ed8-839a-6d8ae26b1781" +version = "1.2.1" + [[deps.Fontconfig_jll]] deps = ["Artifacts", "Bzip2_jll", "Expat_jll", "FreeType2_jll", "JLLWrappers", "Libdl", "Libuuid_jll", "Zlib_jll"] git-tree-sha1 = "db16beca600632c95fc8aca29890d83788dd8b23" @@ -718,12 +811,24 @@ deps = ["Artifacts", "Libdl"] uuid = "781609d7-10c4-51f6-84f2-b8444358ff6d" version = "6.3.0+0" +[[deps.GPUArrays]] +deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] +git-tree-sha1 = "62ee71528cca49be797076a76bdc654a170a523e" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = "10.3.1" + [[deps.GPUArraysCore]] deps = ["Adapt"] git-tree-sha1 = "ec632f177c0d990e64d955ccc1b8c04c485a0950" uuid = "46192b85-c4d5-4398-a991-12ede77f4527" version = "0.1.6" +[[deps.GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "PrecompileTools", "Preferences", "Scratch", "Serialization", "TOML", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "1d6f290a5eb1201cd63574fbc4440c788d5cb38f" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.27.8" + [[deps.GR]] deps = ["Artifacts", "Base64", "DelimitedFiles", "Downloads", "GR_jll", "HTTP", "JSON", "Libdl", "LinearAlgebra", "Preferences", "Printf", "Qt6Wayland_jll", "Random", "Serialization", "Sockets", "TOML", "Tar", "Test", "p7zip_jll"] git-tree-sha1 = "629693584cef594c3f6f99e76e7a7ad17e60e8d5" @@ -840,6 +945,19 @@ git-tree-sha1 = "4da0f88e9a39111c2fa3add390ab15f3a44f3ca3" uuid = "22cec73e-a1b8-11e9-2c92-598750a2cf9c" version = "0.3.1" +[[deps.InlineStrings]] +git-tree-sha1 = "45521d31238e87ee9f9732561bfee12d4eebd52d" +uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" +version = "1.4.2" + + [deps.InlineStrings.extensions] + ArrowTypesExt = "ArrowTypes" + ParsersExt = "Parsers" + + [deps.InlineStrings.weakdeps] + ArrowTypes = "31f734f8-188a-4ce0-8406-c8a06bd891cd" + Parsers = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" + [[deps.IntelOpenMP_jll]] deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl"] git-tree-sha1 = "10bd689145d2c3b2a9844005d01087cc1194e79e" @@ -861,6 +979,11 @@ weakdeps = ["Dates", "Test"] InverseFunctionsDatesExt = "Dates" InverseFunctionsTestExt = "Test" +[[deps.InvertedIndices]] +git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038" +uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" +version = "1.3.0" + [[deps.IrrationalConstants]] git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2" uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" @@ -885,9 +1008,9 @@ version = "0.1.8" [[deps.JLLWrappers]] deps = ["Artifacts", "Preferences"] -git-tree-sha1 = "f389674c99bfcde17dc57454011aa44d5a260a40" +git-tree-sha1 = "be3dc50a92e5a386872a493a10050136d4703f9b" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.6.0" +version = 
"1.6.1" [[deps.JSON]] deps = ["Dates", "Mmap", "Parsers", "Unicode"] @@ -919,12 +1042,35 @@ git-tree-sha1 = "25ee0be4d43d0269027024d75a24c24d6c6e590c" uuid = "aacddb02-875f-59d6-b918-886e6ef4fbf8" version = "3.0.4+0" +[[deps.JuliaNVTXCallbacks_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "af433a10f3942e882d3c671aacb203e006a5808f" +uuid = "9c1d0b0a-7046-5b2e-a33f-ea22f176ac7e" +version = "0.2.1+0" + +[[deps.JuliaSyntax]] +git-tree-sha1 = "937da4713526b96ac9a178e2035019d3b78ead4a" +uuid = "70703baa-626e-46a2-a12c-08ffd08c73b4" +version = "0.4.10" + [[deps.KLU]] deps = ["LinearAlgebra", "SparseArrays", "SuiteSparse_jll"] git-tree-sha1 = "07649c499349dad9f08dde4243a4c597064663e9" uuid = "ef3ab10e-7fda-4108-b977-705223b18434" version = "0.6.0" +[[deps.KernelAbstractions]] +deps = ["Adapt", "Atomix", "InteractiveUtils", "MacroTools", "PrecompileTools", "Requires", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "04e52f596d0871fa3890170fa79cb15e481e4cd8" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.9.28" +weakdeps = ["EnzymeCore", "LinearAlgebra", "SparseArrays"] + + [deps.KernelAbstractions.extensions] + EnzymeExt = "EnzymeCore" + LinearAlgebraExt = "LinearAlgebra" + SparseArraysExt = "SparseArrays" + [[deps.Krylov]] deps = ["LinearAlgebra", "Printf", "SparseArrays"] git-tree-sha1 = "267dad6b4b7b5d529c76d40ff48d33f7e94cb834" @@ -943,6 +1089,27 @@ git-tree-sha1 = "bf36f528eec6634efc60d7ec062008f171071434" uuid = "88015f11-f218-50d7-93a8-a6af411a945d" version = "3.0.0+1" +[[deps.LLVM]] +deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Preferences", "Printf", "Requires", "Unicode"] +git-tree-sha1 = "4ad43cb0a4bb5e5b1506e1d1f48646d7e0c80363" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "9.1.2" +weakdeps = ["BFloat16s"] + + [deps.LLVM.extensions] + BFloat16sExt = "BFloat16s" + +[[deps.LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "05a8bd5a42309a9ec82f700876903abce1017dd3" +uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" +version = "0.0.34+0" + +[[deps.LLVMLoopInfo]] +git-tree-sha1 = "2e5c102cfc41f48ae4740c7eca7743cc7e7b75ea" +uuid = "8b046642-f1f6-4319-8d3c-209ddc03c586" +version = "1.0.0" + [[deps.LLVMOpenMP_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] git-tree-sha1 = "78211fb6cbc872f77cad3fc0b6cf647d923f4929" @@ -1330,6 +1497,18 @@ git-tree-sha1 = "a0b464d183da839699f4c79e7606d9d186ec172c" uuid = "d41bc354-129a-5804-8e4c-c37616107c6c" version = "7.8.3" +[[deps.NVTX]] +deps = ["Colors", "JuliaNVTXCallbacks_jll", "Libdl", "NVTX_jll"] +git-tree-sha1 = "53046f0483375e3ed78e49190f1154fa0a4083a1" +uuid = "5da4648a-3479-48b8-97b9-01cb529c0a1f" +version = "0.3.4" + +[[deps.NVTX_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "ce3269ed42816bf18d500c9f63418d4b0d9f5a3b" +uuid = "e98f9f5b-d649-5603-91fd-7774390e6439" +version = "3.1.0+2" + [[deps.NaNMath]] deps = ["OpenLibm_jll"] git-tree-sha1 = "0877504529a3e5c3343c6f8b4c0381e57e4387e4" @@ -1478,9 +1657,9 @@ version = "1.1.2" [[deps.OrdinaryDiffEqCore]] deps = ["ADTypes", "Accessors", "Adapt", "ArrayInterface", "DataStructures", "DiffEqBase", "DocStringExtensions", "EnumX", "FastBroadcast", "FastClosures", "FillArrays", "FunctionWrappersWrappers", "InteractiveUtils", "LinearAlgebra", "Logging", "MacroTools", "MuladdMacro", "Polyester", "PrecompileTools", "Preferences", "RecursiveArrayTools", "Reexport", "SciMLBase", "SciMLOperators", "SciMLStructures", "SimpleUnPack", "Static", 
"StaticArrayInterface", "StaticArraysCore", "TruncatedStacktraces"] -git-tree-sha1 = "33e4292e832d439c3706410ae867f3c091d79155" +git-tree-sha1 = "f4b7b11e2e4c0f4a7fe15d2edcec1e1ce2917d67" uuid = "bbf590c4-e513-4bbe-9b18-05decba2e5d8" -version = "1.7.0" +version = "1.7.1" weakdeps = ["EnzymeCore"] [deps.OrdinaryDiffEqCore.extensions] @@ -1741,6 +1920,12 @@ git-tree-sha1 = "645bed98cd47f72f67316fd42fc47dee771aefcd" uuid = "1d0040c9-8b98-4ee7-8388-3f51789ca0ad" version = "0.2.2" +[[deps.PooledArrays]] +deps = ["DataAPI", "Future"] +git-tree-sha1 = "36d8b4b899628fb92c2749eb488d884a926614d3" +uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" +version = "1.4.3" + [[deps.PositiveFactorizations]] deps = ["LinearAlgebra"] git-tree-sha1 = "17275485f373e6673f7e7f97051f703ed5b15b20" @@ -1771,6 +1956,12 @@ git-tree-sha1 = "9306f6085165d270f7e3db02af26a400d580f5c6" uuid = "21216c6a-2e73-6563-6e65-726566657250" version = "1.4.3" +[[deps.PrettyTables]] +deps = ["Crayons", "LaTeXStrings", "Markdown", "PrecompileTools", "Printf", "Reexport", "StringManipulation", "Tables"] +git-tree-sha1 = "1101cd475833706e4d0e7b122218257178f48f34" +uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" +version = "2.4.0" + [[deps.Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" @@ -1816,6 +2007,18 @@ deps = ["SHA"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" version = "1.11.0" +[[deps.Random123]] +deps = ["Random", "RandomNumbers"] +git-tree-sha1 = "4743b43e5a9c4a2ede372de7061eed81795b12e7" +uuid = "74087812-796a-5b5d-8853-05524746bad3" +version = "1.7.0" + +[[deps.RandomNumbers]] +deps = ["Random"] +git-tree-sha1 = "c6ec94d2aaba1ab2ff983052cf6a606ca5985902" +uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" +version = "1.6.0" + [[deps.RecipesBase]] deps = ["PrecompileTools"] git-tree-sha1 = "5c3d09cc4f31f5fc6af001c250bf1278733100ff" @@ -1970,6 +2173,12 @@ git-tree-sha1 = "3bac05bc7e74a75fd9cba4295cde4045d9fe2386" uuid = "6c6a2e73-6563-6170-7368-637461726353" version = "1.2.1" +[[deps.SentinelArrays]] +deps = ["Dates", "Random"] +git-tree-sha1 = "ff11acffdb082493657550959d4feb4b6149e73a" +uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c" +version = "1.4.5" + [[deps.Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" version = "1.11.0" @@ -2041,10 +2250,10 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" version = "1.11.0" [[deps.SparseConnectivityTracer]] -deps = ["ADTypes", "DocStringExtensions", "FillArrays", "LinearAlgebra", "Random", "Requires", "SparseArrays"] -git-tree-sha1 = "e5efbf152d3e14a513f19a9119b810340b7ac86b" +deps = ["ADTypes", "DocStringExtensions", "FillArrays", "LinearAlgebra", "Random", "SparseArrays"] +git-tree-sha1 = "35d346f6aa78814859f1f69cd4f41c24108afa00" uuid = "9f842d2f-2579-4b1d-911e-f412cf18a3f5" -version = "0.6.6" +version = "0.6.7" [deps.SparseConnectivityTracer.extensions] SparseConnectivityTracerDataInterpolationsExt = "DataInterpolations" @@ -2061,10 +2270,10 @@ version = "0.6.6" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" [[deps.SparseDiffTools]] -deps = ["ADTypes", "Adapt", "ArrayInterface", "Compat", "DataStructures", "FiniteDiff", "ForwardDiff", "Graphs", "LinearAlgebra", "PackageExtensionCompat", "Random", "Reexport", "SciMLOperators", "Setfield", "SparseArrays", "StaticArrayInterface", "StaticArrays", "Tricks", "UnPack", "VertexSafeGraphs"] -git-tree-sha1 = "7db12cef226aaa8ca730040c9c35e32b86a69b83" +deps = ["ADTypes", "Adapt", "ArrayInterface", "Compat", "DataStructures", "FiniteDiff", "ForwardDiff", "Graphs", "LinearAlgebra", "PackageExtensionCompat", 
"Random", "Reexport", "SciMLOperators", "Setfield", "SparseArrays", "StaticArrayInterface", "StaticArrays", "UnPack", "VertexSafeGraphs"] +git-tree-sha1 = "b906758c107b049b6b71599b9f928d9b14e5554a" uuid = "47a9eef4-7e08-11e9-0b38-333d64bd3804" -version = "2.22.0" +version = "2.23.0" [deps.SparseDiffTools.extensions] SparseDiffToolsEnzymeExt = "Enzyme" @@ -2081,10 +2290,14 @@ version = "2.22.0" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [[deps.SparseMatrixColorings]] -deps = ["ADTypes", "Compat", "DataStructures", "DocStringExtensions", "LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "ad17e2069015839e58a1f9275608b405fae1f28e" +deps = ["ADTypes", "DataStructures", "DocStringExtensions", "LinearAlgebra", "Random", "SparseArrays"] +git-tree-sha1 = "ccc32032d8f6790ad198c99fb8ef20d8100a0de4" uuid = "0a514795-09f3-496d-8182-132a7b665d35" -version = "0.4.6" +version = "0.4.7" +weakdeps = ["Colors"] + + [deps.SparseMatrixColorings.extensions] + SparseMatrixColoringsColorsExt = "Colors" [[deps.Sparspak]] deps = ["Libdl", "LinearAlgebra", "Logging", "OffsetArrays", "Printf", "SparseArrays", "Test"] @@ -2174,6 +2387,12 @@ git-tree-sha1 = "b765e46ba27ecf6b44faf70df40c57aa3a547dcb" uuid = "69024149-9ee7-55f6-a4c4-859efe599b68" version = "0.3.7" +[[deps.StringManipulation]] +deps = ["PrecompileTools"] +git-tree-sha1 = "a6b1675a536c5ad1a60e5a5153e1fee12eb146e3" +uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e" +version = "0.4.0" + [[deps.StructTypes]] deps = ["Dates", "UUIDs"] git-tree-sha1 = "159331b30e94d7b11379037feeb9b690950cace8" @@ -2262,17 +2481,18 @@ git-tree-sha1 = "be986ad9dac14888ba338c2554dcfec6939e1393" uuid = "d5829a12-d9aa-46ab-831f-fb7c9ab06edf" version = "0.2.1" -[[deps.Tricks]] -git-tree-sha1 = "7822b97e99a1672bfb1b49b668a6d46d58d8cbcb" -uuid = "410a4b4d-49e4-4fbc-ab6d-cb71b17b3775" -version = "0.1.9" - [[deps.TruncatedStacktraces]] deps = ["InteractiveUtils", "MacroTools", "Preferences"] git-tree-sha1 = "ea3e54c2bdde39062abf5a9758a23735558705e1" uuid = "781d530d-4396-4725-bb49-402e4bee1e77" version = "1.4.0" +[[deps.TypedSyntax]] +deps = ["CodeTracking", "JuliaSyntax"] +git-tree-sha1 = "1465a8187b3d512a99fef13244c213b54e34615d" +uuid = "d265eb64-f81a-44ad-a842-4247ee1503de" +version = "1.4.2" + [[deps.URIs]] git-tree-sha1 = "67db6cc7b3821e19ebe75791a9dd19c9b1188f2b" uuid = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" @@ -2315,6 +2535,17 @@ git-tree-sha1 = "975c354fcd5f7e1ddcc1f1a23e6e091d99e99bc8" uuid = "45397f5d-5981-4c77-b2b3-fc36d6e9b728" version = "1.6.4" +[[deps.UnsafeAtomics]] +git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" +uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" +version = "0.2.1" + +[[deps.UnsafeAtomicsLLVM]] +deps = ["LLVM", "UnsafeAtomics"] +git-tree-sha1 = "2d17fabcd17e67d7625ce9c531fb9f40b7c42ce4" +uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" +version = "0.2.1" + [[deps.Unzip]] git-tree-sha1 = "ca0969166a028236229f63514992fc073799bb78" uuid = "41fe7b60-77ed-43a1-b4f0-825fd5a5650d" @@ -2355,6 +2586,12 @@ git-tree-sha1 = "93f43ab61b16ddfb2fd3bb13b3ce241cafb0e6c9" uuid = "2381bf8a-dfd0-557d-9999-79630e7b1b91" version = "1.31.0+0" +[[deps.WidthLimitedIO]] +deps = ["Unicode"] +git-tree-sha1 = "71142739e695823729a335e9bc124ef41ec1433a" +uuid = "b8c1c048-cf81-46c6-9da0-18c1d99e41f2" +version = "1.0.1" + [[deps.WriteVTK]] deps = ["Base64", "CodecZlib", "FillArrays", "LightXML", "TranscodingStreams", "VTKBase"] git-tree-sha1 = "e453383a1486a9020a3323bf3665ea31106ebe9a" @@ -2546,6 +2783,12 @@ git-tree-sha1 = "555d1076590a6cc2fdee2ef1469451f872d8b41b" uuid 
= "3161d3a3-bdf6-5164-811a-617609db77b4" version = "1.5.6+1" +[[deps.demumble_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "6498e3581023f8e530f34760d18f75a69e3a4ea8" +uuid = "1e29f10c-031c-5a83-9565-69cddfc27673" +version = "1.3.0+0" + [[deps.eudev_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gperf_jll"] git-tree-sha1 = "431b678a28ebb559d224c0b6b6d01afce87c51ba" diff --git a/docs/Project.toml b/docs/Project.toml index ead8045637..8b56fa1c2e 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,7 @@ [deps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Changelog = "5217a498-cd5d-4ec6-b8c2-9b85a09b6e3e" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244" diff --git a/docs/src/literate-tutorials/gpu_qp_heat_equation.jl b/docs/src/literate-tutorials/gpu_qp_heat_equation.jl new file mode 100644 index 0000000000..bb0699b26b --- /dev/null +++ b/docs/src/literate-tutorials/gpu_qp_heat_equation.jl @@ -0,0 +1,202 @@ +using Ferrite +using StaticArrays +using SparseArrays +using CUDA + + +left = Tensor{1, 2, Float32}((0, -0)) # define the left bottom corner of the grid. +right = Tensor{1, 2, Float32}((1.0, 1.0)) # define the right top corner of the grid. +grid = generate_grid(Quadrilateral, (20, 20), left, right) + + +ip = Lagrange{RefQuadrilateral, 2}() # define the interpolation function (i.e. Bilinear lagrange) +qr = QuadratureRule{RefQuadrilateral}(Float32, 3) +cellvalues = CellValues(Float32, qr, ip) + + +dh = DofHandler(grid) +add!(dh, :u, ip) +close!(dh); + + +# Standard assembly of the element. +function assemble_element_std!(Ke::Matrix, fe::Vector, cellvalues::CellValues) + n_basefuncs = getnbasefunctions(cellvalues) + ## Loop over quadrature points + for q_point in 1:getnquadpoints(cellvalues) + ## Get the quadrature weight + dΩ = getdetJdV(cellvalues, q_point) + ## Loop over test shape functions + for i in 1:n_basefuncs + δu = shape_value(cellvalues, q_point, i) + ∇δu = shape_gradient(cellvalues, q_point, i) + ## Add contribution to fe + fe[i] += δu * dΩ + ## Loop over trial shape functions + for j in 1:n_basefuncs + ∇u = shape_gradient(cellvalues, q_point, j) + ## Add contribution to Ke + Ke[i, j] += (∇δu ⋅ ∇u) * dΩ + end + end + end + return Ke, fe +end + + +function assemble_element_qpiter!(Ke::Matrix, fe::Vector, cellvalues, cell_coords::AbstractVector) + n_basefuncs = getnbasefunctions(cellvalues) + ## Loop over quadrature points + for qv in Ferrite.QuadratureValuesIterator(cellvalues, cell_coords) + ## Get the quadrature weight + dΩ = getdetJdV(qv) + ## Loop over test shape functions + for i in 1:n_basefuncs + δu = shape_value(qv, i) + ∇δu = shape_gradient(qv, i) + ## Add contribution to fe + fe[i] += δu * dΩ + ## Loop over trial shape functions + for j in 1:n_basefuncs + ∇u = shape_gradient(qv, j) + ## Add contribution to Ke + Ke[i, j] += (∇δu ⋅ ∇u) * dΩ + end + end + end + return Ke, fe +end + +# Standard global assembly +function assemble_global_qp!(cellvalues, dh::DofHandler, K, f) + n_basefuncs = getnbasefunctions(cellvalues) + Ke = zeros(eltype(K), n_basefuncs, n_basefuncs) + fe = zeros(eltype(f), n_basefuncs) + assembler = start_assemble(K, f) + ## Loop over all cels + for cell in CellIterator(dh) + fill!(Ke, 0) + fill!(fe, 0) + assemble_element_qpiter!(Ke, fe, cellvalues, getcoordinates(cell)) + ## Assemble Ke and fe into K and f + assemble!(assembler, celldofs(cell), 
Ke, fe) + end + return K, f +end + +function assemble_global_std!(cellvalues, dh::DofHandler, K, f) + n_basefuncs = getnbasefunctions(cellvalues) + Ke = zeros(eltype(K), n_basefuncs, n_basefuncs) + fe = zeros(eltype(f), n_basefuncs) + assembler = start_assemble(K, f) + ## Loop over all cels + for cell in CellIterator(dh) + fill!(Ke, 0) + fill!(fe, 0) + ## Reinitialize cellvalues for this cell + reinit!(cellvalues, cell) + ## Compute element contribution + assemble_element_std!(Ke, fe, cellvalues) + ## Assemble Ke and fe into K and f + assemble!(assembler, celldofs(cell), Ke, fe) + end + return K, f +end + + +## gpu version of element assembly +function assemble_element!(Ke, fe, cv, cell) + n_basefuncs = getnbasefunctions(cv) + for qv in Ferrite.QuadratureValuesIterator(cv, getcoordinates(cell)) + ## Get the quadrature weight + dΩ = getdetJdV(qv) + ## Loop over test shape functions + for i in 1:n_basefuncs + δu = shape_value(qv, i) + ∇u = shape_gradient(qv, i) + ## Add contribution to fe + fe[i] += δu * dΩ + ## fe_shared[tx,i] += δu * dΩ + ## Loop over trial shape functions + for j in 1:n_basefuncs + ∇δu = shape_gradient(qv, j) + ## Add contribution to Ke + Ke[i, j] += (∇δu ⋅ ∇u) * dΩ + end + end + end + return +end + + +## gpu version of global assembly +function assemble_gpu!(Kgpu, fgpu, dh, cv; mem_alloc::AbstractMemAlloc) + assembler = start_assemble(Kgpu, fgpu; fillzero = false) ## has to be always false + for cell in CellIterator(dh, mem_alloc) + Ke = cellke(cell) + fe = cellfe(cell) + assemble_element!(Ke, fe, cv, cell) + assemble!(assembler, celldofs(cell), Ke, fe) + end + return nothing +end + + +n_basefuncs = getnbasefunctions(cellvalues) |> Int32 + +## Allocate CPU matrix +## K = allocate_matrix(SparseMatrixCSC{Float64, Int64}, dh); +#f = zeros(eltype(K), ndofs(dh)); + + +## Allocate GPU matrix +## commented to pass the test +## Kgpu = allocate_matrix(CUSPARSE.CuSparseMatrixCSC{Float32, Int32}, dh) +## fgpu = CUDA.zeros(Float32, ndofs(dh)); + +n_cells = dh |> get_grid |> getncells |> Int32 + + +# Kernel configuration +## GPU kernel ## +## commented to pass the test +## First init the kernel with the required config. +## gpu_kernel = init_kernel(BackendCUDA, n_cells, n_basefuncs, assemble_gpu!, (Kgpu, fgpu, dh, cellvalues)); +## Then launch the kernel +## gpu_kernel |> launch! 
or gpu_kernel()
+## gpu_kernel()
+
+## CPU kernel ##
+## cpu_kernel = init_kernel(BackendCPU, n_cells, n_basefuncs, assemble_gpu!, (K, f, dh, cellvalues));
+## cpu_kernel()
+
+
+## commented to pass the test
+## norm(Kgpu)
+
+
+## GPU Benchmarking, remove when not needed ##
+## function setup_bench_gpu(n_cells, n_basefuncs, cellvalues, dh)
+##     Kgpu = allocate_matrix(CUSPARSE.CuSparseMatrixCSC{Float32, Int32}, dh)
+##     fgpu = CUDA.zeros(eltype(Kgpu), ndofs(dh));
+##     gpu_kernel = init_kernel(BackendCUDA, n_cells, n_basefuncs, assemble_gpu!, (Kgpu, fgpu, dh, cellvalues))
+## end
+
+## CUDA.@time setup_bench_gpu(n_cells, n_basefuncs, cellvalues, dh)
+## CUDA.@profile trace = true setup_bench_gpu(n_cells, n_basefuncs, cellvalues, dh)
+## CUDA.@time gpu_kernel()
+## CUDA.@profile trace = true gpu_kernel()
+
+
+# ## CPU Benchmarking, remove when not needed ##
+## function setup_bench_cpu(dh)
+##     K = allocate_matrix(SparseMatrixCSC{Float64, Int}, dh)
+##     f = zeros(eltype(K), ndofs(dh));
+##     return K, f
+## end
+
+## using BenchmarkTools
+## @benchmark setup_bench_cpu($dh)
+## K, f = setup_bench_cpu(dh)
+## @benchmark assemble_global_std!($cellvalues, $dh, $K, $f)
+## @benchmark assemble_global_qp!($cellvalues, $dh, $K, $f)
diff --git a/ext/FerriteCuda.jl b/ext/FerriteCuda.jl
new file mode 100644
index 0000000000..47f9d5c8e4
--- /dev/null
+++ b/ext/FerriteCuda.jl
@@ -0,0 +1,21 @@
+module FerriteCuda
+# This module represents an extension of Ferrite.jl that uses a GPU backend for assembly, namely CUDA.jl
+
+using Ferrite
+using CUDA
+using Adapt
+using Base:
+    @propagate_inbounds
+using SparseArrays:
+    AbstractSparseArray, SparseMatrixCSC
+using StaticArrays:
+    SVector, MVector
+
+
+include("GPU/gpu_assembler.jl")
+include("GPU/cuda_mem_alloc.jl")
+include("GPU/CUDAKernelLauncher.jl")
+include("GPU/cuda_iterator.jl")
+include("GPU/adapt.jl")
+include("GPU/cuda_sparsity_pattern.jl")
+end
diff --git a/ext/GPU/CUDAKernelLauncher.jl b/ext/GPU/CUDAKernelLauncher.jl
new file mode 100644
index 0000000000..02d5ad85e1
--- /dev/null
+++ b/ext/GPU/CUDAKernelLauncher.jl
@@ -0,0 +1,87 @@
+## This file handles the launch of GPU kernels on the CUDA backend ##
+abstract type AbstractCudaKernel <: Ferrite.AbstractKernel{BackendCUDA} end
+
+
+struct CudaKernel{MemAlloc <: AbstractCudaMemAlloc, Ti <: Integer} <: AbstractCudaKernel
+    n_cells::Ti
+    n_basefuncs::Ti
+    kernel::Function
+    args::Tuple
+    mem_alloc::MemAlloc
+    threads::Ti
+    blocks::Ti
+end
+
+
+function Ferrite.init_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
+    if CUDA.functional()
+        threads = convert(Ti, min(n_cells, 256))
+        blocks = _calculate_nblocks(threads, n_cells)
+        adapted_args = _adapt_args(args)
+
+        mem_alloc = try_allocate_shared_mem(Float32, threads, n_basefuncs)
+        mem_alloc isa Nothing || return CudaKernel(n_cells, n_basefuncs, kernel, adapted_args, mem_alloc, threads, blocks)
+
+        # FIXME: mem_alloc adapted twice here and in launch!
+        mem_alloc = allocate_global_mem(Float32, n_cells, n_basefuncs) |> _adapt
+        return CudaKernel(n_cells, n_basefuncs, kernel, adapted_args, mem_alloc, threads, blocks)
+    else
+        throw(ArgumentError("CUDA is not functional, please check your GPU driver and CUDA installation"))
+    end
+end
+
+"""
+    Ferrite.launch!(kernel::CudaKernel)
+
+Launch a CUDA kernel encapsulated in a `CudaKernel` object.
+
+# Arguments
+- `kernel::CudaKernel`: The kernel to be launched, along with its configuration.
+ +# Returns +- `nothing`: Indicates that the kernel was launched and synchronized successfully. +""" +function Ferrite.launch!(kernel::CudaKernel{SharedMemAlloc{N, M, Tv, Ti}, Ti}) where {N, M, Tv, Ti} + ker = kernel.kernel + args = kernel.args + blocks = kernel.blocks + threads = kernel.threads + shmem_size = mem_size(kernel.mem_alloc) + kwargs = (mem_alloc = kernel.mem_alloc,) + kernel_fun = () -> ker(args...; kwargs...) + CUDA.@sync @cuda blocks = blocks threads = threads shmem = shmem_size kernel_fun() + + return nothing +end + + +function Ferrite.launch!(kernel::CudaKernel{GlobalMemAlloc{LOCAL_MATRICES, LOCAL_VECTORS}, Ti}) where {LOCAL_MATRICES, LOCAL_VECTORS, Ti} + ker = kernel.kernel + args = kernel.args + blocks = kernel.blocks + threads = kernel.threads + kwargs = (mem_alloc = kernel.mem_alloc,) + kernel_fun = () -> ker(args...; kwargs...) + CUDA.@sync @cuda blocks = blocks threads = threads kernel_fun() + return nothing +end + +""" + _calculate_nblocks(threads::Integer, n_cells::Integer) + +Calculate the number of blocks required for kernel execution. + +# Arguments +- `threads::Integer`: Number of threads per block. +- `n_cells::Integer`: Total number of cells to process. + +# Returns +- `Integer`: Number of blocks to launch. +""" +function _calculate_nblocks(threads::Ti, n_cells::Ti) where {Ti <: Integer} + dev = device() + no_sms = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) + required_blocks = cld(n_cells, threads) + required_blocks < 2 * no_sms || return convert(Ti, 2 * no_sms) + return convert(Ti, required_blocks) +end diff --git a/ext/GPU/adapt.jl b/ext/GPU/adapt.jl new file mode 100644 index 0000000000..8779f1f7d1 --- /dev/null +++ b/ext/GPU/adapt.jl @@ -0,0 +1,108 @@ +# This file defines the adapt_structure function, which is used to adapt custom structures to be used on the GPU. + +Adapt.@adapt_structure GPUGrid +Adapt.@adapt_structure GPUDofHandler +Adapt.@adapt_structure GlobalMemAlloc + +function _adapt_args(args) + return tuple(((_adapt(arg) for arg in args) |> collect)...) 
+end + + +function _adapt(kgpu::CUSPARSE.CuSparseMatrixCSC) + # custom adaptation + return Adapt.adapt_structure(CUSPARSE.CuSparseDeviceMatrixCSC, kgpu) +end + + +function _adapt(obj::Any) + # fallback to the default implementation + return Adapt.adapt_structure(CuArray, obj) +end + +## Adapt GlobalMemAlloc +function Adapt.adapt_structure(to, mem_alloc::GlobalMemAlloc) + kes = Adapt.adapt_structure(to, mem_alloc.Kes) + fes = Adapt.adapt_structure(to, mem_alloc.fes) + return GlobalMemAlloc(kes, fes) +end + + +function Adapt.adapt_structure(to, cv::CellValues) + fv = Adapt.adapt(to, StaticInterpolationValues(cv.fun_values)) + gm = Adapt.adapt(to, StaticInterpolationValues(cv.geo_mapping)) + weights = Adapt.adapt(to, ntuple(i -> getweights(cv.qr)[i], getnquadpoints(cv))) + return Ferrite.StaticCellValues(fv, gm, weights) +end + + +function Adapt.adapt_structure(to, iter::Ferrite.QuadratureValuesIterator) + cv = Adapt.adapt_structure(to, iter.v) + cell_coords = Adapt.adapt_structure(to, iter.cell_coords) + return Ferrite.QuadratureValuesIterator(cv, cell_coords) +end + +function Adapt.adapt_structure(to, qv::Ferrite.StaticQuadratureValues) + det = Adapt.adapt_structure(to, qv.detJdV) + N = Adapt.adapt_structure(to, qv.N) + dNdx = Adapt.adapt_structure(to, qv.dNdx) + M = Adapt.adapt_structure(to, qv.M) + return Ferrite.StaticQuadratureValues(det, N, dNdx, M) +end +function Adapt.adapt_structure(to, qv::StaticQuadratureView) + mapping = Adapt.adapt_structure(to, qv.mapping) + cell_coords = Adapt.adapt_structure(to, qv.cell_coords |> cu) + q_point = Adapt.adapt_structure(to, qv.q_point) + cv = Adapt.adapt_structure(to, qv.cv) + return StaticQuadratureView(mapping, cell_coords, q_point, cv) +end + +function Adapt.adapt_structure(to, grid::Grid) + # map Int64 to Int32 to reduce number of registers + cu_cells = grid.cells .|> (x -> Int32.(x.nodes)) .|> Quadrilateral |> cu + cells = Adapt.adapt_structure(to, cu_cells) + nodes = Adapt.adapt_structure(to, cu(grid.nodes)) + return GPUGrid(cells, nodes) +end + + +function Adapt.adapt_structure(to, iterator::CUDACellIterator) + grid = Adapt.adapt_structure(to, iterator.grid) + dh = Adapt.adapt_structure(to, iterator.dh) + ncells = Adapt.adapt_structure(to, iterator.n_cells) + return GPUCellIterator(dh, grid, ncells) +end + + +function _get_ndofs_cell(dh::DofHandler) + ndofs_cell = [Int32(Ferrite.ndofs_per_cell(dh, i)) for i in 1:(dh |> Ferrite.get_grid |> Ferrite.getncells)] + return ndofs_cell +end + +_symbols_to_int32(symbols) = 1:length(symbols) .|> (sym -> convert(Int32, sym)) + +function Adapt.adapt_structure(to, sdh::SubDofHandler) + cellset = Adapt.adapt_structure(to, sdh.cellset |> collect .|> (x -> convert(Int32, x)) |> cu) + field_names = Adapt.adapt_structure(to, _symbols_to_int32(sdh.field_names) |> cu) + field_interpolations = sdh.field_interpolations .|> (ip -> Adapt.adapt_structure(to, ip)) |> cu + ndofs_per_cell = Adapt.adapt_structure(to, sdh.ndofs_per_cell) + return GPUSubDofHandler(cellset, field_names, field_interpolations, ndofs_per_cell) +end + +function Adapt.adapt_structure(to, dh::DofHandler) + subdofhandlers = dh.subdofhandlers .|> (sdh -> Adapt.adapt_structure(to, sdh)) |> cu + cell_dofs = Adapt.adapt_structure(to, dh.cell_dofs .|> (x -> convert(Int32, x)) |> cu) + cells = Adapt.adapt_structure(to, dh.grid.cells |> cu) + offsets = Adapt.adapt_structure(to, dh.cell_dofs_offset .|> Int32 |> cu) + nodes = Adapt.adapt_structure(to, dh.grid.nodes |> cu) + #ndofs_cell = Adapt.adapt_structure(to, _get_ndofs_cell(dh) |> cu) + 
cell_to_subdofhandler = Adapt.adapt_structure(to, dh.cell_to_subdofhandler .|> (x -> convert(Int32, x)) |> cu) + return GPUDofHandler(subdofhandlers, cell_dofs, GPUGrid(cells, nodes), offsets, cell_to_subdofhandler) +end + +## TODO: remove this...not needed +function Adapt.adapt_structure(to, assembler::GPUAssemblerSparsityPattern) + K = Adapt.adapt_structure(to, assembler.K) + f = Adapt.adapt_structure(to, assembler.f) + return Ferrite.GPUAssemblerSparsityPattern(K, f) +end diff --git a/ext/GPU/cuda_iterator.jl b/ext/GPU/cuda_iterator.jl new file mode 100644 index 0000000000..9f193bfedc --- /dev/null +++ b/ext/GPU/cuda_iterator.jl @@ -0,0 +1,188 @@ +##### GPUCellIterator ##### + +""" + AbstractCUDACellIterator <: Ferrite.AbstractKernelCellIterator + +Abstract type representing CUDA cell iterators for finite element computations +on the GPU. It provides the base for implementing multiple cell iteration strategies (e.g. with shared memory, with global memory). +""" +abstract type AbstractCUDACellIterator <: Ferrite.AbstractKernelCellIterator end + + +ncells(iterator::AbstractCUDACellIterator) = iterator.n_cells ## any subtype has to have `n_cells` field + + +struct CUDACellIterator{DH <: Ferrite.GPUDofHandler, GRID <: Ferrite.AbstractGPUGrid, Ti <: Integer, MatrixType, VectorType} <: AbstractCUDACellIterator + dh::DH + grid::GRID + n_cells::Ti + cell_ke::MatrixType + cell_fe::VectorType +end + +struct CudaOutOfBoundCellIterator <: AbstractCUDACellIterator end # used to handle the case for out of bound threads (global memory only) + + +function Ferrite.CellIterator(dh::Ferrite.GPUDofHandler, buffer_alloc::GlobalMemAlloc) + grid = get_grid(dh) + n_cells = grid |> getncells |> Int32 + bd = blockDim().x + local_thread_id = threadIdx().x + global_thread_id = (blockIdx().x - Int32(1)) * bd + local_thread_id + global_thread_id <= n_cells || return CudaOutOfBoundCellIterator() + cell_ke = cellke(buffer_alloc, global_thread_id) + cell_fe = cellfe(buffer_alloc, global_thread_id) + return CUDACellIterator(dh, grid, n_cells, cell_ke, cell_fe) +end + + +function Ferrite.CellIterator(dh::Ferrite.GPUDofHandler, buffer_alloc::SharedMemAlloc) + grid = get_grid(dh) + n_cells = grid |> getncells |> Int32 + block_ke = buffer_alloc.Ke() + block_fe = buffer_alloc.fe() + local_thread_id = threadIdx().x + cell_ke = @view block_ke[local_thread_id, :, :] + cell_fe = @view block_fe[local_thread_id, :] + return CUDACellIterator(dh, grid, n_cells, cell_ke, cell_fe) +end + + +function Base.iterate(iterator::CUDACellIterator) + i = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x + i <= ncells(iterator) || return nothing + return (_makecache(iterator, i), i) +end + + +function Base.iterate(iterator::CUDACellIterator, state) + stride = blockDim().x * gridDim().x + i = state + stride + i <= ncells(iterator) || return nothing + return (_makecache(iterator, i), i) +end + + +Base.iterate(::CudaOutOfBoundCellIterator) = nothing + +Base.iterate(::CudaOutOfBoundCellIterator, state) = nothing # I believe this is not necessary + + +struct GPUCellCache{Ti <: Integer, DOFS <: AbstractVector{Ti}, NN, NODES <: SVector{NN, Ti}, X, COORDS <: SVector{X}, MatrixType, VectorType} <: Ferrite.AbstractKernelCellCache + coords::COORDS + dofs::DOFS + cellid::Ti + nodes::NODES + ke::MatrixType + fe::VectorType +end + + +function _makecache(iterator::AbstractCUDACellIterator, e::Integer) + dh = iterator.dh + grid = iterator.grid + cellid = e + cell = Ferrite.getcells(grid, e) + + # Extract the node IDs of the cell. 
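+    # The IDs are converted to Int32 below, matching the other GPU index arrays and keeping device register usage low.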
+ nodes = SVector(convert.(Int32, Ferrite.get_node_ids(cell))...) + + # Extract the degrees of freedom for the cell. + dofs = Ferrite.celldofs(dh, e) + + # Get the coordinates of the nodes of the cell. + CT = Ferrite.get_coordinate_type(grid) + N = Ferrite.nnodes(cell) + x = MVector{N, CT}(undef) + for i in eachindex(x) + x[i] = Ferrite.get_node_coordinate(grid, nodes[i]) + end + coords = SVector(x...) + + # Return the GPUCellCache containing the cell's data. + return GPUCellCache(coords, dofs, cellid, nodes, iterator.cell_ke, iterator.cell_fe) +end + +""" + getnodes(cc::GPUCellCache) + +Return the node IDs associated with the current cell in the cache. + +# Arguments +- `cc`: The `GPUCellCache` object. + +# Returns +The node IDs of the current cell. +""" +Ferrite.getnodes(cc::GPUCellCache) = cc.nodes + +""" + getcoordinates(cc::GPUCellCache) + +Return the coordinates of the current cell's nodes. + +# Arguments +- `cc`: The `GPUCellCache` object. + +# Returns +The coordinates of the nodes of the current cell. +""" +Ferrite.getcoordinates(cc::GPUCellCache) = cc.coords + +""" + celldofs(cc::GPUCellCache) + +Return the degrees of freedom (DoFs) for the current cell from the cache. + +# Arguments +- `cc`: The `GPUCellCache` object. + +# Returns +The degrees of freedom (DoFs) associated with the current cell. +""" +Ferrite.celldofs(cc::GPUCellCache) = cc.dofs + +""" + cellid(cc::GPUCellCache) + +Return the ID of the current cell stored in the cache. + +# Arguments +- `cc`: The `GPUCellCache` object. + +# Returns +The ID of the current cell. +""" +Ferrite.cellid(cc::GPUCellCache) = cc.cellid + +""" + Ferrite.cellke(cc::GPUCellCache) + +Access the stiffness matrix (`ke`) of the current cell from shared memory and reset it to zero. + +# Arguments +- `cc`: The `GPUCellCache` object. + +# Returns +The stiffness matrix filled with zeros. +""" +@inline function Ferrite.cellke(cc::GPUCellCache) + ke = cc.ke + return CUDA.fill!(ke, 0.0f0) +end + +""" + Ferrite.cellfe(cc::GPUCellCache) + +Access the force vector (`fe`) of the current cell from shared memory and reset it to zero. + +# Arguments +- `cc`: The `GPUCellCache` object. + +# Returns +The force vector filled with zeros. +""" +@inline function Ferrite.cellfe(cc::GPUCellCache) + fe = cc.fe + return CUDA.fill!(fe, 0.0f0) +end diff --git a/ext/GPU/cuda_mem_alloc.jl b/ext/GPU/cuda_mem_alloc.jl new file mode 100644 index 0000000000..dc30c2040f --- /dev/null +++ b/ext/GPU/cuda_mem_alloc.jl @@ -0,0 +1,53 @@ +struct DynamicSharedMemFunction{N, Tv <: Real, Ti <: Integer} + mem_size::NTuple{N, Ti} + offset::Ti +end + + +function (dsf::DynamicSharedMemFunction{N, Tv, Ti})() where {N, Tv, Ti} + mem_size = dsf.mem_size + offset = dsf.offset + return @cuDynamicSharedMem(Tv, mem_size, offset) +end + +abstract type AbstractCudaMemAlloc <: AbstractMemAlloc end + +struct SharedMemAlloc{N, M, Tv <: Real, Ti <: Integer} <: AbstractCudaMemAlloc + Ke::DynamicSharedMemFunction{N, Tv, Ti} ## block level allocation (i.e. each block will execute this function) + fe::DynamicSharedMemFunction{M, Tv, Ti} ## block level allocation (i.e. 
each block will execute this function) + tot_mem_size::Ti +end + +mem_size(alloc::SharedMemAlloc) = alloc.tot_mem_size + + +function _can_use_dynshmem(required_shmem::Integer) + dev = device() + MAX_DYN_SHMEM = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK) + return required_shmem < MAX_DYN_SHMEM +end + + +function try_allocate_shared_mem(::Type{Tv}, block_dim::Ti, n_basefuncs::Ti) where {Ti <: Integer, Tv <: Real} + shared_mem = convert(Ti, sizeof(Tv) * (block_dim) * (n_basefuncs) * n_basefuncs + sizeof(Tv) * (block_dim) * n_basefuncs) + _can_use_dynshmem(shared_mem) || return nothing + Ke = DynamicSharedMemFunction{3, Tv, Ti}((block_dim, n_basefuncs, n_basefuncs), convert(Ti, 0)) + fe = DynamicSharedMemFunction{2, Tv, Ti}((block_dim, n_basefuncs), convert(Ti, sizeof(Tv) * block_dim * n_basefuncs * n_basefuncs)) + return SharedMemAlloc(Ke, fe, shared_mem) +end + + +struct GlobalMemAlloc{LOCAL_MATRICES, LOCAL_VECTORS} <: AbstractCudaMemAlloc + Kes::LOCAL_MATRICES ## global level allocation (i.e. memory for all blocks -> 3rd order tensor) + fes::LOCAL_VECTORS ## global level allocation (i.e. memory for all blocks -> 2nd order tensor) +end + +cellke(alloc::GlobalMemAlloc, e::Ti) where {Ti <: Integer} = @view alloc.Kes[e, :, :] +cellfe(alloc::GlobalMemAlloc, e::Ti) where {Ti <: Integer} = @view alloc.fes[e, :] + + +function allocate_global_mem(::Type{Tv}, n_cells::Ti, n_basefuncs::Ti) where {Ti <: Integer, Tv <: Real} + Kes = CUDA.zeros(Tv, n_cells, n_basefuncs, n_basefuncs) + fes = CUDA.zeros(Tv, n_cells, n_basefuncs) + return GlobalMemAlloc(Kes, fes) +end diff --git a/ext/GPU/cuda_sparsity_pattern.jl b/ext/GPU/cuda_sparsity_pattern.jl new file mode 100644 index 0000000000..d397d96688 --- /dev/null +++ b/ext/GPU/cuda_sparsity_pattern.jl @@ -0,0 +1,6 @@ +function Ferrite.allocate_matrix(::Type{S}, dh::DofHandler, args...; kwargs...) where {Tv, Ti <: Integer, S <: CUSPARSE.CuSparseMatrixCSC{Tv, Ti}} + ## TODO: decide whether create the matrix from the very beginning or just create cpu version then copy + K = allocate_matrix(SparseMatrixCSC{Tv, Ti}, dh, args...; kwargs...) + Kgpu = CUSPARSE.CuSparseMatrixCSC(K) + return Kgpu +end diff --git a/ext/GPU/gpu_assembler.jl b/ext/GPU/gpu_assembler.jl new file mode 100644 index 0000000000..0fde157992 --- /dev/null +++ b/ext/GPU/gpu_assembler.jl @@ -0,0 +1,77 @@ +## GPU Assembler ### +### First abstract types and interfaces ### + +abstract type AbstractGPUSparseAssembler{Tv, Ti} end + +function Ferrite.assemble!(A::AbstractGPUSparseAssembler, dofs::AbstractVector{Int32}, Ke::MATRIX, fe::VECTOR) where {MATRIX, VECTOR} + throw(NotImplementedError("A concrete implementation of assemble! is required")) +end + + +struct GPUAssemblerSparsityPattern{Tv, Ti, VEC_FLOAT <: AbstractVector{Tv}, SPARSE_MAT <: AbstractSparseArray{Tv, Ti}} <: AbstractGPUSparseAssembler{Tv, Ti} + K::SPARSE_MAT + f::VEC_FLOAT +end + +function Ferrite.start_assemble(K::CUSPARSE.CuSparseDeviceMatrixCSC{Tv, Ti}, f::CuDeviceVector{Tv}; fillzero::Bool = false) where {Tv, Ti} + ## `fillzero` is not required in GPU version and should be always set to false, because different threads will + ## instantiate their own instance of assembler, so it it's set to true, the results will be wrong. + ## However, the interface is kept the same for consistency with the CPU version (in CPU multithreading we use the standard assembler). 
+ fillzero && (fillzero!(K); fillzero!(f)) + return GPUAssemblerSparsityPattern(K, f) +end + + +""" + assemble!(A::GPUAssemblerSparsityPattern, dofs::AbstractVector{Int32}, Ke::MATRIX, fe::VECTOR) + +Assembles the global stiffness matrix `Ke` and the global force vector `fe` into the the global stiffness matrix `K` and the global force vector `f` of the `GPUAssemblerSparsityPattern` object `A`. + +""" +@propagate_inbounds function Ferrite.assemble!(A::GPUAssemblerSparsityPattern, dofs::AbstractVector{Int32}, Ke::MATRIX, fe::VECTOR) where {MATRIX, VECTOR} + # Note: MATRIX and VECTOR are cuda dynamic shared memory + return _assemble!(A, dofs, Ke, fe) +end + + +function _assemble!(A::GPUAssemblerSparsityPattern, dofs::AbstractVector{Int32}, Ke::MATRIX, fe::VECTOR) where {MATRIX, VECTOR} + # # Brute force assembly + K = A.K + f = A.f + for i in 1:length(dofs) + ig = dofs[i] + CUDA.@atomic f[ig] += fe[i] + for j in 1:length(dofs) + jg = dofs[j] + # set the value of the global matrix + _add_to_index!(K, Ke[i, j], ig, jg) + end + end + return +end + +@inline function _add_to_index!(K::AbstractSparseArray{Tv, Ti}, v::Tv, i::Int32, j::Int32) where {Tv, Ti} + col_start = K.colPtr[j] + col_end = K.colPtr[j + Int32(1)] - Int32(1) + + for k in col_start:col_end + if K.rowVal[k] == i + # Update the existing element + CUDA.@atomic K.nzVal[k] += v + return + end + end + return +end + + +### Array utils ### +function fillzero!(A::CuDeviceVector{Tv}) where {Tv} + CUDA.fill!(A, zero(Tv)) + return A +end + +function fillzero!(A::CUSPARSE.CuSparseDeviceMatrixCSC{Tv, Ti}) where {Tv, Ti} + CUDA.fill!(A.nzVal, zero(Tv)) + return A +end diff --git a/heatflow_qp_values.jl b/heatflow_qp_values.jl new file mode 100644 index 0000000000..e6de08232f --- /dev/null +++ b/heatflow_qp_values.jl @@ -0,0 +1,177 @@ +using Ferrite + +# Standard element routine +function assemble_element_std!(Ke::Matrix, fe::Vector, cellvalues::CellValues) + n_basefuncs = getnbasefunctions(cellvalues) + ## Loop over quadrature points + for q_point in 1:getnquadpoints(cellvalues) + ## Get the quadrature weight + dΩ = getdetJdV(cellvalues, q_point) + ## Loop over test shape functions + for i in 1:n_basefuncs + δu = shape_value(cellvalues, q_point, i) + ∇δu = shape_gradient(cellvalues, q_point, i) + ## Add contribution to fe + fe[i] += δu * dΩ + ## Loop over trial shape functions + for j in 1:n_basefuncs + ∇u = shape_gradient(cellvalues, q_point, j) + ## Add contribution to Ke + Ke[i, j] += (∇δu ⋅ ∇u) * dΩ + end + end + end + return Ke, fe +end + +# Element routine using QuadratureValuesIterator +function assemble_element_qpiter!(Ke::Matrix, fe::Vector, cellvalues) + n_basefuncs = getnbasefunctions(cellvalues) + ## Loop over quadrature points + for qv in Ferrite.QuadratureValuesIterator(cellvalues) + ## Get the quadrature weight + dΩ = getdetJdV(qv) + ## Loop over test shape functions + for i in 1:n_basefuncs + δu = shape_value(qv, i) + ∇δu = shape_gradient(qv, i) + ## Add contribution to fe + fe[i] += δu * dΩ + ## Loop over trial shape functions + for j in 1:n_basefuncs + ∇u = shape_gradient(qv, j) + ## Add contribution to Ke + Ke[i, j] += (∇δu ⋅ ∇u) * dΩ + end + end + end + return Ke, fe +end + +function assemble_element_qpiter!(Ke::Matrix, fe::Vector, cellvalues, cell_coords::AbstractVector) + n_basefuncs = getnbasefunctions(cellvalues) + ## Loop over quadrature points + for qv in Ferrite.QuadratureValuesIterator(cellvalues, cell_coords) + ## Get the quadrature weight + dΩ = getdetJdV(qv) + ## Loop over test shape functions + for i in 
1:n_basefuncs + δu = shape_value(qv, i) + ∇δu = shape_gradient(qv, i) + ## Add contribution to fe + fe[i] += δu * dΩ + ## Loop over trial shape functions + for j in 1:n_basefuncs + ∇u = shape_gradient(qv, j) + ## Add contribution to Ke + Ke[i, j] += (∇δu ⋅ ∇u) * dΩ + end + end + end + return Ke, fe +end + +function assemble_global(cellvalues, dh; kwargs...) + return assemble_global!(create_buffers(cellvalues, dh), cellvalues, dh; kwargs...) +end + +function assemble_global!(buffer, cellvalues, dh::DofHandler; qp_iter::Val{QPiter}, reinit::Val{ReInit}) where {QPiter, ReInit} + (; f, K, assembler, Ke, fe) = buffer + for cell in CellIterator(dh) + fill!(Ke, 0) + fill!(fe, 0) + if QPiter + if ReInit + reinit!(cellvalues, getcoordinates(cell)) + assemble_element_qpiter!(Ke, fe, cellvalues) + else + assemble_element_qpiter!(Ke, fe, cellvalues, getcoordinates(cell)) + end + else + reinit!(cellvalues, getcoordinates(cell)) + assemble_element_std!(Ke, fe, cellvalues) + end + assemble!(assembler, celldofs(cell), Ke, fe) + end + return K, f +end + +function create_buffers(cellvalues, dh) + f = zeros(ndofs(dh)) + K = create_sparsity_pattern(dh) + assembler = start_assemble(K, f) + ## Local quantities + n_basefuncs = getnbasefunctions(cellvalues) + Ke = zeros(n_basefuncs, n_basefuncs) + fe = zeros(n_basefuncs) + return (; f, K, assembler, Ke, fe) +end + +n = 50 +grid = generate_grid(Quadrilateral, (n, n)); +ip = Lagrange{RefQuadrilateral, 1}() +qr = QuadratureRule{RefQuadrilateral}(2) + +dh = DofHandler(grid) + +add!(dh, :u, ip) +close!(dh); + +cellvalues = CellValues(qr, ip); + +static_cellvalues = Ferrite.StaticCellValues(cellvalues) + +stdassy(buffer, cv, dh) = assemble_global!(buffer, cv, dh; qp_iter = Val(false), reinit = Val(false)) +qp_outside(buffer, cv, dh) = assemble_global!(buffer, cv, dh; qp_iter = Val(true), reinit = Val(true)) +qp_inside(buffer, cv, dh) = assemble_global!(buffer, cv, dh; qp_iter = Val(true), reinit = Val(false)) + +Kstd, fstd = stdassy(create_buffers(cellvalues, dh), cellvalues, dh); +using LinearAlgebra +norm(Kstd) +K_qp_o, f_qp_o = qp_outside(create_buffers(cellvalues, dh), cellvalues, dh); +norm(K_qp_o) +K_qp_i, f_qp_i = qp_inside(create_buffers(cellvalues, dh), cellvalues, dh); +norm(K_qp_i) +cvs_o = Ferrite.StaticCellValues(cellvalues, Val(true)) # Save cell_coords in cvs_o +Ks_o, fs_o = qp_outside(create_buffers(cvs_o, dh), cvs_o, dh); +norm(Ks_o) +cvs_i = Ferrite.StaticCellValues(cellvalues, Val(false)) # Don't save cell_coords in cvs_o +Ks_i, fs_i = qp_inside(create_buffers(cvs_i, dh), cvs_i, dh); +norm(Ks_i) +using Test +@testset "check outputs" begin + for (k, K, f) in (("qpo", K_qp_o, f_qp_o), ("qpi", K_qp_i, f_qp_i), ("so", Ks_o, fs_o), ("si", Ks_i, fs_i)) + @testset "$k" begin + @test K ≈ Kstd + @test f ≈ fstd + end + end +end + +# Benchmarking +using BenchmarkTools +if n ≤ 100 + print("Standard: ") + @btime stdassy(buffer, $cellvalues, $dh) setup = (buffer = create_buffers(cellvalues, dh)) + print("Std qpoint outside: ") + @btime qp_outside(buffer, $cellvalues, $dh) setup = (buffer = create_buffers(cellvalues, dh)) + print("Std qpoint inside: ") + @btime qp_inside(buffer, $cellvalues, $dh) setup = (buffer = create_buffers(cellvalues, dh)) + print("Static outside: ") + @btime qp_outside(buffer, $cvs_o, $dh) setup = (buffer = create_buffers(cvs_o, dh)) + print("Static inside: ") + @btime qp_inside(buffer, $cvs_i, $dh) setup = (buffer = create_buffers(cvs_i, dh)) +else + buffer = create_buffers(cellvalues, dh) + print("Standard: ") + @time stdassy(buffer, 
cellvalues, dh) + print("Std qpoint outside: ") + @time qp_outside(buffer, cellvalues, dh) + print("Std qpoint inside: ") + @time qp_inside(buffer, cellvalues, dh) + print("Static outside: ") + @time qp_outside(buffer, cvs_o, dh) + print("Static inside: ") + @time qp_inside(buffer, cvs_i, dh) +end +nothing diff --git a/src/FEValues/QuadratureValues.jl b/src/FEValues/QuadratureValues.jl new file mode 100644 index 0000000000..888dc3f3d0 --- /dev/null +++ b/src/FEValues/QuadratureValues.jl @@ -0,0 +1,141 @@ +# QuadratureValuesIterator +struct QuadratureValuesIterator{VT, XT} + v::VT + cell_coords::XT # Union{AbstractArray{<:Vec}, Nothing} + function QuadratureValuesIterator(v::V) where {V} + return new{V, Nothing}(v, nothing) + end + function QuadratureValuesIterator(v::V, cell_coords::VT) where {V, VT <: AbstractArray} + #reinit!(v, cell_coords) # Why we need that ? + return new{V, VT}(v, cell_coords) + end +end + +function Base.iterate(iterator::QuadratureValuesIterator{<:Any, Nothing}, q_point = 1) + checkbounds(Bool, 1:getnquadpoints(iterator.v), q_point) || return nothing + qp_v = @inbounds quadrature_point_values(iterator.v, q_point) + return (qp_v, q_point + 1) +end + +function Base.iterate(iterator::QuadratureValuesIterator{<:Any, <:AbstractVector}, q_point = 1) + checkbounds(Bool, 1:getnquadpoints(iterator.v), q_point) || return nothing + #q_point < 5 || return nothing + qp_v = @inbounds quadrature_point_values(iterator.v, q_point, iterator.cell_coords) + return (qp_v, q_point + 1) + #return (1, q_point+1) +end +Base.IteratorEltype(::Type{<:QuadratureValuesIterator}) = Base.EltypeUnknown() +Base.length(iterator::QuadratureValuesIterator) = getnquadpoints(iterator.v) + +# AbstractQuadratureValues +abstract type AbstractQuadratureValues end + +function function_value(qp_v::AbstractQuadratureValues, u::AbstractVector, dof_range = eachindex(u)) + n_base_funcs = getnbasefunctions(qp_v) + length(dof_range) == n_base_funcs || throw_incompatible_dof_length(length(dof_range), n_base_funcs) + @boundscheck checkbounds(u, dof_range) + val = function_value_init(qp_v, u) + @inbounds for (i, j) in pairs(dof_range) + val += shape_value(qp_v, i) * u[j] + end + return val +end + +function function_gradient(qp_v::AbstractQuadratureValues, u::AbstractVector, dof_range = eachindex(u)) + n_base_funcs = getnbasefunctions(qp_v) + length(dof_range) == n_base_funcs || throw_incompatible_dof_length(length(dof_range), n_base_funcs) + @boundscheck checkbounds(u, dof_range) + grad = function_gradient_init(qp_v, u) + @inbounds for (i, j) in pairs(dof_range) + grad += shape_gradient(qp_v, i) * u[j] + end + return grad +end + +function function_symmetric_gradient(qp_v::AbstractQuadratureValues, u::AbstractVector, dof_range) + grad = function_gradient(qp_v, u, dof_range) + return symmetric(grad) +end + +function function_symmetric_gradient(qp_v::AbstractQuadratureValues, u::AbstractVector) + grad = function_gradient(qp_v, u) + return symmetric(grad) +end + +function function_divergence(qp_v::AbstractQuadratureValues, u::AbstractVector, dof_range = eachindex(u)) + return divergence_from_gradient(function_gradient(qp_v, u, dof_range)) +end + +function function_curl(qp_v::AbstractQuadratureValues, u::AbstractVector, dof_range = eachindex(u)) + return curl_from_gradient(function_gradient(qp_v, u, dof_range)) +end + +function spatial_coordinate(qp_v::AbstractQuadratureValues, x::AbstractVector{<:Vec}) + n_base_funcs = getngeobasefunctions(qp_v) + length(x) == n_base_funcs || throw_incompatible_coord_length(length(x), 
n_base_funcs) + vec = zero(eltype(x)) + @inbounds for i in 1:n_base_funcs + vec += geometric_value(qp_v, i) * x[i] + end + return vec +end + +# Specific design for QuadratureValues <: AbstractQuadratureValues +# which contains standard AbstractValues +struct QuadratureValues{VT <: AbstractValues} <: AbstractQuadratureValues + v::VT + q_point::Int + Base.@propagate_inbounds function QuadratureValues(v::AbstractValues, q_point::Int) + @boundscheck checkbounds(1:getnbasefunctions(v), q_point) + return new{typeof(v)}(v, q_point) + end +end + +@inline quadrature_point_values(fe_v::AbstractValues, q_point, args...) = QuadratureValues(fe_v, q_point) + +@propagate_inbounds getngeobasefunctions(qv::QuadratureValues) = getngeobasefunctions(qv.v) +@propagate_inbounds geometric_value(qv::QuadratureValues, i) = geometric_value(qv.v, qv.q_point, i) +geometric_interpolation(qv::QuadratureValues) = geometric_interpolation(qv.v) + +getdetJdV(qv::QuadratureValues) = @inbounds getdetJdV(qv.v, qv.q_point) + +# Accessors for function values +getnbasefunctions(qv::QuadratureValues) = getnbasefunctions(qv.v) +function_interpolation(qv::QuadratureValues) = function_interpolation(qv.v) +function_difforder(qv::QuadratureValues) = function_difforder(qv.v) +shape_value_type(qv::QuadratureValues) = shape_value_type(qv.v) +shape_gradient_type(qv::QuadratureValues) = shape_gradient_type(qv.v) + +@propagate_inbounds shape_value(qv::QuadratureValues, i::Int) = shape_value(qv.v, qv.q_point, i) +@propagate_inbounds shape_gradient(qv::QuadratureValues, i::Int) = shape_gradient(qv.v, qv.q_point, i) +@propagate_inbounds shape_symmetric_gradient(qv::QuadratureValues, i::Int) = shape_symmetric_gradient(qv.v, qv.q_point, i) + + +#= Proposed syntax, for heatflow in general +function assemble_element!(Ke::Matrix, fe::Vector, cellvalues) + n_basefuncs = getnbasefunctions(cellvalues) + for qv in Ferrite.QuadratureValuesIterator(cellvalues) + dΩ = getdetJdV(qv) + for i in 1:n_basefuncs + δu = shape_value(qv, i) + ∇δu = shape_gradient(qv, i) + fe[i] += δu * dΩ + for j in 1:n_basefuncs + ∇u = shape_gradient(qv, j) + Ke[i, j] += (∇δu ⋅ ∇u) * dΩ + end + end + end + return Ke, fe +end + +Where the default for a QuadratureValuesIterator would be to return a +`QuadratureValues` as above, but custom `AbstractValues` can be created where +for example the element type would be a static QuadPointValue type which doesn't +use heap allocated buffers, e.g. by only saving the cell and coordinates during reinit, +and then calculating all values for each element in the iterator. 
+ +References: +https://github.com/termi-official/Thunderbolt.jl/pull/53/files#diff-2b486be5a947c02ef2a38ff3f82af3141193af0b6f01ed9d5129b914ed1d84f6 +https://github.com/Ferrite-FEM/Ferrite.jl/compare/master...kam/StaticValues2 +=# diff --git a/src/FEValues/StaticCellValues.jl b/src/FEValues/StaticCellValues.jl new file mode 100644 index 0000000000..5ce13996d3 --- /dev/null +++ b/src/FEValues/StaticCellValues.jl @@ -0,0 +1,165 @@ +struct StaticQuadratureValues{T, N_t, dNdx_t, M_t, NumN, NumM} <: AbstractQuadratureValues + detJdV::T + N::SVector{NumN, N_t} + dNdx::SVector{NumN, dNdx_t} + M::SVector{NumM, M_t} +end + +@propagate_inbounds getngeobasefunctions(qv::StaticQuadratureValues) = length(qv.M) +@propagate_inbounds geometric_value(qv::StaticQuadratureValues, i) = qv.M[i] +# geometric_interpolation(qv::StaticQuadratureValues) = geometric_interpolation(qv.v) # Not included + +getdetJdV(qv::StaticQuadratureValues) = qv.detJdV + +# Accessors for function values +getnbasefunctions(qv::StaticQuadratureValues) = length(qv.N) +# function_interpolation(qv::StaticQuadratureValues) = function_interpolation(qv.v) # Not included +shape_value_type(::StaticQuadratureValues{<:Any, N_t}) where {N_t} = N_t +shape_gradient_type(::StaticQuadratureValues{<:Any, <:Any, dNdx_t}) where {dNdx_t} = dNdx_t + +@propagate_inbounds shape_value(qv::StaticQuadratureValues, i::Int) = qv.N[i] +@propagate_inbounds shape_gradient(qv::StaticQuadratureValues, i::Int) = qv.dNdx[i] +@propagate_inbounds shape_gradient(qv::StaticQuadratureValues, i::Int32) = qv.dNdx[i] # Needed for GPU (threadIdx is Int32), otherwise it will throw a dynamic function invokation error +@propagate_inbounds shape_symmetric_gradient(qv::StaticQuadratureValues, i::Int) = symmetric(qv.dNdx[i]) + +@propagate_inbounds geometric_value(qv::StaticQuadratureValues, i::Int) = qv.M[i] + +# StaticInterpolationValues: interpolation and precalculated values for all quadrature points +# Can be both for function and geometric shape functions. +# DiffOrder parameter? +# TODO: Could perhaps denote this just InterpolationValues and replace GeometryMapping +# Just need to make Nξ::AbstractMatrix instead as in GeometryMapping to make it equivalent (except fieldnames) +struct StaticInterpolationValues{IP, N, Nqp, N_et, dNdξ_t, Nall} + ip::IP + Nξ::SMatrix{N, Nqp, N_et, Nall} + dNdξ::dNdξ_t # Union{SMatrix{N, Nqp}, Nothing} + #dN2dξ2::dN2dξ2_t # Union{SMatrix{N, Nqp}, Nothing} +end +function StaticInterpolationValues(fv::FunctionValues) + N = getnbasefunctions(fv.ip) + Nq = size(fv.Nξ, 2) + Nξ = SMatrix{N, Nq}(fv.Nξ) + dNdξ = SMatrix{N, Nq}(fv.dNdξ) + return StaticInterpolationValues(fv.ip, Nξ, dNdξ) +end +function StaticInterpolationValues(gm::GeometryMapping) + N = getnbasefunctions(gm.ip) + Nq = size(gm.M, 2) + M = SMatrix{N, Nq}(gm.M) + dMdξ = SMatrix{N, Nq}(gm.dMdξ) + return StaticInterpolationValues(gm.ip, M, dMdξ) +end + +getnbasefunctions(siv::StaticInterpolationValues) = getnbasefunctions(siv.ip) + +# Dispatch on DiffOrder parameter? 
+# Reuse functions for GeometryMapping - same signature but need access functions
+# Or merge GeometryMapping and StaticInterpolationValues => InterpolationValues
+@propagate_inbounds @inline function calculate_mapping(ip_values::StaticInterpolationValues{<:Any, N}, q_point, x) where {N}
+    fecv_J = zero(otimes_returntype(eltype(x), eltype(ip_values.dNdξ)))
+    @inbounds for j in 1:N
+        #fecv_J += x[j] ⊗ geo_mapping.dMdξ[j, q_point]
+        fecv_J += otimes_helper(x[j], ip_values.dNdξ[j, q_point])
+    end
+    return MappingValues(fecv_J, nothing)
+end
+
+@propagate_inbounds @inline function calculate_mapped_values(funvals::StaticInterpolationValues, q_point, mapping_values, args...)
+    return calculate_mapped_values(funvals, mapping_type(funvals.ip), q_point, mapping_values, args...)
+end
+
+@propagate_inbounds @inline function calculate_mapped_values(funvals::StaticInterpolationValues, ::IdentityMapping, q_point, mapping_values, args...)
+    Jinv = calculate_Jinv(getjacobian(mapping_values))
+    Nx = funvals.Nξ[:, q_point]
+    dNdx = map(dNdξ -> dothelper(dNdξ, Jinv), funvals.dNdξ[:, q_point])
+    return Nx, dNdx
+end
+
+struct StaticCellValues{FV, GM, Nqp, T}
+    fv::FV # StaticInterpolationValues
+    gm::GM # StaticInterpolationValues
+    #x::Tx # AbstractVector{<:Vec} or Nothing
+    weights::NTuple{Nqp, T}
+end
+function StaticCellValues(cv::CellValues)
+    fv = StaticInterpolationValues(cv.fun_values)
+    gm = StaticInterpolationValues(cv.geo_mapping)
+    sdim = sdim_from_gradtype(shape_gradient_type(cv))
+    #x = SaveCoords ? fill(zero(Vec{sdim}), getngeobasefunctions(cv)) : nothing
+    weights = ntuple(i -> getweights(cv.qr)[i], getnquadpoints(cv))
+    return StaticCellValues(fv, gm, weights)
+end
+
+getnquadpoints(cv::StaticCellValues) = length(cv.weights)
+getnbasefunctions(cv::StaticCellValues) = getnbasefunctions(cv.fv)
+getngeobasefunctions(cv::StaticCellValues) = getnbasefunctions(cv.gm)
+
+@inline function reinit!(cv::StaticCellValues{<:Any, <:Any, <:AbstractVector}, cell_coords::AbstractVector)
+    return copyto!(cv.x, cell_coords)
+    #TODO: Also allow the cell::AbstractCell to be given and updated
+end
+@inline function reinit!(::StaticCellValues{<:Any, <:Any, Nothing}, ::AbstractVector)
+    return nothing # Nothing to do on reinit if x is not saved.
+end + +@inline function quadrature_point_values(fe_v::StaticCellValues{<:Any, <:Any, <:AbstractVector}, q_point::Int) + return _quadrature_point_values(fe_v, q_point, fe_v.x, detJ -> throw_detJ_not_pos(detJ)) +end + +@inline function quadrature_point_values(fe_v::StaticCellValues{<:Any, <:Any}, q_point::Int, cell_coords::AbstractVector) + return _quadrature_point_values(fe_v, q_point, cell_coords, detJ -> throw_detJ_not_pos(detJ)) +end + +@inline function quadrature_point_values(fe_v::StaticCellValues{<:Any, <:Any}, q_point::Int, cell_coords::StaticVector) + return _quadrature_point_values(fe_v, q_point, cell_coords, detJ -> -1) +end + + +function _quadrature_point_values(fe_v::StaticCellValues, q_point::Int, cell_coords::AbstractVector, neg_detJ_err_fun::Function) + #q_point bounds checked, ok to use @inbounds + @inbounds begin + mapping = calculate_mapping(fe_v.gm, q_point, cell_coords) + + detJ = calculate_detJ(getjacobian(mapping)) + detJ > 0.0f0 || neg_detJ_err_fun(detJ) # Cannot throw error on GPU, TODO: return error code instead + detJdV = detJ * fe_v.weights[q_point] + + Nx, dNdx = calculate_mapped_values(fe_v.fv, q_point, mapping) + M = fe_v.gm.Nξ[:, q_point] + end + return StaticQuadratureValues(detJdV, Nx, dNdx, M) +end + + +## New Implementation for StaticQuadratureValues to suit the GPU code +struct StaticQuadratureView{JT, HT, N_Nodes, NODEVEC} <: AbstractQuadratureValues + mapping::MappingValues{JT, HT} + cell_coords::SVector{N_Nodes, NODEVEC} + q_point::Int32 + cv::StaticCellValues +end + + +@inline function quadrature_point_values(fe_v::StaticCellValues{<:Any, <:Any}, q_point::Int32, cell_coords::SVector) + @inbounds begin + mapping = calculate_mapping(fe_v.gm, q_point, cell_coords) + end + return StaticQuadratureView(mapping, cell_coords, q_point, fe_v) +end + + +@inline function getdetJdV(qv::StaticQuadratureView) + detJ = Float32(calculate_detJ(getjacobian(qv.mapping))) + detJ > 0.0f0 || -1.0f0 # Cannot throw error on GPU, TODO: return error code instead + detJdV = detJ * qv.cv.weights[qv.q_point] + return detJdV +end + +@inline function shape_gradient(qv::StaticQuadratureView, i::Int32) + @inbounds begin + Jinv = calculate_Jinv(getjacobian(qv.mapping)) + #Nx = qv.cv.fv.Nξ[i, q_point] + dNdx = dothelper(qv.cv.fv.dNdξ[i, qv.q_point], Jinv) + return dNdx + end +end diff --git a/src/Ferrite.jl b/src/Ferrite.jl index 7b358463be..1812851c1d 100644 --- a/src/Ferrite.jl +++ b/src/Ferrite.jl @@ -16,7 +16,7 @@ using OrderedCollections: using SparseArrays: SparseArrays, SparseMatrixCSC, nonzeros, nzrange, rowvals, AbstractSparseMatrixCSC using StaticArrays: - StaticArrays, MArray, MMatrix, SArray, SMatrix, SVector + StaticVector, StaticArrays, MVector, MArray, MMatrix, SArray, SMatrix, SVector, @SVector using WriteVTK: WriteVTK, VTKCellTypes using Tensors: @@ -130,6 +130,8 @@ include("FEValues/CellValues.jl") include("FEValues/FacetValues.jl") include("FEValues/InterfaceValues.jl") include("FEValues/PointValues.jl") +include("FEValues/QuadratureValues.jl") +include("FEValues/StaticCellValues.jl") include("FEValues/common_values.jl") include("FEValues/face_integrals.jl") @@ -166,4 +168,14 @@ include("PointEvalHandler.jl") include("deprecations.jl") include("docs.jl") +# GPU support +include("GPU/coloring_dof.jl") +include("GPU/KernelLauncher.jl") +include("GPU/CPUKernelLauncher.jl") +include("GPU/gpu_grid.jl") +include("GPU/GPUDofHandler.jl") +include("GPU/parallel_iterator.jl") +include("GPU/mem_alloc.jl") + + end # module diff --git a/src/GPU/CPUKernelLauncher.jl 
b/src/GPU/CPUKernelLauncher.jl
new file mode 100644
index 0000000000..1a3f92ad00
--- /dev/null
+++ b/src/GPU/CPUKernelLauncher.jl
@@ -0,0 +1,66 @@
+abstract type AbstractCPUKernel <: AbstractKernel{BackendCPU} end
+
+
+struct CPUKernel{Ti} <: AbstractCPUKernel
+    n_cells::Ti # Number of cells
+    n_basefuncs::Ti # Number of base functions
+    kernel::Function # Kernel function to execute
+    args::Tuple # Arguments for the kernel function
+    n_colors::Ti
+    dh::ColoringDofHandler
+end
+
+function init_kernel(::Type{BackendCPU}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
+    args, color_dh = _to_colordh(args) # convert the DofHandler to a ColoringDofHandler
+    no_colors = ncolors(color_dh)
+    return CPUKernel(n_cells, n_basefuncs, kernel, args, no_colors, color_dh)
+end
+
+function launch!(kernel::CPUKernel{Ti}) where {Ti}
+    ker = kernel.kernel
+    args = kernel.args
+    ## Naive implementation to circumvent the issue with cellvalues:
+    ## on the GPU we use the static version of CellValues because it is immutable,
+    ## so in order to unify the parallel kernel interface we (for now) search for any
+    ## CellValues passed in the args and convert it to the static version,
+    ## without changing the user routine.
+    cell_index = findfirst(x -> x isa CellValues, args)
+    (cell_index === nothing) || (args = _update_cell_args(args, cell_index))
+    color_dh = kernel.dh
+    no_colors = kernel.n_colors
+    nthreads = Threads.nthreads()
+    for i in 1:no_colors
+        current_color!(color_dh, i)
+        Threads.@threads :static for j in 1:nthreads
+            ker(args...)
+        end
+    end
+    return
+end
+
+(ker::CPUKernel)() = launch!(ker)
+
+function _to_colordh(args::Tuple)
+    dh_index = findfirst(x -> x isa AbstractDofHandler, args)
+    dh_index !== nothing || throw(ErrorException("No subtype of AbstractDofHandler found in the arguments"))
+    arr = args |> collect
+    color_dh = init_colordh(arr[dh_index])
+    arr[dh_index] = color_dh
+    return Tuple(arr), color_dh
+end
+
+function _update_cell_args(args::Tuple, index::Int)
+    ## since tuples are immutable we need to convert it to an array to update the values
+    ## and then convert it back to a tuple
+    arr = args |> collect
+    arr[index] = _to_static_cellvalues(arr[index])
+    return Tuple(arr)
+end
+
+
+function _to_static_cellvalues(cv::CellValues)
+    fv = StaticInterpolationValues(cv.fun_values)
+    gm = StaticInterpolationValues(cv.geo_mapping)
+    weights = ntuple(i -> getweights(cv.qr)[i], getnquadpoints(cv))
+    return StaticCellValues(fv, gm, weights)
+end
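For orientation, a minimal sketch of how this CPU path is meant to be driven (the kernel name and the `kes`/`fes` buffers are placeholders; the call mirrors the CUDA usage in the tests below and is not part of the diff):

    # User kernel: `dh` arrives as a ColoringDofHandler and `cellvalues` as a
    # StaticCellValues after `launch!` has rewritten the argument tuple.
    function my_kernel!(kes, fes, cellvalues, dh)
        for cell in CellIterator(dh, getnbasefunctions(cellvalues))
            # ... fill cellke(cell) / cellfe(cell) with the element routine ...
            kes[cellid(cell), :, :] .= cellke(cell)
            fes[cellid(cell), :] .= cellfe(cell)
        end
        return nothing
    end

    init_kernel(BackendCPU, ncells, n_basefuncs, my_kernel!, (kes, fes, cellvalues, dh)) |> launch!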
diff --git a/src/GPU/GPUDofHandler.jl b/src/GPU/GPUDofHandler.jl
new file mode 100644
index 0000000000..dbfe8d1217
--- /dev/null
+++ b/src/GPU/GPUDofHandler.jl
@@ -0,0 +1,83 @@
+# This file defines the `GPUDofHandler` type, which is a degree of freedom handler that is stored on the GPU.
+# Therefore most of the functions are the same as the ones defined in dof_handler.jl, but executable on the GPU.
+
+"""
+    AbstractGPUDofHandler <: Ferrite.AbstractDofHandler
+
+Abstract type representing degree-of-freedom (DoF) handlers for GPU-based
+finite element computations. This serves as the base type for GPU-specific
+DoF handler implementations.
+"""
+abstract type AbstractGPUDofHandler <: AbstractDofHandler end
+
+struct GPUSubDofHandler{VEC_INT, Ti, VEC_IP} <: AbstractGPUDofHandler
+    cellset::VEC_INT
+    field_names::VEC_INT # cannot use Symbols on the GPU
+    field_interpolations::VEC_IP
+    ndofs_per_cell::Ti
+end
+
+## IDEA: to have multiple interfaces for dofhandlers (e.g. one domain dofhandler, multiple subdomains)
+struct GPUDofHandler{SUB_DOFS <: AbstractArray{<:AbstractGPUDofHandler, 1}, CDOFS <: AbstractArray{<:Number, 1}, VEC_INT <: AbstractArray{Int32, 1}, GRID <: AbstractGrid} <: AbstractGPUDofHandler
+    subdofhandlers::SUB_DOFS
+    cell_dofs::CDOFS
+    grid::GRID
+    cell_dofs_offset::VEC_INT
+    cell_to_subdofhandler::VEC_INT
+end
+
+
+function ndofs_per_cell(dh::GPUDofHandler, cell::Ti) where {Ti <: Integer}
+    sdhidx = dh.cell_to_subdofhandler[cell]
+    sdhidx ∉ 1:length(dh.subdofhandlers) && return 0 # Dof handler is only defined on a subdomain
+    return ndofs_per_cell(dh.subdofhandlers[sdhidx])
+end
+ndofs_per_cell(sdh::GPUSubDofHandler) = sdh.ndofs_per_cell
+cell_dof_offset(dh::GPUDofHandler, i::Int32) = dh.cell_dofs_offset[i]
+get_grid(dh::GPUDofHandler) = dh.grid
+
+"""
+    celldofs(dh::GPUDofHandler, i::Int32)
+
+Return the cell degrees of freedom (DoFs) for the given cell index `i` in the
+`GPUDofHandler`.
+
+# Arguments
+- `dh`: A `GPUDofHandler` instance.
+- `i::Int32`: The index of the cell.
+
+# Returns
+A `SubArray` (view) representing the DoFs for the specified cell.
+"""
+function celldofs(dh::GPUDofHandler, i::Int32)
+    offset = cell_dof_offset(dh, i)
+    ndofs = ndofs_per_cell(dh, i)
+    view = @view dh.cell_dofs[offset:(offset + ndofs - Int32(1))]
+    return view
+end
+
+
+# TODO: Delete all below this line
+"""
+    LocalsGPUDofHandler{DH, LOCAL_MATRICES, LOCAL_VECTORS} <: AbstractGPUDofHandler
+
+This object acts as a temporary data structure for storing local stiffness matrices and force vectors
+when dynamic shared memory doesn't have enough space. It is used by GPU cell iterators inside GPU kernels.
+
+# Fields
+- `dh::DH`: Base DoF handler (e.g., `GPUDofHandler`).
+- `Kes::LOCAL_MATRICES`: Local stiffness matrices for each cell (3rd order tensor).
+- `fes::LOCAL_VECTORS`: Local force vectors for each cell (2nd order tensor).
+"""
+struct LocalsGPUDofHandler{DH <: AbstractDofHandler, LOCAL_MATRICES, LOCAL_VECTORS} <: AbstractGPUDofHandler
+    dh::DH
+    Kes::LOCAL_MATRICES
+    fes::LOCAL_VECTORS
+end
+
+# Accessor functions for LocalsGPUDofHandler
+dofhandler(dh::LocalsGPUDofHandler) = dh.dh
+localkes(dh::LocalsGPUDofHandler) = dh.Kes
+localfes(dh::LocalsGPUDofHandler) = dh.fes
+cellke(dh::LocalsGPUDofHandler, i::Int32) = @view dh.Kes[i, :, :]
+cellfe(dh::LocalsGPUDofHandler, i::Int32) = @view dh.fes[i, :]
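A small illustration of how these accessors are meant to be used inside a kernel (hypothetical `gpu_dh`/`locals_dh` objects; indices must be `Int32` on the device):

    i = Int32(cell_index)               # some cell index
    n = ndofs_per_cell(gpu_dh, i)       # 0 if the cell lies outside the subdomain
    dofs = celldofs(gpu_dh, i)          # view into gpu_dh.cell_dofs
    Ke = cellke(locals_dh, i)           # per-cell buffers when shared memory is insufficient
    fe = cellfe(locals_dh, i)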
diff --git a/src/GPU/KernelLauncher.jl b/src/GPU/KernelLauncher.jl
new file mode 100644
index 0000000000..578d8c574f
--- /dev/null
+++ b/src/GPU/KernelLauncher.jl
@@ -0,0 +1,56 @@
+#=
+This file defines the interface between the GPU backend (extension) and the Ferrite package.
+It provides abstract types, function signatures, and concrete types for managing GPU kernels
+and backends, serving as a foundation for GPU-accelerated computations.
+=#
+
+### Abstract Types ###
+abstract type AbstractBackend end
+abstract type AbstractKernel{BKD <: AbstractBackend} end
+
+
+### Functions ###
+
+"""
+    init_kernel(backend::AbstractBackend, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
+
+Initializes a kernel with the specified backend, number of cells, number of base functions,
+kernel function, and additional arguments.
+
+# Arguments
+- `backend::AbstractBackend`: The backend to use for kernel execution.
+- `n_cells::Ti`: Number of cells to be processed by the kernel.
+- `n_basefuncs::Ti`: Number of base functions for each cell.
+- `kernel::Function`: The kernel function to execute.
+- `args::Tuple`: Additional arguments required by the kernel.
+
+# Notes
+This function needs to be implemented for each specific backend. Calling this function
+without a concrete implementation will raise an error.
+"""
+function init_kernel(backend::AbstractBackend, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
+    throw(ErrorException("A concrete implementation of init_kernel is required"))
+end
+
+"""
+    launch!(kernel::AbstractKernel)
+
+Launches a kernel using the specified backend. This interface provides a generic
+mechanism for running accelerated computations across different backends.
+
+# Arguments
+- `kernel::AbstractKernel`: The kernel to be launched.
+
+# Notes
+This function must be implemented for each concrete kernel type. If not implemented,
+an error will be thrown.
+"""
+function launch!(::AbstractKernel)
+    throw(ErrorException("A concrete implementation of launch! is required"))
+end
+
+(ker::AbstractKernel)() = launch!(ker)
+
+### Backends ###
+struct BackendCUDA <: AbstractBackend end
+struct BackendCPU <: AbstractBackend end
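A new backend plugs into this interface by subtyping `AbstractBackend` and adding methods for the two functions above. Schematically (placeholder names; the CUDA version lives in the package extension and the CPU version in CPUKernelLauncher.jl):

    struct BackendFoo <: AbstractBackend end

    struct FooKernel{Ti} <: AbstractKernel{BackendFoo}
        n_cells::Ti
        n_basefuncs::Ti
        kernel::Function
        args::Tuple
    end

    init_kernel(::Type{BackendFoo}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer} =
        FooKernel(n_cells, n_basefuncs, kernel, args)

    launch!(k::FooKernel) = k.kernel(k.args...)   # backend-specific scheduling goes here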
diff --git a/src/GPU/coloring_dof.jl b/src/GPU/coloring_dof.jl
new file mode 100644
index 0000000000..122a9313e6
--- /dev/null
+++ b/src/GPU/coloring_dof.jl
@@ -0,0 +1,27 @@
+"""
+    ColoringDofHandler{Ti<:Integer,VECS<:Vector{Vector{Ti}},DH<:AbstractDofHandler}
+
+A mutable struct that encapsulates a DofHandler and different colors for the cells, in order to be used
+in a CPU multithreading scheme.
+"""
+mutable struct ColoringDofHandler{Ti <: Integer, VECS <: Vector{Vector{Ti}}, DH <: AbstractDofHandler}
+    dh::DH
+    colors::VECS
+    current_color::Ti
+end
+
+
+function init_colordh(dh::AbstractDofHandler)
+    grid = get_grid(dh)
+    colors = create_coloring(grid)
+    return ColoringDofHandler(dh, colors, 0)
+end
+
+
+## Accessors ##
+dofhandler(cd::ColoringDofHandler) = cd.dh
+colors(cd::ColoringDofHandler) = cd.colors
+eles_in_color(cd::ColoringDofHandler, color::Ti) where {Ti <: Integer} = cd.colors[color]
+current_color(cd::ColoringDofHandler) = cd.current_color
+current_color!(cd::ColoringDofHandler, color::Ti) where {Ti <: Integer} = (cd.current_color = color)
+ncolors(cd::ColoringDofHandler) = cd |> colors |> length
diff --git a/src/GPU/gpu_grid.jl b/src/GPU/gpu_grid.jl
new file mode 100644
index 0000000000..5139864983
--- /dev/null
+++ b/src/GPU/gpu_grid.jl
@@ -0,0 +1,45 @@
+# This file defines the GPUGrid type, which is a grid that is stored on the GPU. Therefore most of the
+# functions are the same as the ones defined in grid.jl, but executable on the GPU.
+
+abstract type AbstractGPUGrid{dim} <: AbstractGrid{dim} end
+
+struct GPUGrid{dim, CELLVEC <: AbstractArray, NODEVEC <: AbstractArray} <: AbstractGPUGrid{dim}
+    cells::CELLVEC
+    nodes::NODEVEC
+end
+
+function GPUGrid(
+        cells::CELLVEC,
+        nodes::NODEVEC
+    ) where {C <: AbstractCell, CELLVEC <: AbstractArray{C, 1}, NODEVEC <: AbstractArray{Node{dim, T}}} where {dim, T}
+    return GPUGrid{dim, CELLVEC, NODEVEC}(cells, nodes)
+end
+
+get_coordinate_type(::GPUGrid{dim, CELLVEC, NODEVEC}) where
+{C <: AbstractCell, CELLVEC <: AbstractArray{C, 1}, NODEVEC <: AbstractArray{Node{dim, T}}} where
+{dim, T} = Vec{dim, T} # Node is baked into the mesh type.
+
+
+# Note: For functions that take blockIdx as an argument, we need to use Int32 explicitly,
+# otherwise the compiler will not be able to infer the type of the argument and will throw a dynamic function invocation error.
+@inline getcells(grid::GPUGrid, v::Union{Int32, Vector{Int32}}) = grid.cells[v]
+@inline getnodes(grid::GPUGrid, v::Int32) = grid.nodes[v]
+
+"""
+    getcoordinates(grid::Ferrite.GPUGrid, e::Int32)
+
+Return the coordinates of the nodes of the element `e` in the `GPUGrid` `grid`.
+"""
+function getcoordinates(grid::GPUGrid, e::Int32)
+    # e is the element index.
+    CT = get_coordinate_type(grid)
+    cell = getcells(grid, e)
+    N = nnodes(cell)
+    x = MVector{N, CT}(undef) # local array to store the coordinates of the nodes of the cell.
+    node_ids = get_node_ids(cell)
+    for i in 1:length(x)
+        x[i] = get_node_coordinate(grid, node_ids[i])
+    end
+
+    return SVector(x...)
+end
diff --git a/src/GPU/mem_alloc.jl b/src/GPU/mem_alloc.jl
new file mode 100644
index 0000000000..52cd136c18
--- /dev/null
+++ b/src/GPU/mem_alloc.jl
@@ -0,0 +1,4 @@
+## This file defines an interface for memory allocation on the GPU that
+## is used by local stiffness matrix and force vector assembly.
+
+abstract type AbstractMemAlloc end
diff --git a/src/GPU/parallel_iterator.jl b/src/GPU/parallel_iterator.jl
new file mode 100644
index 0000000000..a2fe261b87
--- /dev/null
+++ b/src/GPU/parallel_iterator.jl
@@ -0,0 +1,153 @@
+# This file defines the abstract types and interfaces for GPU iterators.
+# The concrete implementations are defined in the extension.
+
+# abstract types and interfaces
+abstract type AbstractIterator end
+abstract type AbstractCellCache end
+
+abstract type AbstractKernelCellCache <: AbstractCellCache end
+abstract type AbstractKernelCellIterator <: AbstractIterator end
+
+@inline function cellke(::AbstractKernelCellCache)
+    throw(ArgumentError("cellke should be implemented in the derived type"))
+end
+
+@inline function cellfe(::AbstractKernelCellCache)
+    throw(ArgumentError("cellfe should be implemented in the derived type"))
+end
+
+
+## Concrete Implementation for CPU Multithreading ##
+
+
+##### CPUKernelCellCache #####
+mutable struct CPUKernelCellCache{G <: AbstractGrid, X, Tv <: Real} <: AbstractKernelCellCache
+    const flags::UpdateFlags
+    const grid::G
+    const dh::AbstractDofHandler
+    const coords::Vector{X}
+    const dofs::Vector{Int}
+    cellid::Int
+    const nodes::Vector{Int}
+    const ke::Matrix{Tv}
+    const fe::Vector{Tv}
+end
+
+
+function CellCache(dh::DofHandler{dim}, n_basefuncs::Int, flags::UpdateFlags = UpdateFlags()) where {dim}
+    ke = zeros(Float64, n_basefuncs, n_basefuncs)
+    fe = zeros(Float64, n_basefuncs)
+    grid = dh |> get_grid
+    N = nnodes_per_cell(grid, 1) # nodes and coords will be resized in `reinit!`
+    nodes = zeros(Int, N)
+    coords = zeros(Vec{dim, Float64}, N)
+    return CPUKernelCellCache(flags, grid, dh, coords, Int[], -1, nodes, ke, fe)
+end
+
+## TODO: to be removed (kept for reference)
+# function _makecache(iterator::CPUKernelCellIterator, e::Ti) where {Ti <: Integer}
+#     dh = iterator.dh |> dofhandler
+#     grid = iterator.grid
+#     cellid = e
+#     cell = getcells(grid, e)
+
+#     # Extract the node IDs of the cell.
+#     nodes = SVector(get_node_ids(cell)...)
+
+#     # Extract the degrees of freedom for the cell.
+#     dofs = celldofs(dh, e)
+#     # Get the coordinates of the nodes of the cell.
+#     CT = get_coordinate_type(grid)
+#     N = nnodes(cell)
+#     x = MVector{N, CT}(undef)
+#     for i in eachindex(x)
+#         x[i] = get_node_coordinate(grid, nodes[i])
+#     end
+#     coords = SVector(x...)
+
+#     # Return the GPUCellCache containing the cell's data.
+# return CPUKernelCellCache(coords, dofs, cellid, nodes, iterator.ke, iterator.fe) +# end + +function _reinit!(cc::CPUKernelCellCache, i::Int) + cc.cellid = i + fill!(cc.ke, zero(eltype(cc.ke))) + fill!(cc.fe, zero(eltype(cc.fe))) + if cc.flags.nodes + resize!(cc.nodes, nnodes_per_cell(cc.grid, i)) + cellnodes!(cc.nodes, cc.grid, i) + end + if cc.flags.coords + resize!(cc.coords, nnodes_per_cell(cc.grid, i)) + getcoordinates!(cc.coords, cc.grid, i) + end + if cc.flags.dofs + resize!(cc.dofs, ndofs_per_cell(cc.dh, i)) + celldofs!(cc.dofs, cc.dh, i) + end + return cc +end + +## Accessors ## +getnodes(cc::CPUKernelCellCache) = cc.nodes + + +getcoordinates(cc::CPUKernelCellCache) = cc.coords + + +celldofs(cc::CPUKernelCellCache) = cc.dofs + + +cellid(cc::CPUKernelCellCache) = cc.cellid + + +cellke(cc::CPUKernelCellCache) = cc.ke + +cellfe(cc::CPUKernelCellCache) = cc.fe + + +##### CPUKernelCellIterator ##### +struct CPUKernelCellIterator{CC <: CPUKernelCellCache, DH <: ColoringDofHandler} <: AbstractKernelCellIterator + cache::CC + dh::DH + n_cells::Int + thread_id::Int # thread id that the iterator is working on +end + + +function CellIterator(dh::ColoringDofHandler, n_basefuncs::Int) + grid = dh |> dofhandler |> get_grid + n_cells = grid |> getncells + cache = CellCache(dh |> dofhandler, n_basefuncs) + local_thread_id = Threads.threadid() + return CPUKernelCellIterator(cache, dh, n_cells, local_thread_id) +end + + +ncells(iterator::CPUKernelCellIterator) = iterator.n_cells +_cache(iterator::CPUKernelCellIterator) = iterator.cache + + +function Base.iterate(iterator::CPUKernelCellIterator) + i = iterator.thread_id + curr_color = iterator.dh |> current_color # current color that's being processed + eles_color = eles_in_color(iterator.dh, curr_color) # elements in the current color + ncells = length(eles_color) + i <= ncells || return nothing + cache = _cache(iterator) + _reinit!(cache, eles_color[i]) + return (cache, i) +end + + +function Base.iterate(iterator::CPUKernelCellIterator, state) + stride = Threads.nthreads() + i = state + stride # next strided element id + curr_color = iterator.dh |> current_color # current color that's being processed + eles_color = eles_in_color(iterator.dh, curr_color) # elements in the current color + ncells = length(eles_color) + i <= ncells || return nothing + cache = _cache(iterator) + _reinit!(cache, eles_color[i]) + return (cache, i) +end diff --git a/src/exports.jl b/src/exports.jl index c710861c15..07a3c684d4 100644 --- a/src/exports.jl +++ b/src/exports.jl @@ -104,6 +104,7 @@ export addcellset!, transform_coordinates!, generate_grid, + GPUGrid, # Grid coloring create_coloring, @@ -122,6 +123,7 @@ export DofOrder, evaluate_at_grid_nodes, apply_analytical!, + get_grid, # Sparsity pattern # AbstractSparsityPattern, @@ -184,4 +186,25 @@ export evaluate_at_points, PointIterator, PointLocation, - PointValues + PointValues, + + # GPU + LazyKernel, + StaticQuadratureView, + StaticInterpolationValues, + init_kernel, + launch!, + cellke, + cellfe, + GPUSubDofHandler, + GPUDofHandler, + GPUGrid, + getweights, + getbackend, + BackendCUDA, + BackendCPU, + LocalsGPUDofHandler, + dofhandler, + localfes, + localkes, + AbstractMemAlloc diff --git a/test/GPU/runtests.jl b/test/GPU/runtests.jl index 7d317327a5..3f92322ffd 100644 --- a/test/GPU/runtests.jl +++ b/test/GPU/runtests.jl @@ -1,5 +1,12 @@ -using CUDA using Ferrite +using CUDA using Test +using SparseArrays + +include("test_utils.jl") -@test CUDA.functional() +# Unit tests +include("test_assemble.jl") 
+include("test_iterator.jl") +include("test_kernellauncher.jl") +include("test_adapt.jl") diff --git a/test/GPU/test_adapt.jl b/test/GPU/test_adapt.jl new file mode 100644 index 0000000000..e8bc9cea6e --- /dev/null +++ b/test/GPU/test_adapt.jl @@ -0,0 +1,66 @@ +function dofs_cpu(dh, cv) + nbasefuncs = cv |> getnbasefunctions + ncells = dh |> get_grid |> getncells + dofs = zeros(Int32, nbasefuncs, ncells) + for i in 1:ncells + cdofs = celldofs(dh, i) + dofs[:, i] .= cdofs + end + return dofs +end + + +function dofs_gpu_kernel(dofs, dh, cv; mem_alloc) + nbasefuncs = cv |> getnbasefunctions + for cell in CellIterator(dh, mem_alloc) + cdofs = celldofs(cell) + dofs[:, cellid(cell)] .= cdofs + end + return nothing +end + +weights_cpu(cv) = cv.qr |> getweights + +function weights_gpu_kernel(weights, cv) + nweights = length(weights) + for i in 1:nweights + weights[i] = cv.weights[i] + end + return +end + +function nodes_cpu(grid) + nodes = grid.cells .|> (x -> x.nodes |> collect) + return hcat(nodes...) +end + +function nodes_gpu_kernel(nodes, dh, cv; mem_alloc) + nbasefuncs = cv |> getnbasefunctions + for cell in CellIterator(dh, mem_alloc) + cnodes = getnodes(cell) + nodes[:, cellid(cell)] .= cnodes + end + return nothing +end + +@testset "Adapt" begin + dh, cv = generate_Bilinear_problem() + cpudofs = dofs_cpu(dh, cv) |> cu + ncells = dh |> get_grid |> getncells + nbasefunctions = cv |> getnbasefunctions + gpudofs = zeros(Int32, nbasefunctions, ncells) |> cu + init_kernel(BackendCUDA, ncells, nbasefunctions, dofs_gpu_kernel, (gpudofs, dh, cv)) |> launch! + ## Test that dofs are correctly transfered to the GPU + @test all(cpudofs .== gpudofs) + ## Test that weights are correctly transfered to the GPU + cpuweights = weights_cpu(cv) |> cu + gpuweights = zeros(Float32, length(cpuweights)) |> cu + @cuda blocks = 1 threads = 1 weights_gpu_kernel(gpuweights, cv) + @test all(cpuweights .== gpuweights) + ## Test that nodes are correctly transfered to the GPU + cpunodes = nodes_cpu(dh |> get_grid) |> cu + n_nodes = length(cpunodes) + gpu_cellnodes = CUDA.zeros(Int32, nbasefunctions, ncells) + init_kernel(BackendCUDA, ncells, nbasefunctions, nodes_gpu_kernel, (gpu_cellnodes, dh, cv)) |> launch! + @test all(cpunodes .== gpu_cellnodes) +end diff --git a/test/GPU/test_assemble.jl b/test/GPU/test_assemble.jl new file mode 100644 index 0000000000..d49211622a --- /dev/null +++ b/test/GPU/test_assemble.jl @@ -0,0 +1,43 @@ +# Helper function to initialize sparse matrices +function init_sparse_matrix(n, m) + row_indices = Int32[] + col_indices = Int32[] + values = Float32[] + for i in 1:min(n, m) + push!(row_indices, i) + push!(col_indices, i) + push!(values, Float32(0.0)) + end + sparse_matrix = sparse(row_indices, col_indices, values, n, m) + return sparse_matrix +end + + +function assemble_kernel!(K, f, dofs, Ke, fe) + # kernel that only assembles local into global. + A = start_assemble(K, f) + return assemble!(A, dofs, Ke, fe) +end + +# Test for assembling global stiffness matrix and force vector +@testset "Test assemble!" 
begin + # System parameters + n = 5 + m = 5 + dofs = Int32[1, 2, 3, 4, 5] + Ke = CUDA.fill(1.0f0, n, m) # Local stiffness matrix (shared memory) + fe = CUDA.fill(1.0f0, n) # Local force vector (shared memory) + + # Initialize global stiffness matrix and global force vector + K = CUSPARSE.CuSparseMatrixCSC(init_sparse_matrix(n, m)) + + f = CUDA.fill(0.0f0, n) + + @cuda blocks = 5 threads = 5 assemble_kernel!(K, f, cu(dofs), Ke, fe) # 5 * 5 = 25 threads + + # Test: Check force vector update + @test all(f .≈ CUDA.fill(25.0f0, n)) + # Test: Check global stiffness matrix update (values must be incremented by 25 = 5 * 5) + @test all(K.nzVal .≈ CUDA.fill(25.0f0, length(K.nzVal))) + +end diff --git a/test/GPU/test_iterator.jl b/test/GPU/test_iterator.jl new file mode 100644 index 0000000000..3a0b4f8e19 --- /dev/null +++ b/test/GPU/test_iterator.jl @@ -0,0 +1,131 @@ +function getalldofs(dh) + ncells = dh |> get_grid |> getncells + return map(i -> celldofs(dh, i) .|> Int32, 1:ncells) |> (x -> hcat(x...)) |> cu +end + + +function dof_kernel_kernel!(dofs, dh, n_basefuncs; mem_alloc) + # this kernel is used to get all the dofs of the grid, which then + # can be validated against the correct dofs (i.e. CPU version). + for cell in CellIterator(dh, mem_alloc) + cid = cellid(cell) + cdofs = celldofs(cell) + for i in 1:n_basefuncs + dofs[Int32(i), cid] = cdofs[Int32(i)] + end + end + return nothing +end + + +function assemble_element_gpu!(Ke, fe, cv, cell) + n_basefuncs = getnbasefunctions(cv) + for qv in Ferrite.QuadratureValuesIterator(cv, getcoordinates(cell)) + ## Get the quadrature weight + dΩ = getdetJdV(qv) + ## Loop over test shape functions + for i in 1:n_basefuncs + δu = shape_value(qv, i) + ∇u = shape_gradient(qv, i) + ## Add contribution to fe + fe[i] += δu * dΩ + #fe_shared[tx,i] += δu * dΩ + ## Loop over trial shape functions + for j in 1:n_basefuncs + ∇δu = shape_gradient(qv, j) + ## Add contribution to Ke + Ke[i, j] += (∇δu ⋅ ∇u) * dΩ + end + end + end + return +end + + +function assemble_element_cpu!(Ke, fe, cellvalues) + n_basefuncs = getnbasefunctions(cellvalues) + # Loop over quadrature points + for q_point in 1:getnquadpoints(cellvalues) + # Get the quadrature weight + dΩ = getdetJdV(cellvalues, q_point) + # Loop over test shape functions + for i in 1:n_basefuncs + δu = shape_value(cellvalues, q_point, i) + ∇δu = shape_gradient(cellvalues, q_point, i) + # Add contribution to fe + fe[i] += δu * dΩ + # Loop over trial shape functions + for j in 1:n_basefuncs + ∇u = shape_gradient(cellvalues, q_point, j) + # Add contribution to Ke + Ke[i, j] += (∇δu ⋅ ∇u) * dΩ + end + end + end + return Ke, fe +end + +function localkefe_kernel!(kes, fes, cv, dh; mem_alloc) + nbasefuncs = getnbasefunctions(cv) + for cell in CellIterator(dh, mem_alloc) + Ke = cellke(cell) + fe = cellfe(cell) + assemble_element_gpu!(Ke, fe, cv, cell) + kes[cellid(cell), :, :] .= Ke + fes[cellid(cell), :] .= fe + end + return nothing +end + +function get_cpu_kefe(dh, cellvalues) + ncells = dh |> get_grid |> getncells + n_basefuncs = getnbasefunctions(cellvalues) + kes = zeros(Float32, ncells, n_basefuncs, n_basefuncs) + fes = zeros(Float32, ncells, n_basefuncs) + for cell in CellIterator(dh) + cid = cellid(cell) + reinit!(cellvalues, cell) + # Compute element contribution + assemble_element_cpu!((@view kes[cid, :, :]), (@view fes[cid, :, :]), cellvalues) + end + return kes |> cu, fes |> cu +end + +@testset "Test shared memory iterators" begin + dh, cellvalues = generate_Bilinear_problem() + n_basefuncs = 
getnbasefunctions(cellvalues) + # 1 Test that dofs for each cell in the grid are correctly computed + ncells = dh |> get_grid |> getncells + dofs = CUDA.fill(Int32(0), n_basefuncs, ncells) + correct_dofs = getalldofs(dh) + init_kernel(BackendCUDA, ncells, n_basefuncs, dof_kernel_kernel!, (dofs, dh, n_basefuncs)) |> launch! + @test all(dofs .≈ correct_dofs) + + # 2. Test that local ke and fe are correctly computed + kes_gpu = CUDA.fill(0.0f0, ncells, n_basefuncs, n_basefuncs) + fes_gpu = CUDA.fill(0.0f0, ncells, n_basefuncs) + init_kernel(BackendCUDA, ncells, n_basefuncs, localkefe_kernel!, (kes_gpu, fes_gpu, cellvalues, dh)) |> launch! + kes_cpu, fes_cpu = get_cpu_kefe(dh, cellvalues) + @test all(abs.(kes_gpu .- kes_cpu) .< 1.0e-3) #TODO: This needs further investigation + @test all(fes_gpu .≈ fes_cpu) +end + + +@testset "Test global memory iterators" begin + dh, cellvalues = generate_Biquadratic_problem() + n_basefuncs = getnbasefunctions(cellvalues) + # 1 Test that dofs for each cell in the grid are correctly computed + ncells = dh |> get_grid |> getncells + dofs = CUDA.fill(Int32(0), n_basefuncs, ncells) + correct_dofs = getalldofs(dh) + init_kernel(BackendCUDA, ncells, n_basefuncs, dof_kernel_kernel!, (dofs, dh, n_basefuncs)) |> launch! + @test all(dofs .≈ correct_dofs) + + # 2. Test that local ke and fe are correctly computed + kes_gpu = CUDA.fill(0.0f0, ncells, n_basefuncs, n_basefuncs) + fes_gpu = CUDA.fill(0.0f0, ncells, n_basefuncs) + init_kernel(BackendCUDA, ncells, n_basefuncs, localkefe_kernel!, (kes_gpu, fes_gpu, cellvalues, dh)) |> launch! + kes_cpu, fes_cpu = get_cpu_kefe(dh, cellvalues) + @test all(abs.(kes_gpu .- kes_cpu) .< 1.0e-1) #TODO: This needs further investigation + @test all(fes_gpu .≈ fes_cpu) +end diff --git a/test/GPU/test_kernellauncher.jl b/test/GPU/test_kernellauncher.jl new file mode 100644 index 0000000000..64e9294d0e --- /dev/null +++ b/test/GPU/test_kernellauncher.jl @@ -0,0 +1,33 @@ +# Sample CUDA Kernel (adding two vectors) +function kernel_add(A, B, C, n; mem_alloc) + i = threadIdx().x + (blockIdx().x - 1) * blockDim().x + if i <= n + C[i] = A[i] + B[i] + end + return +end + +# Helper function to launch the kernel with CUDAKernelLauncher +function test_launch_kernel!(n_cells::Integer, n_basefuncs::Integer, args...) + return init_kernel(BackendCUDA, n_cells, n_basefuncs, kernel_add, args) |> launch! +end + +# Testing for different integer types +@testset "Testing CUDAKernelLauncher with different integer types" begin + # Arrays for testing + N = 10 + A = CUDA.fill(1.0f0, N) + B = CUDA.fill(2.0f0, N) + C = CUDA.fill(0.0f0, N) + + # Test with Int32 + test_launch_kernel!(Int32(N), Int32(2), A, B, C, N) + CUDA.synchronize() + @test all(Array(C) .== 3.0f0) + + # Test with Int64 + fill!(C, 0.0f0) # reset C array + test_launch_kernel!(Int64(N), Int64(2), A, B, C, N) + CUDA.synchronize() + @test all(Array(C) .== 3.0f0) +end diff --git a/test/GPU/test_utils.jl b/test/GPU/test_utils.jl new file mode 100644 index 0000000000..35d1ac5307 --- /dev/null +++ b/test/GPU/test_utils.jl @@ -0,0 +1,47 @@ +function generate_Bilinear_problem() + left = Tensor{1, 2, Float32}((0, -0)) + + right = Tensor{1, 2, Float32}((rand(10.0:100000.0), rand(10.0:100000.0))) + + grid_dims = (rand(1:1000), rand(1:1000)) + + grid = generate_grid(Quadrilateral, grid_dims, left, right) + + ip = Lagrange{RefQuadrilateral, 1}() # define the interpolation function (i.e. 
Bilinear lagrange) + + qr = QuadratureRule{RefQuadrilateral}(Float32, 2) + + cellvalues = CellValues(Float32, qr, ip) + + dh = DofHandler(grid) + + add!(dh, :u, ip) + + close!(dh) + + return dh, cellvalues +end + +function generate_Biquadratic_problem() + left = Tensor{1, 2, Float32}((0, -0)) + + right = Tensor{1, 2, Float32}((rand(10.0:100000.0), rand(10.0:100000.0))) + + grid_dims = (rand(100:1000), rand(100:1000)) # to make sure the problem is big enough to use `CUDAGlobalCellIterator` + + grid = generate_grid(Quadrilateral, grid_dims, left, right) + + ip = Lagrange{RefQuadrilateral, 2}() # define the interpolation function (i.e. Biquadratic lagrange) + + qr = QuadratureRule{RefQuadrilateral}(Float32, 3) # 3x3 quadrature rule + + cellvalues = CellValues(Float32, qr, ip) + + dh = DofHandler(grid) + + add!(dh, :u, ip) + + close!(dh) + + return dh, cellvalues +end diff --git a/test/runtests.jl b/test/runtests.jl index 57cd82d8a7..3eb60d29ac 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -42,7 +42,7 @@ include("blockarrays.jl") include("test_examples.jl") @test all(x -> isdefined(Ferrite, x), names(Ferrite)) # Test that all exported symbols are defined -# # See which is not defined if fails +# See which is not defined if fails # for name in names(Ferrite) # isdefined(Ferrite, name) || @warn "Ferrite.$name is not defined but $name is exported" # end