diff --git a/Project.toml b/Project.toml index e3c5eb3fbd..5b1a20788b 100644 --- a/Project.toml +++ b/Project.toml @@ -16,11 +16,14 @@ Tensors = "48a634ad-e948-5137-8d70-aa71f2a747f4" WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192" [weakdeps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b" [extensions] FerriteBlockArrays = "BlockArrays" +FerriteCuda = ["CUDA", "Adapt"] FerriteMetis = "Metis" [compat] @@ -34,6 +37,7 @@ Preferences = "1" Reexport = "1" StaticArrays = "1" Tensors = "1.14" +TimerOutputs = "0.5.25" WriteVTK = "1.13" julia = "1.10" diff --git a/docs/Manifest.toml b/docs/Manifest.toml index d627674895..1c748a4051 100644 --- a/docs/Manifest.toml +++ b/docs/Manifest.toml @@ -2,7 +2,7 @@ julia_version = "1.11.1" manifest_format = "2.0" -project_hash = "4e52f4aa4cee9f66ec4f633f0ae538fbd227ac5e" +project_hash = "f5e9904007b2fd3f4bda9fc8dd33eb629bc31d18" [[deps.ADTypes]] git-tree-sha1 = "eea5d80188827b35333801ef97a40c2ed653b081" @@ -19,6 +19,17 @@ git-tree-sha1 = "574baf8110975760d391c710b6341da1afa48d8c" uuid = "a4c015fc-c6ff-483c-b24f-f7ea428134e9" version = "0.0.1" +[[deps.AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = "1.5.0" +weakdeps = ["ChainRulesCore", "Test"] + + [deps.AbstractFFTs.extensions] + AbstractFFTsChainRulesCoreExt = "ChainRulesCore" + AbstractFFTsTestExt = "Test" + [[deps.AbstractTrees]] git-tree-sha1 = "2d9c9a55f9c93e8887ad391fbae72f8ef55e1177" uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" @@ -113,6 +124,18 @@ weakdeps = ["SparseArrays"] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" version = "1.11.0" +[[deps.Atomix]] +deps = ["UnsafeAtomics"] +git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" +version = "0.1.0" + +[[deps.BFloat16s]] +deps = ["LinearAlgebra", "Printf", "Random", "Test"] +git-tree-sha1 = "2c7cc21e8678eff479978a0a2ef5ce2f51b63dff" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.5.0" + [[deps.BangBang]] deps = ["Accessors", "ConstructionBase", "InitialValues", "LinearAlgebra", "Requires"] git-tree-sha1 = "e2144b631226d9eeab2d746ca8880b7ccff504ae" @@ -185,12 +208,47 @@ git-tree-sha1 = "9e2a6b69137e6969bab0152632dcb3bc108c8bdd" uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" version = "1.0.8+1" +[[deps.CEnum]] +git-tree-sha1 = "389ad5c84de1ae7cf0e28e381131c98ea87d54fc" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.5.0" + [[deps.CPUSummary]] deps = ["CpuId", "IfElse", "PrecompileTools", "Static"] git-tree-sha1 = "5a97e67919535d6841172016c9530fd69494e5ec" uuid = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9" version = "0.2.6" +[[deps.CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "Crayons", "DataFrames", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LLVMLoopInfo", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "NVTX", "Preferences", "PrettyTables", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "StaticArrays", "Statistics", "demumble_jll"] +git-tree-sha1 = "e0725a467822697171af4dae15cec10b4fc19053" +uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" +version = "5.5.2" +weakdeps = ["ChainRulesCore", "EnzymeCore", "SpecialFunctions"] + + [deps.CUDA.extensions] + ChainRulesCoreExt 
= "ChainRulesCore" + EnzymeCoreExt = "EnzymeCore" + SpecialFunctionsExt = "SpecialFunctions" + +[[deps.CUDA_Driver_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "ccd1e54610c222fadfd4737dac66bff786f63656" +uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc" +version = "0.10.3+0" + +[[deps.CUDA_Runtime_Discovery]] +deps = ["Libdl"] +git-tree-sha1 = "33576c7c1b2500f8e7e6baa082e04563203b3a45" +uuid = "1af6417a-86b4-443c-805f-a4643ffb695f" +version = "0.3.5" + +[[deps.CUDA_Runtime_jll]] +deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "e43727b237b2879a34391eeb81887699a26f8f2f" +uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" +version = "0.15.3+0" + [[deps.Cairo_jll]] deps = ["Artifacts", "Bzip2_jll", "CompilerSupportLibraries_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "JLLWrappers", "LZO_jll", "Libdl", "Pixman_jll", "Xorg_libXext_jll", "Xorg_libXrender_jll", "Zlib_jll", "libpng_jll"] git-tree-sha1 = "009060c9a6168704143100f36ab08f06c2af4642" @@ -223,6 +281,12 @@ git-tree-sha1 = "05ba0d07cd4fd8b7a39541e31a7b0254704ea581" uuid = "fb6a15b2-703c-40df-9091-08a04967cfa9" version = "0.1.13" +[[deps.CodeTracking]] +deps = ["InteractiveUtils", "UUIDs"] +git-tree-sha1 = "7eee164f122511d3e4e1ebadb7956939ea7e1c77" +uuid = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2" +version = "1.3.6" + [[deps.CodecZlib]] deps = ["TranscodingStreams", "Zlib_jll"] git-tree-sha1 = "bce6804e5e6044c6daab27bb533d1295e4a2e759" @@ -334,11 +398,28 @@ git-tree-sha1 = "fcbb72b032692610bfbdb15018ac16a36cf2e406" uuid = "adafc99b-e345-5852-983c-f28acb93d879" version = "0.3.1" +[[deps.Crayons]] +git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15" +uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" +version = "4.1.1" + +[[deps.Cthulhu]] +deps = ["CodeTracking", "FoldingTrees", "InteractiveUtils", "JuliaSyntax", "PrecompileTools", "Preferences", "REPL", "TypedSyntax", "UUIDs", "Unicode", "WidthLimitedIO"] +git-tree-sha1 = "9f167682b994c3d6f841d2d7cda444bf3b918e37" +uuid = "f68482b8-f384-11e8-15f7-abe071a5a75f" +version = "2.15.2" + [[deps.DataAPI]] git-tree-sha1 = "abe83f3a2f1b857aac70ef8b269080af17764bbe" uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" version = "1.16.0" +[[deps.DataFrames]] +deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] +git-tree-sha1 = "fb61b4812c49343d7ef0b533ba982c46021938a6" +uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +version = "1.7.0" + [[deps.DataStructures]] deps = ["Compat", "InteractiveUtils", "OrderedCollections"] git-tree-sha1 = "1d0a14036acb104d9e89698bd408f63ab58cdc82" @@ -368,10 +449,10 @@ uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" version = "1.9.1" [[deps.DiffEqBase]] -deps = ["ArrayInterface", "ConcreteStructs", "DataStructures", "DocStringExtensions", "EnumX", "EnzymeCore", "FastBroadcast", "FastClosures", "ForwardDiff", "FunctionWrappers", "FunctionWrappersWrappers", "LinearAlgebra", "Logging", "Markdown", "MuladdMacro", "Parameters", "PreallocationTools", "PrecompileTools", "Printf", "RecursiveArrayTools", "Reexport", "SciMLBase", "SciMLOperators", "SciMLStructures", "Setfield", "Static", "StaticArraysCore", "Statistics", "Tricks", "TruncatedStacktraces"] -git-tree-sha1 = "8977ef8249b602e4cb46ddbaf3c51e6adc2958c7" +deps = 
["ArrayInterface", "ConcreteStructs", "DataStructures", "DocStringExtensions", "EnumX", "EnzymeCore", "FastBroadcast", "FastClosures", "ForwardDiff", "FunctionWrappers", "FunctionWrappersWrappers", "LinearAlgebra", "Logging", "Markdown", "MuladdMacro", "Parameters", "PreallocationTools", "PrecompileTools", "Printf", "RecursiveArrayTools", "Reexport", "SciMLBase", "SciMLOperators", "SciMLStructures", "Setfield", "Static", "StaticArraysCore", "Statistics", "TruncatedStacktraces"] +git-tree-sha1 = "9c36ac68cf6c59a1c2569aadd7a62c47ed5c8eb5" uuid = "2b5f629d-d688-5b77-993f-72d75c75574e" -version = "6.157.0" +version = "6.158.1" [deps.DiffEqBase.extensions] DiffEqBaseCUDAExt = "CUDA" @@ -414,10 +495,10 @@ uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" version = "1.15.1" [[deps.DifferentiationInterface]] -deps = ["ADTypes", "Compat", "LinearAlgebra", "PackageExtensionCompat"] -git-tree-sha1 = "75d1716eb46e1b77304f7d79ec9ac6a4e6185d02" +deps = ["ADTypes", "LinearAlgebra"] +git-tree-sha1 = "76ea82755a5c93b7b26d4fca990854fd2fc74b6e" uuid = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" -version = "0.6.7" +version = "0.6.11" [deps.DifferentiationInterface.extensions] DifferentiationInterfaceChainRulesCoreExt = "ChainRulesCore" @@ -432,6 +513,7 @@ version = "0.6.7" DifferentiationInterfaceReverseDiffExt = "ReverseDiff" DifferentiationInterfaceSparseArraysExt = "SparseArrays" DifferentiationInterfaceSparseMatrixColoringsExt = "SparseMatrixColorings" + DifferentiationInterfaceStaticArraysExt = "StaticArrays" DifferentiationInterfaceSymbolicsExt = "Symbolics" DifferentiationInterfaceTrackerExt = "Tracker" DifferentiationInterfaceZygoteExt = ["Zygote", "ForwardDiff"] @@ -449,6 +531,7 @@ version = "0.6.7" ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SparseMatrixColorings = "0a514795-09f3-496d-8182-132a7b665d35" + StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Symbolics = "0c5d862f-8b57-4792-8d23-62f2024744c7" Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" @@ -577,17 +660,21 @@ uuid = "29a986be-02c6-4525-aec4-84b980013641" version = "2.0.4" [[deps.Ferrite]] -deps = ["EnumX", "ForwardDiff", "LinearAlgebra", "NearestNeighbors", "OrderedCollections", "Preferences", "Reexport", "SparseArrays", "StaticArrays", "Tensors", "WriteVTK"] +deps = ["Cthulhu", "EnumX", "ForwardDiff", "KernelAbstractions", "LinearAlgebra", "NearestNeighbors", "OrderedCollections", "Preferences", "Reexport", "SparseArrays", "StaticArrays", "Tensors", "WriteVTK"] path = ".." 
uuid = "c061ca5d-56c9-439f-9c0e-210fe06d3992" version = "1.0.0" [deps.Ferrite.extensions] FerriteBlockArrays = "BlockArrays" + FerriteCuda = ["CUDA", "Adapt"] FerriteMetis = "Metis" [deps.Ferrite.weakdeps] + Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" + BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e" + CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b" [[deps.FerriteGmsh]] @@ -652,6 +739,12 @@ git-tree-sha1 = "05882d6995ae5c12bb5f36dd2ed3f61c98cbb172" uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" version = "0.8.5" +[[deps.FoldingTrees]] +deps = ["AbstractTrees", "REPL"] +git-tree-sha1 = "d94efd85f2fe192cdf664aa8b7c431592faed59e" +uuid = "1eca21be-9b9b-4ed8-839a-6d8ae26b1781" +version = "1.2.1" + [[deps.Fontconfig_jll]] deps = ["Artifacts", "Bzip2_jll", "Expat_jll", "FreeType2_jll", "JLLWrappers", "Libdl", "Libuuid_jll", "Zlib_jll"] git-tree-sha1 = "db16beca600632c95fc8aca29890d83788dd8b23" @@ -718,12 +811,24 @@ deps = ["Artifacts", "Libdl"] uuid = "781609d7-10c4-51f6-84f2-b8444358ff6d" version = "6.3.0+0" +[[deps.GPUArrays]] +deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] +git-tree-sha1 = "62ee71528cca49be797076a76bdc654a170a523e" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = "10.3.1" + [[deps.GPUArraysCore]] deps = ["Adapt"] git-tree-sha1 = "ec632f177c0d990e64d955ccc1b8c04c485a0950" uuid = "46192b85-c4d5-4398-a991-12ede77f4527" version = "0.1.6" +[[deps.GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "PrecompileTools", "Preferences", "Scratch", "Serialization", "TOML", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "1d6f290a5eb1201cd63574fbc4440c788d5cb38f" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.27.8" + [[deps.GR]] deps = ["Artifacts", "Base64", "DelimitedFiles", "Downloads", "GR_jll", "HTTP", "JSON", "Libdl", "LinearAlgebra", "Preferences", "Printf", "Qt6Wayland_jll", "Random", "Serialization", "Sockets", "TOML", "Tar", "Test", "p7zip_jll"] git-tree-sha1 = "629693584cef594c3f6f99e76e7a7ad17e60e8d5" @@ -840,6 +945,19 @@ git-tree-sha1 = "4da0f88e9a39111c2fa3add390ab15f3a44f3ca3" uuid = "22cec73e-a1b8-11e9-2c92-598750a2cf9c" version = "0.3.1" +[[deps.InlineStrings]] +git-tree-sha1 = "45521d31238e87ee9f9732561bfee12d4eebd52d" +uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" +version = "1.4.2" + + [deps.InlineStrings.extensions] + ArrowTypesExt = "ArrowTypes" + ParsersExt = "Parsers" + + [deps.InlineStrings.weakdeps] + ArrowTypes = "31f734f8-188a-4ce0-8406-c8a06bd891cd" + Parsers = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" + [[deps.IntelOpenMP_jll]] deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl"] git-tree-sha1 = "10bd689145d2c3b2a9844005d01087cc1194e79e" @@ -861,6 +979,11 @@ weakdeps = ["Dates", "Test"] InverseFunctionsDatesExt = "Dates" InverseFunctionsTestExt = "Test" +[[deps.InvertedIndices]] +git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038" +uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" +version = "1.3.0" + [[deps.IrrationalConstants]] git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2" uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" @@ -885,9 +1008,9 @@ version = "0.1.8" [[deps.JLLWrappers]] deps = ["Artifacts", "Preferences"] -git-tree-sha1 = "f389674c99bfcde17dc57454011aa44d5a260a40" +git-tree-sha1 = "be3dc50a92e5a386872a493a10050136d4703f9b" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.6.0" +version = 
"1.6.1" [[deps.JSON]] deps = ["Dates", "Mmap", "Parsers", "Unicode"] @@ -919,12 +1042,35 @@ git-tree-sha1 = "25ee0be4d43d0269027024d75a24c24d6c6e590c" uuid = "aacddb02-875f-59d6-b918-886e6ef4fbf8" version = "3.0.4+0" +[[deps.JuliaNVTXCallbacks_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "af433a10f3942e882d3c671aacb203e006a5808f" +uuid = "9c1d0b0a-7046-5b2e-a33f-ea22f176ac7e" +version = "0.2.1+0" + +[[deps.JuliaSyntax]] +git-tree-sha1 = "937da4713526b96ac9a178e2035019d3b78ead4a" +uuid = "70703baa-626e-46a2-a12c-08ffd08c73b4" +version = "0.4.10" + [[deps.KLU]] deps = ["LinearAlgebra", "SparseArrays", "SuiteSparse_jll"] git-tree-sha1 = "07649c499349dad9f08dde4243a4c597064663e9" uuid = "ef3ab10e-7fda-4108-b977-705223b18434" version = "0.6.0" +[[deps.KernelAbstractions]] +deps = ["Adapt", "Atomix", "InteractiveUtils", "MacroTools", "PrecompileTools", "Requires", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "04e52f596d0871fa3890170fa79cb15e481e4cd8" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.9.28" +weakdeps = ["EnzymeCore", "LinearAlgebra", "SparseArrays"] + + [deps.KernelAbstractions.extensions] + EnzymeExt = "EnzymeCore" + LinearAlgebraExt = "LinearAlgebra" + SparseArraysExt = "SparseArrays" + [[deps.Krylov]] deps = ["LinearAlgebra", "Printf", "SparseArrays"] git-tree-sha1 = "267dad6b4b7b5d529c76d40ff48d33f7e94cb834" @@ -943,6 +1089,27 @@ git-tree-sha1 = "bf36f528eec6634efc60d7ec062008f171071434" uuid = "88015f11-f218-50d7-93a8-a6af411a945d" version = "3.0.0+1" +[[deps.LLVM]] +deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Preferences", "Printf", "Requires", "Unicode"] +git-tree-sha1 = "4ad43cb0a4bb5e5b1506e1d1f48646d7e0c80363" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "9.1.2" +weakdeps = ["BFloat16s"] + + [deps.LLVM.extensions] + BFloat16sExt = "BFloat16s" + +[[deps.LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "05a8bd5a42309a9ec82f700876903abce1017dd3" +uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" +version = "0.0.34+0" + +[[deps.LLVMLoopInfo]] +git-tree-sha1 = "2e5c102cfc41f48ae4740c7eca7743cc7e7b75ea" +uuid = "8b046642-f1f6-4319-8d3c-209ddc03c586" +version = "1.0.0" + [[deps.LLVMOpenMP_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] git-tree-sha1 = "78211fb6cbc872f77cad3fc0b6cf647d923f4929" @@ -1330,6 +1497,18 @@ git-tree-sha1 = "a0b464d183da839699f4c79e7606d9d186ec172c" uuid = "d41bc354-129a-5804-8e4c-c37616107c6c" version = "7.8.3" +[[deps.NVTX]] +deps = ["Colors", "JuliaNVTXCallbacks_jll", "Libdl", "NVTX_jll"] +git-tree-sha1 = "53046f0483375e3ed78e49190f1154fa0a4083a1" +uuid = "5da4648a-3479-48b8-97b9-01cb529c0a1f" +version = "0.3.4" + +[[deps.NVTX_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "ce3269ed42816bf18d500c9f63418d4b0d9f5a3b" +uuid = "e98f9f5b-d649-5603-91fd-7774390e6439" +version = "3.1.0+2" + [[deps.NaNMath]] deps = ["OpenLibm_jll"] git-tree-sha1 = "0877504529a3e5c3343c6f8b4c0381e57e4387e4" @@ -1478,9 +1657,9 @@ version = "1.1.2" [[deps.OrdinaryDiffEqCore]] deps = ["ADTypes", "Accessors", "Adapt", "ArrayInterface", "DataStructures", "DiffEqBase", "DocStringExtensions", "EnumX", "FastBroadcast", "FastClosures", "FillArrays", "FunctionWrappersWrappers", "InteractiveUtils", "LinearAlgebra", "Logging", "MacroTools", "MuladdMacro", "Polyester", "PrecompileTools", "Preferences", "RecursiveArrayTools", "Reexport", "SciMLBase", "SciMLOperators", "SciMLStructures", "SimpleUnPack", "Static", 
"StaticArrayInterface", "StaticArraysCore", "TruncatedStacktraces"] -git-tree-sha1 = "33e4292e832d439c3706410ae867f3c091d79155" +git-tree-sha1 = "f4b7b11e2e4c0f4a7fe15d2edcec1e1ce2917d67" uuid = "bbf590c4-e513-4bbe-9b18-05decba2e5d8" -version = "1.7.0" +version = "1.7.1" weakdeps = ["EnzymeCore"] [deps.OrdinaryDiffEqCore.extensions] @@ -1741,6 +1920,12 @@ git-tree-sha1 = "645bed98cd47f72f67316fd42fc47dee771aefcd" uuid = "1d0040c9-8b98-4ee7-8388-3f51789ca0ad" version = "0.2.2" +[[deps.PooledArrays]] +deps = ["DataAPI", "Future"] +git-tree-sha1 = "36d8b4b899628fb92c2749eb488d884a926614d3" +uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" +version = "1.4.3" + [[deps.PositiveFactorizations]] deps = ["LinearAlgebra"] git-tree-sha1 = "17275485f373e6673f7e7f97051f703ed5b15b20" @@ -1771,6 +1956,12 @@ git-tree-sha1 = "9306f6085165d270f7e3db02af26a400d580f5c6" uuid = "21216c6a-2e73-6563-6e65-726566657250" version = "1.4.3" +[[deps.PrettyTables]] +deps = ["Crayons", "LaTeXStrings", "Markdown", "PrecompileTools", "Printf", "Reexport", "StringManipulation", "Tables"] +git-tree-sha1 = "1101cd475833706e4d0e7b122218257178f48f34" +uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" +version = "2.4.0" + [[deps.Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" @@ -1816,6 +2007,18 @@ deps = ["SHA"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" version = "1.11.0" +[[deps.Random123]] +deps = ["Random", "RandomNumbers"] +git-tree-sha1 = "4743b43e5a9c4a2ede372de7061eed81795b12e7" +uuid = "74087812-796a-5b5d-8853-05524746bad3" +version = "1.7.0" + +[[deps.RandomNumbers]] +deps = ["Random"] +git-tree-sha1 = "c6ec94d2aaba1ab2ff983052cf6a606ca5985902" +uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" +version = "1.6.0" + [[deps.RecipesBase]] deps = ["PrecompileTools"] git-tree-sha1 = "5c3d09cc4f31f5fc6af001c250bf1278733100ff" @@ -1970,6 +2173,12 @@ git-tree-sha1 = "3bac05bc7e74a75fd9cba4295cde4045d9fe2386" uuid = "6c6a2e73-6563-6170-7368-637461726353" version = "1.2.1" +[[deps.SentinelArrays]] +deps = ["Dates", "Random"] +git-tree-sha1 = "ff11acffdb082493657550959d4feb4b6149e73a" +uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c" +version = "1.4.5" + [[deps.Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" version = "1.11.0" @@ -2041,10 +2250,10 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" version = "1.11.0" [[deps.SparseConnectivityTracer]] -deps = ["ADTypes", "DocStringExtensions", "FillArrays", "LinearAlgebra", "Random", "Requires", "SparseArrays"] -git-tree-sha1 = "e5efbf152d3e14a513f19a9119b810340b7ac86b" +deps = ["ADTypes", "DocStringExtensions", "FillArrays", "LinearAlgebra", "Random", "SparseArrays"] +git-tree-sha1 = "35d346f6aa78814859f1f69cd4f41c24108afa00" uuid = "9f842d2f-2579-4b1d-911e-f412cf18a3f5" -version = "0.6.6" +version = "0.6.7" [deps.SparseConnectivityTracer.extensions] SparseConnectivityTracerDataInterpolationsExt = "DataInterpolations" @@ -2061,10 +2270,10 @@ version = "0.6.6" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" [[deps.SparseDiffTools]] -deps = ["ADTypes", "Adapt", "ArrayInterface", "Compat", "DataStructures", "FiniteDiff", "ForwardDiff", "Graphs", "LinearAlgebra", "PackageExtensionCompat", "Random", "Reexport", "SciMLOperators", "Setfield", "SparseArrays", "StaticArrayInterface", "StaticArrays", "Tricks", "UnPack", "VertexSafeGraphs"] -git-tree-sha1 = "7db12cef226aaa8ca730040c9c35e32b86a69b83" +deps = ["ADTypes", "Adapt", "ArrayInterface", "Compat", "DataStructures", "FiniteDiff", "ForwardDiff", "Graphs", "LinearAlgebra", "PackageExtensionCompat", 
"Random", "Reexport", "SciMLOperators", "Setfield", "SparseArrays", "StaticArrayInterface", "StaticArrays", "UnPack", "VertexSafeGraphs"] +git-tree-sha1 = "b906758c107b049b6b71599b9f928d9b14e5554a" uuid = "47a9eef4-7e08-11e9-0b38-333d64bd3804" -version = "2.22.0" +version = "2.23.0" [deps.SparseDiffTools.extensions] SparseDiffToolsEnzymeExt = "Enzyme" @@ -2081,10 +2290,14 @@ version = "2.22.0" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [[deps.SparseMatrixColorings]] -deps = ["ADTypes", "Compat", "DataStructures", "DocStringExtensions", "LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "ad17e2069015839e58a1f9275608b405fae1f28e" +deps = ["ADTypes", "DataStructures", "DocStringExtensions", "LinearAlgebra", "Random", "SparseArrays"] +git-tree-sha1 = "ccc32032d8f6790ad198c99fb8ef20d8100a0de4" uuid = "0a514795-09f3-496d-8182-132a7b665d35" -version = "0.4.6" +version = "0.4.7" +weakdeps = ["Colors"] + + [deps.SparseMatrixColorings.extensions] + SparseMatrixColoringsColorsExt = "Colors" [[deps.Sparspak]] deps = ["Libdl", "LinearAlgebra", "Logging", "OffsetArrays", "Printf", "SparseArrays", "Test"] @@ -2174,6 +2387,12 @@ git-tree-sha1 = "b765e46ba27ecf6b44faf70df40c57aa3a547dcb" uuid = "69024149-9ee7-55f6-a4c4-859efe599b68" version = "0.3.7" +[[deps.StringManipulation]] +deps = ["PrecompileTools"] +git-tree-sha1 = "a6b1675a536c5ad1a60e5a5153e1fee12eb146e3" +uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e" +version = "0.4.0" + [[deps.StructTypes]] deps = ["Dates", "UUIDs"] git-tree-sha1 = "159331b30e94d7b11379037feeb9b690950cace8" @@ -2262,17 +2481,18 @@ git-tree-sha1 = "be986ad9dac14888ba338c2554dcfec6939e1393" uuid = "d5829a12-d9aa-46ab-831f-fb7c9ab06edf" version = "0.2.1" -[[deps.Tricks]] -git-tree-sha1 = "7822b97e99a1672bfb1b49b668a6d46d58d8cbcb" -uuid = "410a4b4d-49e4-4fbc-ab6d-cb71b17b3775" -version = "0.1.9" - [[deps.TruncatedStacktraces]] deps = ["InteractiveUtils", "MacroTools", "Preferences"] git-tree-sha1 = "ea3e54c2bdde39062abf5a9758a23735558705e1" uuid = "781d530d-4396-4725-bb49-402e4bee1e77" version = "1.4.0" +[[deps.TypedSyntax]] +deps = ["CodeTracking", "JuliaSyntax"] +git-tree-sha1 = "1465a8187b3d512a99fef13244c213b54e34615d" +uuid = "d265eb64-f81a-44ad-a842-4247ee1503de" +version = "1.4.2" + [[deps.URIs]] git-tree-sha1 = "67db6cc7b3821e19ebe75791a9dd19c9b1188f2b" uuid = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" @@ -2315,6 +2535,17 @@ git-tree-sha1 = "975c354fcd5f7e1ddcc1f1a23e6e091d99e99bc8" uuid = "45397f5d-5981-4c77-b2b3-fc36d6e9b728" version = "1.6.4" +[[deps.UnsafeAtomics]] +git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" +uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" +version = "0.2.1" + +[[deps.UnsafeAtomicsLLVM]] +deps = ["LLVM", "UnsafeAtomics"] +git-tree-sha1 = "2d17fabcd17e67d7625ce9c531fb9f40b7c42ce4" +uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" +version = "0.2.1" + [[deps.Unzip]] git-tree-sha1 = "ca0969166a028236229f63514992fc073799bb78" uuid = "41fe7b60-77ed-43a1-b4f0-825fd5a5650d" @@ -2355,6 +2586,12 @@ git-tree-sha1 = "93f43ab61b16ddfb2fd3bb13b3ce241cafb0e6c9" uuid = "2381bf8a-dfd0-557d-9999-79630e7b1b91" version = "1.31.0+0" +[[deps.WidthLimitedIO]] +deps = ["Unicode"] +git-tree-sha1 = "71142739e695823729a335e9bc124ef41ec1433a" +uuid = "b8c1c048-cf81-46c6-9da0-18c1d99e41f2" +version = "1.0.1" + [[deps.WriteVTK]] deps = ["Base64", "CodecZlib", "FillArrays", "LightXML", "TranscodingStreams", "VTKBase"] git-tree-sha1 = "e453383a1486a9020a3323bf3665ea31106ebe9a" @@ -2546,6 +2783,12 @@ git-tree-sha1 = "555d1076590a6cc2fdee2ef1469451f872d8b41b" uuid 
= "3161d3a3-bdf6-5164-811a-617609db77b4" version = "1.5.6+1" +[[deps.demumble_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "6498e3581023f8e530f34760d18f75a69e3a4ea8" +uuid = "1e29f10c-031c-5a83-9565-69cddfc27673" +version = "1.3.0+0" + [[deps.eudev_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gperf_jll"] git-tree-sha1 = "431b678a28ebb559d224c0b6b6d01afce87c51ba" diff --git a/docs/Project.toml b/docs/Project.toml index ead8045637..8b56fa1c2e 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,7 @@ [deps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Changelog = "5217a498-cd5d-4ec6-b8c2-9b85a09b6e3e" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244" diff --git a/docs/src/literate-tutorials/gpu_qp_heat_equation.jl b/docs/src/literate-tutorials/gpu_qp_heat_equation.jl new file mode 100644 index 0000000000..bb0699b26b --- /dev/null +++ b/docs/src/literate-tutorials/gpu_qp_heat_equation.jl @@ -0,0 +1,202 @@ +using Ferrite +using StaticArrays +using SparseArrays +using CUDA + + +left = Tensor{1, 2, Float32}((0, -0)) # define the left bottom corner of the grid. +right = Tensor{1, 2, Float32}((1.0, 1.0)) # define the right top corner of the grid. +grid = generate_grid(Quadrilateral, (20, 20), left, right) + + +ip = Lagrange{RefQuadrilateral, 2}() # define the interpolation function (i.e. Bilinear lagrange) +qr = QuadratureRule{RefQuadrilateral}(Float32, 3) +cellvalues = CellValues(Float32, qr, ip) + + +dh = DofHandler(grid) +add!(dh, :u, ip) +close!(dh); + + +# Standard assembly of the element. +function assemble_element_std!(Ke::Matrix, fe::Vector, cellvalues::CellValues) + n_basefuncs = getnbasefunctions(cellvalues) + ## Loop over quadrature points + for q_point in 1:getnquadpoints(cellvalues) + ## Get the quadrature weight + dΩ = getdetJdV(cellvalues, q_point) + ## Loop over test shape functions + for i in 1:n_basefuncs + δu = shape_value(cellvalues, q_point, i) + ∇δu = shape_gradient(cellvalues, q_point, i) + ## Add contribution to fe + fe[i] += δu * dΩ + ## Loop over trial shape functions + for j in 1:n_basefuncs + ∇u = shape_gradient(cellvalues, q_point, j) + ## Add contribution to Ke + Ke[i, j] += (∇δu ⋅ ∇u) * dΩ + end + end + end + return Ke, fe +end + + +function assemble_element_qpiter!(Ke::Matrix, fe::Vector, cellvalues, cell_coords::AbstractVector) + n_basefuncs = getnbasefunctions(cellvalues) + ## Loop over quadrature points + for qv in Ferrite.QuadratureValuesIterator(cellvalues, cell_coords) + ## Get the quadrature weight + dΩ = getdetJdV(qv) + ## Loop over test shape functions + for i in 1:n_basefuncs + δu = shape_value(qv, i) + ∇δu = shape_gradient(qv, i) + ## Add contribution to fe + fe[i] += δu * dΩ + ## Loop over trial shape functions + for j in 1:n_basefuncs + ∇u = shape_gradient(qv, j) + ## Add contribution to Ke + Ke[i, j] += (∇δu ⋅ ∇u) * dΩ + end + end + end + return Ke, fe +end + +# Standard global assembly +function assemble_global_qp!(cellvalues, dh::DofHandler, K, f) + n_basefuncs = getnbasefunctions(cellvalues) + Ke = zeros(eltype(K), n_basefuncs, n_basefuncs) + fe = zeros(eltype(f), n_basefuncs) + assembler = start_assemble(K, f) + ## Loop over all cels + for cell in CellIterator(dh) + fill!(Ke, 0) + fill!(fe, 0) + assemble_element_qpiter!(Ke, fe, cellvalues, getcoordinates(cell)) + ## Assemble Ke and fe into K and f + assemble!(assembler, celldofs(cell), 
Ke, fe) + end + return K, f +end + +function assemble_global_std!(cellvalues, dh::DofHandler, K, f) + n_basefuncs = getnbasefunctions(cellvalues) + Ke = zeros(eltype(K), n_basefuncs, n_basefuncs) + fe = zeros(eltype(f), n_basefuncs) + assembler = start_assemble(K, f) + ## Loop over all cels + for cell in CellIterator(dh) + fill!(Ke, 0) + fill!(fe, 0) + ## Reinitialize cellvalues for this cell + reinit!(cellvalues, cell) + ## Compute element contribution + assemble_element_std!(Ke, fe, cellvalues) + ## Assemble Ke and fe into K and f + assemble!(assembler, celldofs(cell), Ke, fe) + end + return K, f +end + + +## gpu version of element assembly +function assemble_element!(Ke, fe, cv, cell) + n_basefuncs = getnbasefunctions(cv) + for qv in Ferrite.QuadratureValuesIterator(cv, getcoordinates(cell)) + ## Get the quadrature weight + dΩ = getdetJdV(qv) + ## Loop over test shape functions + for i in 1:n_basefuncs + δu = shape_value(qv, i) + ∇u = shape_gradient(qv, i) + ## Add contribution to fe + fe[i] += δu * dΩ + ## fe_shared[tx,i] += δu * dΩ + ## Loop over trial shape functions + for j in 1:n_basefuncs + ∇δu = shape_gradient(qv, j) + ## Add contribution to Ke + Ke[i, j] += (∇δu ⋅ ∇u) * dΩ + end + end + end + return +end + + +## gpu version of global assembly +function assemble_gpu!(Kgpu, fgpu, dh, cv; mem_alloc::AbstractMemAlloc) + assembler = start_assemble(Kgpu, fgpu; fillzero = false) ## has to be always false + for cell in CellIterator(dh, mem_alloc) + Ke = cellke(cell) + fe = cellfe(cell) + assemble_element!(Ke, fe, cv, cell) + assemble!(assembler, celldofs(cell), Ke, fe) + end + return nothing +end + + +n_basefuncs = getnbasefunctions(cellvalues) |> Int32 + +## Allocate CPU matrix +## K = allocate_matrix(SparseMatrixCSC{Float64, Int64}, dh); +#f = zeros(eltype(K), ndofs(dh)); + + +## Allocate GPU matrix +## commented to pass the test +## Kgpu = allocate_matrix(CUSPARSE.CuSparseMatrixCSC{Float32, Int32}, dh) +## fgpu = CUDA.zeros(Float32, ndofs(dh)); + +n_cells = dh |> get_grid |> getncells |> Int32 + + +# Kernel configuration +## GPU kernel ## +## commented to pass the test +## First init the kernel with the required config. +## gpu_kernel = init_kernel(BackendCUDA, n_cells, n_basefuncs, assemble_gpu!, (Kgpu, fgpu, dh, cellvalues)); +## Then launch the kernel +## gpu_kernel |> launch! 
or gpu_kernel()
+## gpu_kernel()
+
+## CPU kernel ##
+## cpu_kernel = init_kernel(BackendCPU, n_cells, n_basefuncs, assemble_gpu!, (K, f, dh, cellvalues));
+## cpu_kernel()
+
+
+## commented to pass the test
+## norm(Kgpu)
+
+
+## GPU Benchmarking, remove when not needed ##
+## function setup_bench_gpu(n_cells, n_basefuncs, cellvalues, dh)
+##     Kgpu = allocate_matrix(CUSPARSE.CuSparseMatrixCSC{Float32, Int32}, dh)
+##     fgpu = CUDA.zeros(eltype(Kgpu), ndofs(dh));
+##     gpu_kernel = init_kernel(BackendCUDA, n_cells, n_basefuncs, assemble_gpu!, (Kgpu, fgpu, dh, cellvalues))
+## end
+
+## CUDA.@time setup_bench_gpu(n_cells, n_basefuncs, cellvalues, dh)
+## CUDA.@profile trace = true setup_bench_gpu(n_cells, n_basefuncs, cellvalues, dh)
+## CUDA.@time gpu_kernel()
+## CUDA.@profile trace = true gpu_kernel()
+
+
+# ## CPU Benchmarking, remove when not needed ##
+## function setup_bench_cpu(dh)
+##     K = allocate_matrix(SparseMatrixCSC{Float64, Int}, dh)
+##     f = zeros(eltype(K), ndofs(dh));
+##     return K, f
+## end
+
+## using BenchmarkTools
+## @benchmark setup_bench_cpu($dh)
+## K, f = setup_bench_cpu(dh)
+## @benchmark assemble_global_std!($cellvalues, $dh, $K, $f)
+## @benchmark assemble_global_qp!($cellvalues, $dh, $K, $f)
diff --git a/ext/FerriteCuda.jl b/ext/FerriteCuda.jl
new file mode 100644
index 0000000000..47f9d5c8e4
--- /dev/null
+++ b/ext/FerriteCuda.jl
@@ -0,0 +1,21 @@
+module FerriteCuda
+# This module represents an extension of Ferrite.jl that uses a GPU backend for assembly, namely CUDA.jl
+
+using Ferrite
+using CUDA
+using Adapt
+using Base:
+    @propagate_inbounds
+using SparseArrays:
+    AbstractSparseArray, SparseMatrixCSC
+using StaticArrays:
+    SVector, MVector
+
+
+include("GPU/gpu_assembler.jl")
+include("GPU/cuda_mem_alloc.jl")
+include("GPU/CUDAKernelLauncher.jl")
+include("GPU/cuda_iterator.jl")
+include("GPU/adapt.jl")
+include("GPU/cuda_sparsity_pattern.jl")
+end
diff --git a/ext/GPU/CUDAKernelLauncher.jl b/ext/GPU/CUDAKernelLauncher.jl
new file mode 100644
index 0000000000..02d5ad85e1
--- /dev/null
+++ b/ext/GPU/CUDAKernelLauncher.jl
@@ -0,0 +1,87 @@
+## This file handles the launch of GPU kernels on the CUDA backend ##
+abstract type AbstractCudaKernel <: Ferrite.AbstractKernel{BackendCUDA} end
+
+
+struct CudaKernel{MemAlloc <: AbstractCudaMemAlloc, Ti <: Integer} <: AbstractCudaKernel
+    n_cells::Ti
+    n_basefuncs::Ti
+    kernel::Function
+    args::Tuple
+    mem_alloc::MemAlloc
+    threads::Ti
+    blocks::Ti
+end
+
+
+function Ferrite.init_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
+    if CUDA.functional()
+        threads = convert(Ti, min(n_cells, 256))
+        blocks = _calculate_nblocks(threads, n_cells)
+        adapted_args = _adapt_args(args)
+
+        mem_alloc = try_allocate_shared_mem(Float32, threads, n_basefuncs)
+        mem_alloc isa Nothing || return CudaKernel(n_cells, n_basefuncs, kernel, adapted_args, mem_alloc, threads, blocks)
+
+        # FIXME: mem_alloc adapted twice here and in launch!
+        mem_alloc = allocate_global_mem(Float32, n_cells, n_basefuncs) |> _adapt
+        return CudaKernel(n_cells, n_basefuncs, kernel, adapted_args, mem_alloc, threads, blocks)
+    else
+        throw(ArgumentError("CUDA is not functional, please check your GPU driver and CUDA installation"))
+    end
+end
+
+"""
+    Ferrite.launch!(kernel::CudaKernel)
+
+Launch a CUDA kernel encapsulated in a `CudaKernel` object.
+
+# Arguments
+- `kernel::CudaKernel`: The kernel to be launched, along with its configuration.
+ +# Returns +- `nothing`: Indicates that the kernel was launched and synchronized successfully. +""" +function Ferrite.launch!(kernel::CudaKernel{SharedMemAlloc{N, M, Tv, Ti}, Ti}) where {N, M, Tv, Ti} + ker = kernel.kernel + args = kernel.args + blocks = kernel.blocks + threads = kernel.threads + shmem_size = mem_size(kernel.mem_alloc) + kwargs = (mem_alloc = kernel.mem_alloc,) + kernel_fun = () -> ker(args...; kwargs...) + CUDA.@sync @cuda blocks = blocks threads = threads shmem = shmem_size kernel_fun() + + return nothing +end + + +function Ferrite.launch!(kernel::CudaKernel{GlobalMemAlloc{LOCAL_MATRICES, LOCAL_VECTORS}, Ti}) where {LOCAL_MATRICES, LOCAL_VECTORS, Ti} + ker = kernel.kernel + args = kernel.args + blocks = kernel.blocks + threads = kernel.threads + kwargs = (mem_alloc = kernel.mem_alloc,) + kernel_fun = () -> ker(args...; kwargs...) + CUDA.@sync @cuda blocks = blocks threads = threads kernel_fun() + return nothing +end + +""" + _calculate_nblocks(threads::Integer, n_cells::Integer) + +Calculate the number of blocks required for kernel execution. + +# Arguments +- `threads::Integer`: Number of threads per block. +- `n_cells::Integer`: Total number of cells to process. + +# Returns +- `Integer`: Number of blocks to launch. +""" +function _calculate_nblocks(threads::Ti, n_cells::Ti) where {Ti <: Integer} + dev = device() + no_sms = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) + required_blocks = cld(n_cells, threads) + required_blocks < 2 * no_sms || return convert(Ti, 2 * no_sms) + return convert(Ti, required_blocks) +end diff --git a/ext/GPU/adapt.jl b/ext/GPU/adapt.jl new file mode 100644 index 0000000000..8779f1f7d1 --- /dev/null +++ b/ext/GPU/adapt.jl @@ -0,0 +1,108 @@ +# This file defines the adapt_structure function, which is used to adapt custom structures to be used on the GPU. + +Adapt.@adapt_structure GPUGrid +Adapt.@adapt_structure GPUDofHandler +Adapt.@adapt_structure GlobalMemAlloc + +function _adapt_args(args) + return tuple(((_adapt(arg) for arg in args) |> collect)...) 
+end + + +function _adapt(kgpu::CUSPARSE.CuSparseMatrixCSC) + # custom adaptation + return Adapt.adapt_structure(CUSPARSE.CuSparseDeviceMatrixCSC, kgpu) +end + + +function _adapt(obj::Any) + # fallback to the default implementation + return Adapt.adapt_structure(CuArray, obj) +end + +## Adapt GlobalMemAlloc +function Adapt.adapt_structure(to, mem_alloc::GlobalMemAlloc) + kes = Adapt.adapt_structure(to, mem_alloc.Kes) + fes = Adapt.adapt_structure(to, mem_alloc.fes) + return GlobalMemAlloc(kes, fes) +end + + +function Adapt.adapt_structure(to, cv::CellValues) + fv = Adapt.adapt(to, StaticInterpolationValues(cv.fun_values)) + gm = Adapt.adapt(to, StaticInterpolationValues(cv.geo_mapping)) + weights = Adapt.adapt(to, ntuple(i -> getweights(cv.qr)[i], getnquadpoints(cv))) + return Ferrite.StaticCellValues(fv, gm, weights) +end + + +function Adapt.adapt_structure(to, iter::Ferrite.QuadratureValuesIterator) + cv = Adapt.adapt_structure(to, iter.v) + cell_coords = Adapt.adapt_structure(to, iter.cell_coords) + return Ferrite.QuadratureValuesIterator(cv, cell_coords) +end + +function Adapt.adapt_structure(to, qv::Ferrite.StaticQuadratureValues) + det = Adapt.adapt_structure(to, qv.detJdV) + N = Adapt.adapt_structure(to, qv.N) + dNdx = Adapt.adapt_structure(to, qv.dNdx) + M = Adapt.adapt_structure(to, qv.M) + return Ferrite.StaticQuadratureValues(det, N, dNdx, M) +end +function Adapt.adapt_structure(to, qv::StaticQuadratureView) + mapping = Adapt.adapt_structure(to, qv.mapping) + cell_coords = Adapt.adapt_structure(to, qv.cell_coords |> cu) + q_point = Adapt.adapt_structure(to, qv.q_point) + cv = Adapt.adapt_structure(to, qv.cv) + return StaticQuadratureView(mapping, cell_coords, q_point, cv) +end + +function Adapt.adapt_structure(to, grid::Grid) + # map Int64 to Int32 to reduce number of registers + cu_cells = grid.cells .|> (x -> Int32.(x.nodes)) .|> Quadrilateral |> cu + cells = Adapt.adapt_structure(to, cu_cells) + nodes = Adapt.adapt_structure(to, cu(grid.nodes)) + return GPUGrid(cells, nodes) +end + + +function Adapt.adapt_structure(to, iterator::CUDACellIterator) + grid = Adapt.adapt_structure(to, iterator.grid) + dh = Adapt.adapt_structure(to, iterator.dh) + ncells = Adapt.adapt_structure(to, iterator.n_cells) + return GPUCellIterator(dh, grid, ncells) +end + + +function _get_ndofs_cell(dh::DofHandler) + ndofs_cell = [Int32(Ferrite.ndofs_per_cell(dh, i)) for i in 1:(dh |> Ferrite.get_grid |> Ferrite.getncells)] + return ndofs_cell +end + +_symbols_to_int32(symbols) = 1:length(symbols) .|> (sym -> convert(Int32, sym)) + +function Adapt.adapt_structure(to, sdh::SubDofHandler) + cellset = Adapt.adapt_structure(to, sdh.cellset |> collect .|> (x -> convert(Int32, x)) |> cu) + field_names = Adapt.adapt_structure(to, _symbols_to_int32(sdh.field_names) |> cu) + field_interpolations = sdh.field_interpolations .|> (ip -> Adapt.adapt_structure(to, ip)) |> cu + ndofs_per_cell = Adapt.adapt_structure(to, sdh.ndofs_per_cell) + return GPUSubDofHandler(cellset, field_names, field_interpolations, ndofs_per_cell) +end + +function Adapt.adapt_structure(to, dh::DofHandler) + subdofhandlers = dh.subdofhandlers .|> (sdh -> Adapt.adapt_structure(to, sdh)) |> cu + cell_dofs = Adapt.adapt_structure(to, dh.cell_dofs .|> (x -> convert(Int32, x)) |> cu) + cells = Adapt.adapt_structure(to, dh.grid.cells |> cu) + offsets = Adapt.adapt_structure(to, dh.cell_dofs_offset .|> Int32 |> cu) + nodes = Adapt.adapt_structure(to, dh.grid.nodes |> cu) + #ndofs_cell = Adapt.adapt_structure(to, _get_ndofs_cell(dh) |> cu) + 
cell_to_subdofhandler = Adapt.adapt_structure(to, dh.cell_to_subdofhandler .|> (x -> convert(Int32, x)) |> cu) + return GPUDofHandler(subdofhandlers, cell_dofs, GPUGrid(cells, nodes), offsets, cell_to_subdofhandler) +end + +## TODO: remove this...not needed +function Adapt.adapt_structure(to, assembler::GPUAssemblerSparsityPattern) + K = Adapt.adapt_structure(to, assembler.K) + f = Adapt.adapt_structure(to, assembler.f) + return Ferrite.GPUAssemblerSparsityPattern(K, f) +end diff --git a/ext/GPU/cuda_iterator.jl b/ext/GPU/cuda_iterator.jl new file mode 100644 index 0000000000..9f193bfedc --- /dev/null +++ b/ext/GPU/cuda_iterator.jl @@ -0,0 +1,188 @@ +##### GPUCellIterator ##### + +""" + AbstractCUDACellIterator <: Ferrite.AbstractKernelCellIterator + +Abstract type representing CUDA cell iterators for finite element computations +on the GPU. It provides the base for implementing multiple cell iteration strategies (e.g. with shared memory, with global memory). +""" +abstract type AbstractCUDACellIterator <: Ferrite.AbstractKernelCellIterator end + + +ncells(iterator::AbstractCUDACellIterator) = iterator.n_cells ## any subtype has to have `n_cells` field + + +struct CUDACellIterator{DH <: Ferrite.GPUDofHandler, GRID <: Ferrite.AbstractGPUGrid, Ti <: Integer, MatrixType, VectorType} <: AbstractCUDACellIterator + dh::DH + grid::GRID + n_cells::Ti + cell_ke::MatrixType + cell_fe::VectorType +end + +struct CudaOutOfBoundCellIterator <: AbstractCUDACellIterator end # used to handle the case for out of bound threads (global memory only) + + +function Ferrite.CellIterator(dh::Ferrite.GPUDofHandler, buffer_alloc::GlobalMemAlloc) + grid = get_grid(dh) + n_cells = grid |> getncells |> Int32 + bd = blockDim().x + local_thread_id = threadIdx().x + global_thread_id = (blockIdx().x - Int32(1)) * bd + local_thread_id + global_thread_id <= n_cells || return CudaOutOfBoundCellIterator() + cell_ke = cellke(buffer_alloc, global_thread_id) + cell_fe = cellfe(buffer_alloc, global_thread_id) + return CUDACellIterator(dh, grid, n_cells, cell_ke, cell_fe) +end + + +function Ferrite.CellIterator(dh::Ferrite.GPUDofHandler, buffer_alloc::SharedMemAlloc) + grid = get_grid(dh) + n_cells = grid |> getncells |> Int32 + block_ke = buffer_alloc.Ke() + block_fe = buffer_alloc.fe() + local_thread_id = threadIdx().x + cell_ke = @view block_ke[local_thread_id, :, :] + cell_fe = @view block_fe[local_thread_id, :] + return CUDACellIterator(dh, grid, n_cells, cell_ke, cell_fe) +end + + +function Base.iterate(iterator::CUDACellIterator) + i = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x + i <= ncells(iterator) || return nothing + return (_makecache(iterator, i), i) +end + + +function Base.iterate(iterator::CUDACellIterator, state) + stride = blockDim().x * gridDim().x + i = state + stride + i <= ncells(iterator) || return nothing + return (_makecache(iterator, i), i) +end + + +Base.iterate(::CudaOutOfBoundCellIterator) = nothing + +Base.iterate(::CudaOutOfBoundCellIterator, state) = nothing # I believe this is not necessary + + +struct GPUCellCache{Ti <: Integer, DOFS <: AbstractVector{Ti}, NN, NODES <: SVector{NN, Ti}, X, COORDS <: SVector{X}, MatrixType, VectorType} <: Ferrite.AbstractKernelCellCache + coords::COORDS + dofs::DOFS + cellid::Ti + nodes::NODES + ke::MatrixType + fe::VectorType +end + + +function _makecache(iterator::AbstractCUDACellIterator, e::Integer) + dh = iterator.dh + grid = iterator.grid + cellid = e + cell = Ferrite.getcells(grid, e) + + # Extract the node IDs of the cell. 
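+    # The IDs are converted to Int32 below, matching the other GPU index arrays and keeping device register usage low.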
+ nodes = SVector(convert.(Int32, Ferrite.get_node_ids(cell))...) + + # Extract the degrees of freedom for the cell. + dofs = Ferrite.celldofs(dh, e) + + # Get the coordinates of the nodes of the cell. + CT = Ferrite.get_coordinate_type(grid) + N = Ferrite.nnodes(cell) + x = MVector{N, CT}(undef) + for i in eachindex(x) + x[i] = Ferrite.get_node_coordinate(grid, nodes[i]) + end + coords = SVector(x...) + + # Return the GPUCellCache containing the cell's data. + return GPUCellCache(coords, dofs, cellid, nodes, iterator.cell_ke, iterator.cell_fe) +end + +""" + getnodes(cc::GPUCellCache) + +Return the node IDs associated with the current cell in the cache. + +# Arguments +- `cc`: The `GPUCellCache` object. + +# Returns +The node IDs of the current cell. +""" +Ferrite.getnodes(cc::GPUCellCache) = cc.nodes + +""" + getcoordinates(cc::GPUCellCache) + +Return the coordinates of the current cell's nodes. + +# Arguments +- `cc`: The `GPUCellCache` object. + +# Returns +The coordinates of the nodes of the current cell. +""" +Ferrite.getcoordinates(cc::GPUCellCache) = cc.coords + +""" + celldofs(cc::GPUCellCache) + +Return the degrees of freedom (DoFs) for the current cell from the cache. + +# Arguments +- `cc`: The `GPUCellCache` object. + +# Returns +The degrees of freedom (DoFs) associated with the current cell. +""" +Ferrite.celldofs(cc::GPUCellCache) = cc.dofs + +""" + cellid(cc::GPUCellCache) + +Return the ID of the current cell stored in the cache. + +# Arguments +- `cc`: The `GPUCellCache` object. + +# Returns +The ID of the current cell. +""" +Ferrite.cellid(cc::GPUCellCache) = cc.cellid + +""" + Ferrite.cellke(cc::GPUCellCache) + +Access the stiffness matrix (`ke`) of the current cell from shared memory and reset it to zero. + +# Arguments +- `cc`: The `GPUCellCache` object. + +# Returns +The stiffness matrix filled with zeros. +""" +@inline function Ferrite.cellke(cc::GPUCellCache) + ke = cc.ke + return CUDA.fill!(ke, 0.0f0) +end + +""" + Ferrite.cellfe(cc::GPUCellCache) + +Access the force vector (`fe`) of the current cell from shared memory and reset it to zero. + +# Arguments +- `cc`: The `GPUCellCache` object. + +# Returns +The force vector filled with zeros. +""" +@inline function Ferrite.cellfe(cc::GPUCellCache) + fe = cc.fe + return CUDA.fill!(fe, 0.0f0) +end diff --git a/ext/GPU/cuda_mem_alloc.jl b/ext/GPU/cuda_mem_alloc.jl new file mode 100644 index 0000000000..dc30c2040f --- /dev/null +++ b/ext/GPU/cuda_mem_alloc.jl @@ -0,0 +1,53 @@ +struct DynamicSharedMemFunction{N, Tv <: Real, Ti <: Integer} + mem_size::NTuple{N, Ti} + offset::Ti +end + + +function (dsf::DynamicSharedMemFunction{N, Tv, Ti})() where {N, Tv, Ti} + mem_size = dsf.mem_size + offset = dsf.offset + return @cuDynamicSharedMem(Tv, mem_size, offset) +end + +abstract type AbstractCudaMemAlloc <: AbstractMemAlloc end + +struct SharedMemAlloc{N, M, Tv <: Real, Ti <: Integer} <: AbstractCudaMemAlloc + Ke::DynamicSharedMemFunction{N, Tv, Ti} ## block level allocation (i.e. each block will execute this function) + fe::DynamicSharedMemFunction{M, Tv, Ti} ## block level allocation (i.e. 
each block will execute this function) + tot_mem_size::Ti +end + +mem_size(alloc::SharedMemAlloc) = alloc.tot_mem_size + + +function _can_use_dynshmem(required_shmem::Integer) + dev = device() + MAX_DYN_SHMEM = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK) + return required_shmem < MAX_DYN_SHMEM +end + + +function try_allocate_shared_mem(::Type{Tv}, block_dim::Ti, n_basefuncs::Ti) where {Ti <: Integer, Tv <: Real} + shared_mem = convert(Ti, sizeof(Tv) * (block_dim) * (n_basefuncs) * n_basefuncs + sizeof(Tv) * (block_dim) * n_basefuncs) + _can_use_dynshmem(shared_mem) || return nothing + Ke = DynamicSharedMemFunction{3, Tv, Ti}((block_dim, n_basefuncs, n_basefuncs), convert(Ti, 0)) + fe = DynamicSharedMemFunction{2, Tv, Ti}((block_dim, n_basefuncs), convert(Ti, sizeof(Tv) * block_dim * n_basefuncs * n_basefuncs)) + return SharedMemAlloc(Ke, fe, shared_mem) +end + + +struct GlobalMemAlloc{LOCAL_MATRICES, LOCAL_VECTORS} <: AbstractCudaMemAlloc + Kes::LOCAL_MATRICES ## global level allocation (i.e. memory for all blocks -> 3rd order tensor) + fes::LOCAL_VECTORS ## global level allocation (i.e. memory for all blocks -> 2nd order tensor) +end + +cellke(alloc::GlobalMemAlloc, e::Ti) where {Ti <: Integer} = @view alloc.Kes[e, :, :] +cellfe(alloc::GlobalMemAlloc, e::Ti) where {Ti <: Integer} = @view alloc.fes[e, :] + + +function allocate_global_mem(::Type{Tv}, n_cells::Ti, n_basefuncs::Ti) where {Ti <: Integer, Tv <: Real} + Kes = CUDA.zeros(Tv, n_cells, n_basefuncs, n_basefuncs) + fes = CUDA.zeros(Tv, n_cells, n_basefuncs) + return GlobalMemAlloc(Kes, fes) +end diff --git a/ext/GPU/cuda_sparsity_pattern.jl b/ext/GPU/cuda_sparsity_pattern.jl new file mode 100644 index 0000000000..d397d96688 --- /dev/null +++ b/ext/GPU/cuda_sparsity_pattern.jl @@ -0,0 +1,6 @@ +function Ferrite.allocate_matrix(::Type{S}, dh::DofHandler, args...; kwargs...) where {Tv, Ti <: Integer, S <: CUSPARSE.CuSparseMatrixCSC{Tv, Ti}} + ## TODO: decide whether create the matrix from the very beginning or just create cpu version then copy + K = allocate_matrix(SparseMatrixCSC{Tv, Ti}, dh, args...; kwargs...) + Kgpu = CUSPARSE.CuSparseMatrixCSC(K) + return Kgpu +end diff --git a/ext/GPU/gpu_assembler.jl b/ext/GPU/gpu_assembler.jl new file mode 100644 index 0000000000..0fde157992 --- /dev/null +++ b/ext/GPU/gpu_assembler.jl @@ -0,0 +1,77 @@ +## GPU Assembler ### +### First abstract types and interfaces ### + +abstract type AbstractGPUSparseAssembler{Tv, Ti} end + +function Ferrite.assemble!(A::AbstractGPUSparseAssembler, dofs::AbstractVector{Int32}, Ke::MATRIX, fe::VECTOR) where {MATRIX, VECTOR} + throw(NotImplementedError("A concrete implementation of assemble! is required")) +end + + +struct GPUAssemblerSparsityPattern{Tv, Ti, VEC_FLOAT <: AbstractVector{Tv}, SPARSE_MAT <: AbstractSparseArray{Tv, Ti}} <: AbstractGPUSparseAssembler{Tv, Ti} + K::SPARSE_MAT + f::VEC_FLOAT +end + +function Ferrite.start_assemble(K::CUSPARSE.CuSparseDeviceMatrixCSC{Tv, Ti}, f::CuDeviceVector{Tv}; fillzero::Bool = false) where {Tv, Ti} + ## `fillzero` is not required in GPU version and should be always set to false, because different threads will + ## instantiate their own instance of assembler, so it it's set to true, the results will be wrong. + ## However, the interface is kept the same for consistency with the CPU version (in CPU multithreading we use the standard assembler). 
+ fillzero && (fillzero!(K); fillzero!(f)) + return GPUAssemblerSparsityPattern(K, f) +end + + +""" + assemble!(A::GPUAssemblerSparsityPattern, dofs::AbstractVector{Int32}, Ke::MATRIX, fe::VECTOR) + +Assembles the global stiffness matrix `Ke` and the global force vector `fe` into the the global stiffness matrix `K` and the global force vector `f` of the `GPUAssemblerSparsityPattern` object `A`. + +""" +@propagate_inbounds function Ferrite.assemble!(A::GPUAssemblerSparsityPattern, dofs::AbstractVector{Int32}, Ke::MATRIX, fe::VECTOR) where {MATRIX, VECTOR} + # Note: MATRIX and VECTOR are cuda dynamic shared memory + return _assemble!(A, dofs, Ke, fe) +end + + +function _assemble!(A::GPUAssemblerSparsityPattern, dofs::AbstractVector{Int32}, Ke::MATRIX, fe::VECTOR) where {MATRIX, VECTOR} + # # Brute force assembly + K = A.K + f = A.f + for i in 1:length(dofs) + ig = dofs[i] + CUDA.@atomic f[ig] += fe[i] + for j in 1:length(dofs) + jg = dofs[j] + # set the value of the global matrix + _add_to_index!(K, Ke[i, j], ig, jg) + end + end + return +end + +@inline function _add_to_index!(K::AbstractSparseArray{Tv, Ti}, v::Tv, i::Int32, j::Int32) where {Tv, Ti} + col_start = K.colPtr[j] + col_end = K.colPtr[j + Int32(1)] - Int32(1) + + for k in col_start:col_end + if K.rowVal[k] == i + # Update the existing element + CUDA.@atomic K.nzVal[k] += v + return + end + end + return +end + + +### Array utils ### +function fillzero!(A::CuDeviceVector{Tv}) where {Tv} + CUDA.fill!(A, zero(Tv)) + return A +end + +function fillzero!(A::CUSPARSE.CuSparseDeviceMatrixCSC{Tv, Ti}) where {Tv, Ti} + CUDA.fill!(A.nzVal, zero(Tv)) + return A +end diff --git a/heatflow_qp_values.jl b/heatflow_qp_values.jl new file mode 100644 index 0000000000..e6de08232f --- /dev/null +++ b/heatflow_qp_values.jl @@ -0,0 +1,177 @@ +using Ferrite + +# Standard element routine +function assemble_element_std!(Ke::Matrix, fe::Vector, cellvalues::CellValues) + n_basefuncs = getnbasefunctions(cellvalues) + ## Loop over quadrature points + for q_point in 1:getnquadpoints(cellvalues) + ## Get the quadrature weight + dΩ = getdetJdV(cellvalues, q_point) + ## Loop over test shape functions + for i in 1:n_basefuncs + δu = shape_value(cellvalues, q_point, i) + ∇δu = shape_gradient(cellvalues, q_point, i) + ## Add contribution to fe + fe[i] += δu * dΩ + ## Loop over trial shape functions + for j in 1:n_basefuncs + ∇u = shape_gradient(cellvalues, q_point, j) + ## Add contribution to Ke + Ke[i, j] += (∇δu ⋅ ∇u) * dΩ + end + end + end + return Ke, fe +end + +# Element routine using QuadratureValuesIterator +function assemble_element_qpiter!(Ke::Matrix, fe::Vector, cellvalues) + n_basefuncs = getnbasefunctions(cellvalues) + ## Loop over quadrature points + for qv in Ferrite.QuadratureValuesIterator(cellvalues) + ## Get the quadrature weight + dΩ = getdetJdV(qv) + ## Loop over test shape functions + for i in 1:n_basefuncs + δu = shape_value(qv, i) + ∇δu = shape_gradient(qv, i) + ## Add contribution to fe + fe[i] += δu * dΩ + ## Loop over trial shape functions + for j in 1:n_basefuncs + ∇u = shape_gradient(qv, j) + ## Add contribution to Ke + Ke[i, j] += (∇δu ⋅ ∇u) * dΩ + end + end + end + return Ke, fe +end + +function assemble_element_qpiter!(Ke::Matrix, fe::Vector, cellvalues, cell_coords::AbstractVector) + n_basefuncs = getnbasefunctions(cellvalues) + ## Loop over quadrature points + for qv in Ferrite.QuadratureValuesIterator(cellvalues, cell_coords) + ## Get the quadrature weight + dΩ = getdetJdV(qv) + ## Loop over test shape functions + for i in 
1:n_basefuncs + δu = shape_value(qv, i) + ∇δu = shape_gradient(qv, i) + ## Add contribution to fe + fe[i] += δu * dΩ + ## Loop over trial shape functions + for j in 1:n_basefuncs + ∇u = shape_gradient(qv, j) + ## Add contribution to Ke + Ke[i, j] += (∇δu ⋅ ∇u) * dΩ + end + end + end + return Ke, fe +end + +function assemble_global(cellvalues, dh; kwargs...) + return assemble_global!(create_buffers(cellvalues, dh), cellvalues, dh; kwargs...) +end + +function assemble_global!(buffer, cellvalues, dh::DofHandler; qp_iter::Val{QPiter}, reinit::Val{ReInit}) where {QPiter, ReInit} + (; f, K, assembler, Ke, fe) = buffer + for cell in CellIterator(dh) + fill!(Ke, 0) + fill!(fe, 0) + if QPiter + if ReInit + reinit!(cellvalues, getcoordinates(cell)) + assemble_element_qpiter!(Ke, fe, cellvalues) + else + assemble_element_qpiter!(Ke, fe, cellvalues, getcoordinates(cell)) + end + else + reinit!(cellvalues, getcoordinates(cell)) + assemble_element_std!(Ke, fe, cellvalues) + end + assemble!(assembler, celldofs(cell), Ke, fe) + end + return K, f +end + +function create_buffers(cellvalues, dh) + f = zeros(ndofs(dh)) + K = create_sparsity_pattern(dh) + assembler = start_assemble(K, f) + ## Local quantities + n_basefuncs = getnbasefunctions(cellvalues) + Ke = zeros(n_basefuncs, n_basefuncs) + fe = zeros(n_basefuncs) + return (; f, K, assembler, Ke, fe) +end + +n = 50 +grid = generate_grid(Quadrilateral, (n, n)); +ip = Lagrange{RefQuadrilateral, 1}() +qr = QuadratureRule{RefQuadrilateral}(2) + +dh = DofHandler(grid) + +add!(dh, :u, ip) +close!(dh); + +cellvalues = CellValues(qr, ip); + +static_cellvalues = Ferrite.StaticCellValues(cellvalues) + +stdassy(buffer, cv, dh) = assemble_global!(buffer, cv, dh; qp_iter = Val(false), reinit = Val(false)) +qp_outside(buffer, cv, dh) = assemble_global!(buffer, cv, dh; qp_iter = Val(true), reinit = Val(true)) +qp_inside(buffer, cv, dh) = assemble_global!(buffer, cv, dh; qp_iter = Val(true), reinit = Val(false)) + +Kstd, fstd = stdassy(create_buffers(cellvalues, dh), cellvalues, dh); +using LinearAlgebra +norm(Kstd) +K_qp_o, f_qp_o = qp_outside(create_buffers(cellvalues, dh), cellvalues, dh); +norm(K_qp_o) +K_qp_i, f_qp_i = qp_inside(create_buffers(cellvalues, dh), cellvalues, dh); +norm(K_qp_i) +cvs_o = Ferrite.StaticCellValues(cellvalues, Val(true)) # Save cell_coords in cvs_o +Ks_o, fs_o = qp_outside(create_buffers(cvs_o, dh), cvs_o, dh); +norm(Ks_o) +cvs_i = Ferrite.StaticCellValues(cellvalues, Val(false)) # Don't save cell_coords in cvs_o +Ks_i, fs_i = qp_inside(create_buffers(cvs_i, dh), cvs_i, dh); +norm(Ks_i) +using Test +@testset "check outputs" begin + for (k, K, f) in (("qpo", K_qp_o, f_qp_o), ("qpi", K_qp_i, f_qp_i), ("so", Ks_o, fs_o), ("si", Ks_i, fs_i)) + @testset "$k" begin + @test K ≈ Kstd + @test f ≈ fstd + end + end +end + +# Benchmarking +using BenchmarkTools +if n ≤ 100 + print("Standard: ") + @btime stdassy(buffer, $cellvalues, $dh) setup = (buffer = create_buffers(cellvalues, dh)) + print("Std qpoint outside: ") + @btime qp_outside(buffer, $cellvalues, $dh) setup = (buffer = create_buffers(cellvalues, dh)) + print("Std qpoint inside: ") + @btime qp_inside(buffer, $cellvalues, $dh) setup = (buffer = create_buffers(cellvalues, dh)) + print("Static outside: ") + @btime qp_outside(buffer, $cvs_o, $dh) setup = (buffer = create_buffers(cvs_o, dh)) + print("Static inside: ") + @btime qp_inside(buffer, $cvs_i, $dh) setup = (buffer = create_buffers(cvs_i, dh)) +else + buffer = create_buffers(cellvalues, dh) + print("Standard: ") + @time stdassy(buffer, 
cellvalues, dh) + print("Std qpoint outside: ") + @time qp_outside(buffer, cellvalues, dh) + print("Std qpoint inside: ") + @time qp_inside(buffer, cellvalues, dh) + print("Static outside: ") + @time qp_outside(buffer, cvs_o, dh) + print("Static inside: ") + @time qp_inside(buffer, cvs_i, dh) +end +nothing diff --git a/src/FEValues/QuadratureValues.jl b/src/FEValues/QuadratureValues.jl new file mode 100644 index 0000000000..888dc3f3d0 --- /dev/null +++ b/src/FEValues/QuadratureValues.jl @@ -0,0 +1,141 @@ +# QuadratureValuesIterator +struct QuadratureValuesIterator{VT, XT} + v::VT + cell_coords::XT # Union{AbstractArray{<:Vec}, Nothing} + function QuadratureValuesIterator(v::V) where {V} + return new{V, Nothing}(v, nothing) + end + function QuadratureValuesIterator(v::V, cell_coords::VT) where {V, VT <: AbstractArray} + #reinit!(v, cell_coords) # Why we need that ? + return new{V, VT}(v, cell_coords) + end +end + +function Base.iterate(iterator::QuadratureValuesIterator{<:Any, Nothing}, q_point = 1) + checkbounds(Bool, 1:getnquadpoints(iterator.v), q_point) || return nothing + qp_v = @inbounds quadrature_point_values(iterator.v, q_point) + return (qp_v, q_point + 1) +end + +function Base.iterate(iterator::QuadratureValuesIterator{<:Any, <:AbstractVector}, q_point = 1) + checkbounds(Bool, 1:getnquadpoints(iterator.v), q_point) || return nothing + #q_point < 5 || return nothing + qp_v = @inbounds quadrature_point_values(iterator.v, q_point, iterator.cell_coords) + return (qp_v, q_point + 1) + #return (1, q_point+1) +end +Base.IteratorEltype(::Type{<:QuadratureValuesIterator}) = Base.EltypeUnknown() +Base.length(iterator::QuadratureValuesIterator) = getnquadpoints(iterator.v) + +# AbstractQuadratureValues +abstract type AbstractQuadratureValues end + +function function_value(qp_v::AbstractQuadratureValues, u::AbstractVector, dof_range = eachindex(u)) + n_base_funcs = getnbasefunctions(qp_v) + length(dof_range) == n_base_funcs || throw_incompatible_dof_length(length(dof_range), n_base_funcs) + @boundscheck checkbounds(u, dof_range) + val = function_value_init(qp_v, u) + @inbounds for (i, j) in pairs(dof_range) + val += shape_value(qp_v, i) * u[j] + end + return val +end + +function function_gradient(qp_v::AbstractQuadratureValues, u::AbstractVector, dof_range = eachindex(u)) + n_base_funcs = getnbasefunctions(qp_v) + length(dof_range) == n_base_funcs || throw_incompatible_dof_length(length(dof_range), n_base_funcs) + @boundscheck checkbounds(u, dof_range) + grad = function_gradient_init(qp_v, u) + @inbounds for (i, j) in pairs(dof_range) + grad += shape_gradient(qp_v, i) * u[j] + end + return grad +end + +function function_symmetric_gradient(qp_v::AbstractQuadratureValues, u::AbstractVector, dof_range) + grad = function_gradient(qp_v, u, dof_range) + return symmetric(grad) +end + +function function_symmetric_gradient(qp_v::AbstractQuadratureValues, u::AbstractVector) + grad = function_gradient(qp_v, u) + return symmetric(grad) +end + +function function_divergence(qp_v::AbstractQuadratureValues, u::AbstractVector, dof_range = eachindex(u)) + return divergence_from_gradient(function_gradient(qp_v, u, dof_range)) +end + +function function_curl(qp_v::AbstractQuadratureValues, u::AbstractVector, dof_range = eachindex(u)) + return curl_from_gradient(function_gradient(qp_v, u, dof_range)) +end + +function spatial_coordinate(qp_v::AbstractQuadratureValues, x::AbstractVector{<:Vec}) + n_base_funcs = getngeobasefunctions(qp_v) + length(x) == n_base_funcs || throw_incompatible_coord_length(length(x), 
n_base_funcs) + vec = zero(eltype(x)) + @inbounds for i in 1:n_base_funcs + vec += geometric_value(qp_v, i) * x[i] + end + return vec +end + +# Specific design for QuadratureValues <: AbstractQuadratureValues +# which contains standard AbstractValues +struct QuadratureValues{VT <: AbstractValues} <: AbstractQuadratureValues + v::VT + q_point::Int + Base.@propagate_inbounds function QuadratureValues(v::AbstractValues, q_point::Int) + @boundscheck checkbounds(1:getnbasefunctions(v), q_point) + return new{typeof(v)}(v, q_point) + end +end + +@inline quadrature_point_values(fe_v::AbstractValues, q_point, args...) = QuadratureValues(fe_v, q_point) + +@propagate_inbounds getngeobasefunctions(qv::QuadratureValues) = getngeobasefunctions(qv.v) +@propagate_inbounds geometric_value(qv::QuadratureValues, i) = geometric_value(qv.v, qv.q_point, i) +geometric_interpolation(qv::QuadratureValues) = geometric_interpolation(qv.v) + +getdetJdV(qv::QuadratureValues) = @inbounds getdetJdV(qv.v, qv.q_point) + +# Accessors for function values +getnbasefunctions(qv::QuadratureValues) = getnbasefunctions(qv.v) +function_interpolation(qv::QuadratureValues) = function_interpolation(qv.v) +function_difforder(qv::QuadratureValues) = function_difforder(qv.v) +shape_value_type(qv::QuadratureValues) = shape_value_type(qv.v) +shape_gradient_type(qv::QuadratureValues) = shape_gradient_type(qv.v) + +@propagate_inbounds shape_value(qv::QuadratureValues, i::Int) = shape_value(qv.v, qv.q_point, i) +@propagate_inbounds shape_gradient(qv::QuadratureValues, i::Int) = shape_gradient(qv.v, qv.q_point, i) +@propagate_inbounds shape_symmetric_gradient(qv::QuadratureValues, i::Int) = shape_symmetric_gradient(qv.v, qv.q_point, i) + + +#= Proposed syntax, for heatflow in general +function assemble_element!(Ke::Matrix, fe::Vector, cellvalues) + n_basefuncs = getnbasefunctions(cellvalues) + for qv in Ferrite.QuadratureValuesIterator(cellvalues) + dΩ = getdetJdV(qv) + for i in 1:n_basefuncs + δu = shape_value(qv, i) + ∇δu = shape_gradient(qv, i) + fe[i] += δu * dΩ + for j in 1:n_basefuncs + ∇u = shape_gradient(qv, j) + Ke[i, j] += (∇δu ⋅ ∇u) * dΩ + end + end + end + return Ke, fe +end + +Where the default for a QuadratureValuesIterator would be to return a +`QuadratureValues` as above, but custom `AbstractValues` can be created where +for example the element type would be a static QuadPointValue type which doesn't +use heap allocated buffers, e.g. by only saving the cell and coordinates during reinit, +and then calculating all values for each element in the iterator. 
+ +References: +https://github.com/termi-official/Thunderbolt.jl/pull/53/files#diff-2b486be5a947c02ef2a38ff3f82af3141193af0b6f01ed9d5129b914ed1d84f6 +https://github.com/Ferrite-FEM/Ferrite.jl/compare/master...kam/StaticValues2 +=# diff --git a/src/FEValues/StaticCellValues.jl b/src/FEValues/StaticCellValues.jl new file mode 100644 index 0000000000..5ce13996d3 --- /dev/null +++ b/src/FEValues/StaticCellValues.jl @@ -0,0 +1,165 @@ +struct StaticQuadratureValues{T, N_t, dNdx_t, M_t, NumN, NumM} <: AbstractQuadratureValues + detJdV::T + N::SVector{NumN, N_t} + dNdx::SVector{NumN, dNdx_t} + M::SVector{NumM, M_t} +end + +@propagate_inbounds getngeobasefunctions(qv::StaticQuadratureValues) = length(qv.M) +@propagate_inbounds geometric_value(qv::StaticQuadratureValues, i) = qv.M[i] +# geometric_interpolation(qv::StaticQuadratureValues) = geometric_interpolation(qv.v) # Not included + +getdetJdV(qv::StaticQuadratureValues) = qv.detJdV + +# Accessors for function values +getnbasefunctions(qv::StaticQuadratureValues) = length(qv.N) +# function_interpolation(qv::StaticQuadratureValues) = function_interpolation(qv.v) # Not included +shape_value_type(::StaticQuadratureValues{<:Any, N_t}) where {N_t} = N_t +shape_gradient_type(::StaticQuadratureValues{<:Any, <:Any, dNdx_t}) where {dNdx_t} = dNdx_t + +@propagate_inbounds shape_value(qv::StaticQuadratureValues, i::Int) = qv.N[i] +@propagate_inbounds shape_gradient(qv::StaticQuadratureValues, i::Int) = qv.dNdx[i] +@propagate_inbounds shape_gradient(qv::StaticQuadratureValues, i::Int32) = qv.dNdx[i] # Needed for GPU (threadIdx is Int32), otherwise it will throw a dynamic function invokation error +@propagate_inbounds shape_symmetric_gradient(qv::StaticQuadratureValues, i::Int) = symmetric(qv.dNdx[i]) + +@propagate_inbounds geometric_value(qv::StaticQuadratureValues, i::Int) = qv.M[i] + +# StaticInterpolationValues: interpolation and precalculated values for all quadrature points +# Can be both for function and geometric shape functions. +# DiffOrder parameter? +# TODO: Could perhaps denote this just InterpolationValues and replace GeometryMapping +# Just need to make Nξ::AbstractMatrix instead as in GeometryMapping to make it equivalent (except fieldnames) +struct StaticInterpolationValues{IP, N, Nqp, N_et, dNdξ_t, Nall} + ip::IP + Nξ::SMatrix{N, Nqp, N_et, Nall} + dNdξ::dNdξ_t # Union{SMatrix{N, Nqp}, Nothing} + #dN2dξ2::dN2dξ2_t # Union{SMatrix{N, Nqp}, Nothing} +end +function StaticInterpolationValues(fv::FunctionValues) + N = getnbasefunctions(fv.ip) + Nq = size(fv.Nξ, 2) + Nξ = SMatrix{N, Nq}(fv.Nξ) + dNdξ = SMatrix{N, Nq}(fv.dNdξ) + return StaticInterpolationValues(fv.ip, Nξ, dNdξ) +end +function StaticInterpolationValues(gm::GeometryMapping) + N = getnbasefunctions(gm.ip) + Nq = size(gm.M, 2) + M = SMatrix{N, Nq}(gm.M) + dMdξ = SMatrix{N, Nq}(gm.dMdξ) + return StaticInterpolationValues(gm.ip, M, dMdξ) +end + +getnbasefunctions(siv::StaticInterpolationValues) = getnbasefunctions(siv.ip) + +# Dispatch on DiffOrder parameter? 
+# Reuse functions for GeometryMapping - same signature but need access functions
+# Or merge GeometryMapping and StaticInterpolationValues => InterpolationValues
+@propagate_inbounds @inline function calculate_mapping(ip_values::StaticInterpolationValues{<:Any, N}, q_point, x) where {N}
+    fecv_J = zero(otimes_returntype(eltype(x), eltype(ip_values.dNdξ)))
+    @inbounds for j in 1:N
+        #fecv_J += x[j] ⊗ geo_mapping.dMdξ[j, q_point]
+        fecv_J += otimes_helper(x[j], ip_values.dNdξ[j, q_point])
+    end
+    return MappingValues(fecv_J, nothing)
+end
+
+@propagate_inbounds @inline function calculate_mapped_values(funvals::StaticInterpolationValues, q_point, mapping_values, args...)
+    return calculate_mapped_values(funvals, mapping_type(funvals.ip), q_point, mapping_values, args...)
+end
+
+@propagate_inbounds @inline function calculate_mapped_values(funvals::StaticInterpolationValues, ::IdentityMapping, q_point, mapping_values, args...)
+    Jinv = calculate_Jinv(getjacobian(mapping_values))
+    Nx = funvals.Nξ[:, q_point]
+    dNdx = map(dNdξ -> dothelper(dNdξ, Jinv), funvals.dNdξ[:, q_point])
+    return Nx, dNdx
+end
+
+struct StaticCellValues{FV, GM, Nqp, T}
+    fv::FV # StaticInterpolationValues
+    gm::GM # StaticInterpolationValues
+    #x::Tx # AbstractVector{<:Vec} or Nothing
+    weights::NTuple{Nqp, T}
+end
+function StaticCellValues(cv::CellValues)
+    fv = StaticInterpolationValues(cv.fun_values)
+    gm = StaticInterpolationValues(cv.geo_mapping)
+    sdim = sdim_from_gradtype(shape_gradient_type(cv))
+    #x = SaveCoords ? fill(zero(Vec{sdim}), getngeobasefunctions(cv)) : nothing
+    weights = ntuple(i -> getweights(cv.qr)[i], getnquadpoints(cv))
+    return StaticCellValues(fv, gm, weights)
+end
+
+getnquadpoints(cv::StaticCellValues) = length(cv.weights)
+getnbasefunctions(cv::StaticCellValues) = getnbasefunctions(cv.fv)
+getngeobasefunctions(cv::StaticCellValues) = getnbasefunctions(cv.gm)
+
+@inline function reinit!(cv::StaticCellValues{<:Any, <:Any, <:AbstractVector}, cell_coords::AbstractVector)
+    return copyto!(cv.x, cell_coords)
+    #TODO: Also allow the cell::AbstractCell to be given and updated
+end
+@inline function reinit!(::StaticCellValues{<:Any, <:Any, Nothing}, ::AbstractVector)
+    return nothing # Nothing to do on reinit if x is not saved.
+end + +@inline function quadrature_point_values(fe_v::StaticCellValues{<:Any, <:Any, <:AbstractVector}, q_point::Int) + return _quadrature_point_values(fe_v, q_point, fe_v.x, detJ -> throw_detJ_not_pos(detJ)) +end + +@inline function quadrature_point_values(fe_v::StaticCellValues{<:Any, <:Any}, q_point::Int, cell_coords::AbstractVector) + return _quadrature_point_values(fe_v, q_point, cell_coords, detJ -> throw_detJ_not_pos(detJ)) +end + +@inline function quadrature_point_values(fe_v::StaticCellValues{<:Any, <:Any}, q_point::Int, cell_coords::StaticVector) + return _quadrature_point_values(fe_v, q_point, cell_coords, detJ -> -1) +end + + +function _quadrature_point_values(fe_v::StaticCellValues, q_point::Int, cell_coords::AbstractVector, neg_detJ_err_fun::Function) + #q_point bounds checked, ok to use @inbounds + @inbounds begin + mapping = calculate_mapping(fe_v.gm, q_point, cell_coords) + + detJ = calculate_detJ(getjacobian(mapping)) + detJ > 0.0f0 || neg_detJ_err_fun(detJ) # Cannot throw error on GPU, TODO: return error code instead + detJdV = detJ * fe_v.weights[q_point] + + Nx, dNdx = calculate_mapped_values(fe_v.fv, q_point, mapping) + M = fe_v.gm.Nξ[:, q_point] + end + return StaticQuadratureValues(detJdV, Nx, dNdx, M) +end + + +## New Implementation for StaticQuadratureValues to suit the GPU code +struct StaticQuadratureView{JT, HT, N_Nodes, NODEVEC} <: AbstractQuadratureValues + mapping::MappingValues{JT, HT} + cell_coords::SVector{N_Nodes, NODEVEC} + q_point::Int32 + cv::StaticCellValues +end + + +@inline function quadrature_point_values(fe_v::StaticCellValues{<:Any, <:Any}, q_point::Int32, cell_coords::SVector) + @inbounds begin + mapping = calculate_mapping(fe_v.gm, q_point, cell_coords) + end + return StaticQuadratureView(mapping, cell_coords, q_point, fe_v) +end + + +@inline function getdetJdV(qv::StaticQuadratureView) + detJ = Float32(calculate_detJ(getjacobian(qv.mapping))) + detJ > 0.0f0 || -1.0f0 # Cannot throw error on GPU, TODO: return error code instead + detJdV = detJ * qv.cv.weights[qv.q_point] + return detJdV +end + +@inline function shape_gradient(qv::StaticQuadratureView, i::Int32) + @inbounds begin + Jinv = calculate_Jinv(getjacobian(qv.mapping)) + #Nx = qv.cv.fv.Nξ[i, q_point] + dNdx = dothelper(qv.cv.fv.dNdξ[i, qv.q_point], Jinv) + return dNdx + end +end diff --git a/src/Ferrite.jl b/src/Ferrite.jl index 7b358463be..1812851c1d 100644 --- a/src/Ferrite.jl +++ b/src/Ferrite.jl @@ -16,7 +16,7 @@ using OrderedCollections: using SparseArrays: SparseArrays, SparseMatrixCSC, nonzeros, nzrange, rowvals, AbstractSparseMatrixCSC using StaticArrays: - StaticArrays, MArray, MMatrix, SArray, SMatrix, SVector + StaticVector, StaticArrays, MVector, MArray, MMatrix, SArray, SMatrix, SVector, @SVector using WriteVTK: WriteVTK, VTKCellTypes using Tensors: @@ -130,6 +130,8 @@ include("FEValues/CellValues.jl") include("FEValues/FacetValues.jl") include("FEValues/InterfaceValues.jl") include("FEValues/PointValues.jl") +include("FEValues/QuadratureValues.jl") +include("FEValues/StaticCellValues.jl") include("FEValues/common_values.jl") include("FEValues/face_integrals.jl") @@ -166,4 +168,14 @@ include("PointEvalHandler.jl") include("deprecations.jl") include("docs.jl") +# GPU support +include("GPU/coloring_dof.jl") +include("GPU/KernelLauncher.jl") +include("GPU/CPUKernelLauncher.jl") +include("GPU/gpu_grid.jl") +include("GPU/GPUDofHandler.jl") +include("GPU/parallel_iterator.jl") +include("GPU/mem_alloc.jl") + + end # module diff --git a/src/GPU/CPUKernelLauncher.jl 
b/src/GPU/CPUKernelLauncher.jl
new file mode 100644
index 0000000000..1a3f92ad00
--- /dev/null
+++ b/src/GPU/CPUKernelLauncher.jl
@@ -0,0 +1,66 @@
+abstract type AbstractCPUKernel <: AbstractKernel{BackendCPU} end
+
+
+struct CPUKernel{Ti} <: AbstractCPUKernel
+    n_cells::Ti # Number of cells
+    n_basefuncs::Ti # Number of base functions
+    kernel::Function # Kernel function to execute
+    args::Tuple # Arguments for the kernel function
+    n_colors::Ti
+    dh::ColoringDofHandler
+end
+
+function init_kernel(::Type{BackendCPU}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
+    args, color_dh = _to_colordh(args) # convert the DofHandler to a ColoringDofHandler
+    no_colors = ncolors(color_dh)
+    return CPUKernel(n_cells, n_basefuncs, kernel, args, no_colors, color_dh)
+end
+
+function launch!(kernel::CPUKernel{Ti}) where {Ti}
+    ker = kernel.kernel
+    args = kernel.args
+    ## Naive implementation to circumvent the issue with cellvalues:
+    ## on the GPU we use the static version of CellValues because it is immutable,
+    ## so in order to unify the parallel kernel interface we (for now) search for any
+    ## CellValues passed in the args and convert it to the static version,
+    ## without changing the user routine.
+    cell_index = findfirst(x -> x isa CellValues, args)
+    (cell_index === nothing) || (args = _update_cell_args(args, cell_index))
+    color_dh = kernel.dh
+    no_colors = kernel.n_colors
+    nthreads = Threads.nthreads()
+    for i in 1:no_colors
+        current_color!(color_dh, i)
+        Threads.@threads :static for j in 1:nthreads
+            ker(args...)
+        end
+    end
+    return
+end
+
+(ker::CPUKernel)() = launch!(ker)
+
+function _to_colordh(args::Tuple)
+    dh_index = findfirst(x -> x isa AbstractDofHandler, args)
+    dh_index !== nothing || throw(ErrorException("No subtype of AbstractDofHandler found in the arguments"))
+    arr = args |> collect
+    color_dh = init_colordh(arr[dh_index])
+    arr[dh_index] = color_dh
+    return Tuple(arr), color_dh
+end
+
+function _update_cell_args(args::Tuple, index::Int)
+    ## since tuples are immutable we need to convert it to an array to update the values
+    ## and then convert it back to a tuple
+    arr = args |> collect
+    arr[index] = _to_static_cellvalues(arr[index])
+    return Tuple(arr)
+end
+
+
+function _to_static_cellvalues(cv::CellValues)
+    fv = StaticInterpolationValues(cv.fun_values)
+    gm = StaticInterpolationValues(cv.geo_mapping)
+    weights = ntuple(i -> getweights(cv.qr)[i], getnquadpoints(cv))
+    return StaticCellValues(fv, gm, weights)
+end
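For orientation, a minimal sketch of how this CPU path is meant to be driven (the kernel name and the `kes`/`fes` buffers are placeholders; the call mirrors the CUDA usage in the tests below and is not part of the diff):

    # User kernel: `dh` arrives as a ColoringDofHandler and `cellvalues` as a
    # StaticCellValues after `launch!` has rewritten the argument tuple.
    function my_kernel!(kes, fes, cellvalues, dh)
        for cell in CellIterator(dh, getnbasefunctions(cellvalues))
            # ... fill cellke(cell) / cellfe(cell) with the element routine ...
            kes[cellid(cell), :, :] .= cellke(cell)
            fes[cellid(cell), :] .= cellfe(cell)
        end
        return nothing
    end

    init_kernel(BackendCPU, ncells, n_basefuncs, my_kernel!, (kes, fes, cellvalues, dh)) |> launch!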
diff --git a/src/GPU/GPUDofHandler.jl b/src/GPU/GPUDofHandler.jl
new file mode 100644
index 0000000000..dbfe8d1217
--- /dev/null
+++ b/src/GPU/GPUDofHandler.jl
@@ -0,0 +1,83 @@
+# This file defines the `GPUDofHandler` type, which is a degree of freedom handler that is stored on the GPU.
+# Therefore most of the functions are the same as the ones defined in dof_handler.jl, but executable on the GPU.
+
+"""
+    AbstractGPUDofHandler <: Ferrite.AbstractDofHandler
+
+Abstract type representing degree-of-freedom (DoF) handlers for GPU-based
+finite element computations. This serves as the base type for GPU-specific
+DoF handler implementations.
+"""
+abstract type AbstractGPUDofHandler <: AbstractDofHandler end
+
+struct GPUSubDofHandler{VEC_INT, Ti, VEC_IP} <: AbstractGPUDofHandler
+    cellset::VEC_INT
+    field_names::VEC_INT # cannot use Symbols on the GPU
+    field_interpolations::VEC_IP
+    ndofs_per_cell::Ti
+end
+
+## IDEA: to have multiple interfaces for dofhandlers (e.g. one domain dofhandler, multiple subdomains)
+struct GPUDofHandler{SUB_DOFS <: AbstractArray{<:AbstractGPUDofHandler, 1}, CDOFS <: AbstractArray{<:Number, 1}, VEC_INT <: AbstractArray{Int32, 1}, GRID <: AbstractGrid} <: AbstractGPUDofHandler
+    subdofhandlers::SUB_DOFS
+    cell_dofs::CDOFS
+    grid::GRID
+    cell_dofs_offset::VEC_INT
+    cell_to_subdofhandler::VEC_INT
+end
+
+
+function ndofs_per_cell(dh::GPUDofHandler, cell::Ti) where {Ti <: Integer}
+    sdhidx = dh.cell_to_subdofhandler[cell]
+    sdhidx ∉ 1:length(dh.subdofhandlers) && return 0 # Dof handler is only defined on a subdomain
+    return ndofs_per_cell(dh.subdofhandlers[sdhidx])
+end
+ndofs_per_cell(sdh::GPUSubDofHandler) = sdh.ndofs_per_cell
+cell_dof_offset(dh::GPUDofHandler, i::Int32) = dh.cell_dofs_offset[i]
+get_grid(dh::GPUDofHandler) = dh.grid
+
+"""
+    celldofs(dh::GPUDofHandler, i::Int32)
+
+Return the cell degrees of freedom (DoFs) for the given cell index `i` in the
+`GPUDofHandler`.
+
+# Arguments
+- `dh`: A `GPUDofHandler` instance.
+- `i::Int32`: The index of the cell.
+
+# Returns
+A `SubArray` (view) representing the DoFs for the specified cell.
+"""
+function celldofs(dh::GPUDofHandler, i::Int32)
+    offset = cell_dof_offset(dh, i)
+    ndofs = ndofs_per_cell(dh, i)
+    view = @view dh.cell_dofs[offset:(offset + ndofs - Int32(1))]
+    return view
+end
+
+
+# TODO: Delete all below this line
+"""
+    LocalsGPUDofHandler{DH, LOCAL_MATRICES, LOCAL_VECTORS} <: AbstractGPUDofHandler
+
+This object acts as a temporary data structure for storing local stiffness matrices and force vectors
+when dynamic shared memory doesn't have enough space. It is used by GPU cell iterators inside GPU kernels.
+
+# Fields
+- `dh::DH`: Base DoF handler (e.g., `GPUDofHandler`).
+- `Kes::LOCAL_MATRICES`: Local stiffness matrices for each cell (3rd order tensor).
+- `fes::LOCAL_VECTORS`: Local force vectors for each cell (2nd order tensor).
+"""
+struct LocalsGPUDofHandler{DH <: AbstractDofHandler, LOCAL_MATRICES, LOCAL_VECTORS} <: AbstractGPUDofHandler
+    dh::DH
+    Kes::LOCAL_MATRICES
+    fes::LOCAL_VECTORS
+end
+
+# Accessor functions for LocalsGPUDofHandler
+dofhandler(dh::LocalsGPUDofHandler) = dh.dh
+localkes(dh::LocalsGPUDofHandler) = dh.Kes
+localfes(dh::LocalsGPUDofHandler) = dh.fes
+cellke(dh::LocalsGPUDofHandler, i::Int32) = @view dh.Kes[i, :, :]
+cellfe(dh::LocalsGPUDofHandler, i::Int32) = @view dh.fes[i, :]
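A small illustration of how these accessors are meant to be used inside a kernel (hypothetical `gpu_dh`/`locals_dh` objects; indices must be `Int32` on the device):

    i = Int32(cell_index)               # some cell index
    n = ndofs_per_cell(gpu_dh, i)       # 0 if the cell lies outside the subdomain
    dofs = celldofs(gpu_dh, i)          # view into gpu_dh.cell_dofs
    Ke = cellke(locals_dh, i)           # per-cell buffers when shared memory is insufficient
    fe = cellfe(locals_dh, i)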
diff --git a/src/GPU/KernelLauncher.jl b/src/GPU/KernelLauncher.jl
new file mode 100644
index 0000000000..578d8c574f
--- /dev/null
+++ b/src/GPU/KernelLauncher.jl
@@ -0,0 +1,56 @@
+#=
+This file defines the interface between the GPU backend (extension) and the Ferrite package.
+It provides abstract types, function signatures, and concrete types for managing GPU kernels
+and backends, serving as a foundation for GPU-accelerated computations.
+=#
+
+### Abstract Types ###
+abstract type AbstractBackend end
+abstract type AbstractKernel{BKD <: AbstractBackend} end
+
+
+### Functions ###
+
+"""
+    init_kernel(backend::AbstractBackend, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
+
+Initializes a kernel with the specified backend, number of cells, number of base functions,
+kernel function, and additional arguments.
+
+# Arguments
+- `backend::AbstractBackend`: The backend to use for kernel execution.
+- `n_cells::Ti`: Number of cells to be processed by the kernel.
+- `n_basefuncs::Ti`: Number of base functions for each cell.
+- `kernel::Function`: The kernel function to execute.
+- `args::Tuple`: Additional arguments required by the kernel.
+
+# Notes
+This function needs to be implemented for each specific backend. Calling this function
+without a concrete implementation will raise an error.
+"""
+function init_kernel(backend::AbstractBackend, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
+    throw(ErrorException("A concrete implementation of init_kernel is required"))
+end
+
+"""
+    launch!(kernel::AbstractKernel)
+
+Launches a kernel using the specified backend. This interface provides a generic
+mechanism for running accelerated computations across different backends.
+
+# Arguments
+- `kernel::AbstractKernel`: The kernel to be launched.
+
+# Notes
+This function must be implemented for each concrete kernel type. If not implemented,
+an error will be thrown.
+"""
+function launch!(::AbstractKernel)
+    throw(ErrorException("A concrete implementation of launch! is required"))
+end
+
+(ker::AbstractKernel)() = launch!(ker)
+
+### Backends ###
+struct BackendCUDA <: AbstractBackend end
+struct BackendCPU <: AbstractBackend end
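A new backend plugs into this interface by subtyping `AbstractBackend` and adding methods for the two functions above. Schematically (placeholder names; the CUDA version lives in the package extension and the CPU version in CPUKernelLauncher.jl):

    struct BackendFoo <: AbstractBackend end

    struct FooKernel{Ti} <: AbstractKernel{BackendFoo}
        n_cells::Ti
        n_basefuncs::Ti
        kernel::Function
        args::Tuple
    end

    init_kernel(::Type{BackendFoo}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer} =
        FooKernel(n_cells, n_basefuncs, kernel, args)

    launch!(k::FooKernel) = k.kernel(k.args...)   # backend-specific scheduling goes here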
diff --git a/src/GPU/coloring_dof.jl b/src/GPU/coloring_dof.jl
new file mode 100644
index 0000000000..122a9313e6
--- /dev/null
+++ b/src/GPU/coloring_dof.jl
@@ -0,0 +1,27 @@
+"""
+    ColoringDofHandler{Ti<:Integer,VECS<:Vector{Vector{Ti}},DH<:AbstractDofHandler}
+
+A mutable struct that encapsulates a DofHandler and different colors for the cells, in order to be used
+in a CPU multithreading scheme.
+"""
+mutable struct ColoringDofHandler{Ti <: Integer, VECS <: Vector{Vector{Ti}}, DH <: AbstractDofHandler}
+    dh::DH
+    colors::VECS
+    current_color::Ti
+end
+
+
+function init_colordh(dh::AbstractDofHandler)
+    grid = get_grid(dh)
+    colors = create_coloring(grid)
+    return ColoringDofHandler(dh, colors, 0)
+end
+
+
+## Accessors ##
+dofhandler(cd::ColoringDofHandler) = cd.dh
+colors(cd::ColoringDofHandler) = cd.colors
+eles_in_color(cd::ColoringDofHandler, color::Ti) where {Ti <: Integer} = cd.colors[color]
+current_color(cd::ColoringDofHandler) = cd.current_color
+current_color!(cd::ColoringDofHandler, color::Ti) where {Ti <: Integer} = (cd.current_color = color)
+ncolors(cd::ColoringDofHandler) = cd |> colors |> length
diff --git a/src/GPU/gpu_grid.jl b/src/GPU/gpu_grid.jl
new file mode 100644
index 0000000000..5139864983
--- /dev/null
+++ b/src/GPU/gpu_grid.jl
@@ -0,0 +1,45 @@
+# This file defines the GPUGrid type, which is a grid that is stored on the GPU. Therefore most of the
+# functions are the same as the ones defined in grid.jl, but executable on the GPU.
+
+abstract type AbstractGPUGrid{dim} <: AbstractGrid{dim} end
+
+struct GPUGrid{dim, CELLVEC <: AbstractArray, NODEVEC <: AbstractArray} <: AbstractGPUGrid{dim}
+    cells::CELLVEC
+    nodes::NODEVEC
+end
+
+function GPUGrid(
+        cells::CELLVEC,
+        nodes::NODEVEC
+    ) where {C <: AbstractCell, CELLVEC <: AbstractArray{C, 1}, NODEVEC <: AbstractArray{Node{dim, T}}} where {dim, T}
+    return GPUGrid{dim, CELLVEC, NODEVEC}(cells, nodes)
+end
+
+get_coordinate_type(::GPUGrid{dim, CELLVEC, NODEVEC}) where
+{C <: AbstractCell, CELLVEC <: AbstractArray{C, 1}, NODEVEC <: AbstractArray{Node{dim, T}}} where
+{dim, T} = Vec{dim, T} # Node is baked into the mesh type.
+
+
+# Note: For functions that take blockIdx as an argument, we need to use Int32 explicitly,
+# otherwise the compiler will not be able to infer the type of the argument and will throw a dynamic function invocation error.
+@inline getcells(grid::GPUGrid, v::Union{Int32, Vector{Int32}}) = grid.cells[v]
+@inline getnodes(grid::GPUGrid, v::Int32) = grid.nodes[v]
+
+"""
+    getcoordinates(grid::Ferrite.GPUGrid, e::Int32)
+
+Return the coordinates of the nodes of the element `e` in the `GPUGrid` `grid`.
+"""
+function getcoordinates(grid::GPUGrid, e::Int32)
+    # e is the element index.
+    CT = get_coordinate_type(grid)
+    cell = getcells(grid, e)
+    N = nnodes(cell)
+    x = MVector{N, CT}(undef) # local array to store the coordinates of the nodes of the cell.
+    node_ids = get_node_ids(cell)
+    for i in 1:length(x)
+        x[i] = get_node_coordinate(grid, node_ids[i])
+    end
+
+    return SVector(x...)
+end
diff --git a/src/GPU/mem_alloc.jl b/src/GPU/mem_alloc.jl
new file mode 100644
index 0000000000..52cd136c18
--- /dev/null
+++ b/src/GPU/mem_alloc.jl
@@ -0,0 +1,4 @@
+## This file defines an interface for memory allocation on the GPU that
+## is used by local stiffness matrix and force vector assembly.
+
+abstract type AbstractMemAlloc end
diff --git a/src/GPU/parallel_iterator.jl b/src/GPU/parallel_iterator.jl
new file mode 100644
index 0000000000..a2fe261b87
--- /dev/null
+++ b/src/GPU/parallel_iterator.jl
@@ -0,0 +1,153 @@
+# This file defines the abstract types and interfaces for GPU iterators.
+# The concrete implementations are defined in the extension.
+
+# abstract types and interfaces
+abstract type AbstractIterator end
+abstract type AbstractCellCache end
+
+abstract type AbstractKernelCellCache <: AbstractCellCache end
+abstract type AbstractKernelCellIterator <: AbstractIterator end
+
+@inline function cellke(::AbstractKernelCellCache)
+    throw(ArgumentError("cellke should be implemented in the derived type"))
+end
+
+@inline function cellfe(::AbstractKernelCellCache)
+    throw(ArgumentError("cellfe should be implemented in the derived type"))
+end
+
+
+## Concrete Implementation for CPU Multithreading ##
+
+
+##### CPUKernelCellCache #####
+mutable struct CPUKernelCellCache{G <: AbstractGrid, X, Tv <: Real} <: AbstractKernelCellCache
+    const flags::UpdateFlags
+    const grid::G
+    const dh::AbstractDofHandler
+    const coords::Vector{X}
+    const dofs::Vector{Int}
+    cellid::Int
+    const nodes::Vector{Int}
+    const ke::Matrix{Tv}
+    const fe::Vector{Tv}
+end
+
+
+function CellCache(dh::DofHandler{dim}, n_basefuncs::Int, flags::UpdateFlags = UpdateFlags()) where {dim}
+    ke = zeros(Float64, n_basefuncs, n_basefuncs)
+    fe = zeros(Float64, n_basefuncs)
+    grid = dh |> get_grid
+    N = nnodes_per_cell(grid, 1) # nodes and coords will be resized in `reinit!`
+    nodes = zeros(Int, N)
+    coords = zeros(Vec{dim, Float64}, N)
+    return CPUKernelCellCache(flags, grid, dh, coords, Int[], -1, nodes, ke, fe)
+end
+
+## TODO: to be removed (kept for reference)
+# function _makecache(iterator::CPUKernelCellIterator, e::Ti) where {Ti <: Integer}
+#     dh = iterator.dh |> dofhandler
+#     grid = iterator.grid
+#     cellid = e
+#     cell = getcells(grid, e)
+
+#     # Extract the node IDs of the cell.
+#     nodes = SVector(get_node_ids(cell)...)
+
+#     # Extract the degrees of freedom for the cell.
+#     dofs = celldofs(dh, e)
+#     # Get the coordinates of the nodes of the cell.
+#     CT = get_coordinate_type(grid)
+#     N = nnodes(cell)
+#     x = MVector{N, CT}(undef)
+#     for i in eachindex(x)
+#         x[i] = get_node_coordinate(grid, nodes[i])
+#     end
+#     coords = SVector(x...)
+
+#     # Return the GPUCellCache containing the cell's data.
+# return CPUKernelCellCache(coords, dofs, cellid, nodes, iterator.ke, iterator.fe) +# end + +function _reinit!(cc::CPUKernelCellCache, i::Int) + cc.cellid = i + fill!(cc.ke, zero(eltype(cc.ke))) + fill!(cc.fe, zero(eltype(cc.fe))) + if cc.flags.nodes + resize!(cc.nodes, nnodes_per_cell(cc.grid, i)) + cellnodes!(cc.nodes, cc.grid, i) + end + if cc.flags.coords + resize!(cc.coords, nnodes_per_cell(cc.grid, i)) + getcoordinates!(cc.coords, cc.grid, i) + end + if cc.flags.dofs + resize!(cc.dofs, ndofs_per_cell(cc.dh, i)) + celldofs!(cc.dofs, cc.dh, i) + end + return cc +end + +## Accessors ## +getnodes(cc::CPUKernelCellCache) = cc.nodes + + +getcoordinates(cc::CPUKernelCellCache) = cc.coords + + +celldofs(cc::CPUKernelCellCache) = cc.dofs + + +cellid(cc::CPUKernelCellCache) = cc.cellid + + +cellke(cc::CPUKernelCellCache) = cc.ke + +cellfe(cc::CPUKernelCellCache) = cc.fe + + +##### CPUKernelCellIterator ##### +struct CPUKernelCellIterator{CC <: CPUKernelCellCache, DH <: ColoringDofHandler} <: AbstractKernelCellIterator + cache::CC + dh::DH + n_cells::Int + thread_id::Int # thread id that the iterator is working on +end + + +function CellIterator(dh::ColoringDofHandler, n_basefuncs::Int) + grid = dh |> dofhandler |> get_grid + n_cells = grid |> getncells + cache = CellCache(dh |> dofhandler, n_basefuncs) + local_thread_id = Threads.threadid() + return CPUKernelCellIterator(cache, dh, n_cells, local_thread_id) +end + + +ncells(iterator::CPUKernelCellIterator) = iterator.n_cells +_cache(iterator::CPUKernelCellIterator) = iterator.cache + + +function Base.iterate(iterator::CPUKernelCellIterator) + i = iterator.thread_id + curr_color = iterator.dh |> current_color # current color that's being processed + eles_color = eles_in_color(iterator.dh, curr_color) # elements in the current color + ncells = length(eles_color) + i <= ncells || return nothing + cache = _cache(iterator) + _reinit!(cache, eles_color[i]) + return (cache, i) +end + + +function Base.iterate(iterator::CPUKernelCellIterator, state) + stride = Threads.nthreads() + i = state + stride # next strided element id + curr_color = iterator.dh |> current_color # current color that's being processed + eles_color = eles_in_color(iterator.dh, curr_color) # elements in the current color + ncells = length(eles_color) + i <= ncells || return nothing + cache = _cache(iterator) + _reinit!(cache, eles_color[i]) + return (cache, i) +end diff --git a/src/exports.jl b/src/exports.jl index c710861c15..07a3c684d4 100644 --- a/src/exports.jl +++ b/src/exports.jl @@ -104,6 +104,7 @@ export addcellset!, transform_coordinates!, generate_grid, + GPUGrid, # Grid coloring create_coloring, @@ -122,6 +123,7 @@ export DofOrder, evaluate_at_grid_nodes, apply_analytical!, + get_grid, # Sparsity pattern # AbstractSparsityPattern, @@ -184,4 +186,25 @@ export evaluate_at_points, PointIterator, PointLocation, - PointValues + PointValues, + + # GPU + LazyKernel, + StaticQuadratureView, + StaticInterpolationValues, + init_kernel, + launch!, + cellke, + cellfe, + GPUSubDofHandler, + GPUDofHandler, + GPUGrid, + getweights, + getbackend, + BackendCUDA, + BackendCPU, + LocalsGPUDofHandler, + dofhandler, + localfes, + localkes, + AbstractMemAlloc diff --git a/test/GPU/runtests.jl b/test/GPU/runtests.jl index 7d317327a5..3f92322ffd 100644 --- a/test/GPU/runtests.jl +++ b/test/GPU/runtests.jl @@ -1,5 +1,12 @@ -using CUDA using Ferrite +using CUDA using Test +using SparseArrays + +include("test_utils.jl") -@test CUDA.functional() +# Unit tests +include("test_assemble.jl") 
+include("test_iterator.jl") +include("test_kernellauncher.jl") +include("test_adapt.jl") diff --git a/test/GPU/test_adapt.jl b/test/GPU/test_adapt.jl new file mode 100644 index 0000000000..e8bc9cea6e --- /dev/null +++ b/test/GPU/test_adapt.jl @@ -0,0 +1,66 @@ +function dofs_cpu(dh, cv) + nbasefuncs = cv |> getnbasefunctions + ncells = dh |> get_grid |> getncells + dofs = zeros(Int32, nbasefuncs, ncells) + for i in 1:ncells + cdofs = celldofs(dh, i) + dofs[:, i] .= cdofs + end + return dofs +end + + +function dofs_gpu_kernel(dofs, dh, cv; mem_alloc) + nbasefuncs = cv |> getnbasefunctions + for cell in CellIterator(dh, mem_alloc) + cdofs = celldofs(cell) + dofs[:, cellid(cell)] .= cdofs + end + return nothing +end + +weights_cpu(cv) = cv.qr |> getweights + +function weights_gpu_kernel(weights, cv) + nweights = length(weights) + for i in 1:nweights + weights[i] = cv.weights[i] + end + return +end + +function nodes_cpu(grid) + nodes = grid.cells .|> (x -> x.nodes |> collect) + return hcat(nodes...) +end + +function nodes_gpu_kernel(nodes, dh, cv; mem_alloc) + nbasefuncs = cv |> getnbasefunctions + for cell in CellIterator(dh, mem_alloc) + cnodes = getnodes(cell) + nodes[:, cellid(cell)] .= cnodes + end + return nothing +end + +@testset "Adapt" begin + dh, cv = generate_Bilinear_problem() + cpudofs = dofs_cpu(dh, cv) |> cu + ncells = dh |> get_grid |> getncells + nbasefunctions = cv |> getnbasefunctions + gpudofs = zeros(Int32, nbasefunctions, ncells) |> cu + init_kernel(BackendCUDA, ncells, nbasefunctions, dofs_gpu_kernel, (gpudofs, dh, cv)) |> launch! + ## Test that dofs are correctly transfered to the GPU + @test all(cpudofs .== gpudofs) + ## Test that weights are correctly transfered to the GPU + cpuweights = weights_cpu(cv) |> cu + gpuweights = zeros(Float32, length(cpuweights)) |> cu + @cuda blocks = 1 threads = 1 weights_gpu_kernel(gpuweights, cv) + @test all(cpuweights .== gpuweights) + ## Test that nodes are correctly transfered to the GPU + cpunodes = nodes_cpu(dh |> get_grid) |> cu + n_nodes = length(cpunodes) + gpu_cellnodes = CUDA.zeros(Int32, nbasefunctions, ncells) + init_kernel(BackendCUDA, ncells, nbasefunctions, nodes_gpu_kernel, (gpu_cellnodes, dh, cv)) |> launch! + @test all(cpunodes .== gpu_cellnodes) +end diff --git a/test/GPU/test_assemble.jl b/test/GPU/test_assemble.jl new file mode 100644 index 0000000000..d49211622a --- /dev/null +++ b/test/GPU/test_assemble.jl @@ -0,0 +1,43 @@ +# Helper function to initialize sparse matrices +function init_sparse_matrix(n, m) + row_indices = Int32[] + col_indices = Int32[] + values = Float32[] + for i in 1:min(n, m) + push!(row_indices, i) + push!(col_indices, i) + push!(values, Float32(0.0)) + end + sparse_matrix = sparse(row_indices, col_indices, values, n, m) + return sparse_matrix +end + + +function assemble_kernel!(K, f, dofs, Ke, fe) + # kernel that only assembles local into global. + A = start_assemble(K, f) + return assemble!(A, dofs, Ke, fe) +end + +# Test for assembling global stiffness matrix and force vector +@testset "Test assemble!" 
begin + # System parameters + n = 5 + m = 5 + dofs = Int32[1, 2, 3, 4, 5] + Ke = CUDA.fill(1.0f0, n, m) # Local stiffness matrix (shared memory) + fe = CUDA.fill(1.0f0, n) # Local force vector (shared memory) + + # Initialize global stiffness matrix and global force vector + K = CUSPARSE.CuSparseMatrixCSC(init_sparse_matrix(n, m)) + + f = CUDA.fill(0.0f0, n) + + @cuda blocks = 5 threads = 5 assemble_kernel!(K, f, cu(dofs), Ke, fe) # 5 * 5 = 25 threads + + # Test: Check force vector update + @test all(f .≈ CUDA.fill(25.0f0, n)) + # Test: Check global stiffness matrix update (values must be incremented by 25 = 5 * 5) + @test all(K.nzVal .≈ CUDA.fill(25.0f0, length(K.nzVal))) + +end diff --git a/test/GPU/test_iterator.jl b/test/GPU/test_iterator.jl new file mode 100644 index 0000000000..3a0b4f8e19 --- /dev/null +++ b/test/GPU/test_iterator.jl @@ -0,0 +1,131 @@ +function getalldofs(dh) + ncells = dh |> get_grid |> getncells + return map(i -> celldofs(dh, i) .|> Int32, 1:ncells) |> (x -> hcat(x...)) |> cu +end + + +function dof_kernel_kernel!(dofs, dh, n_basefuncs; mem_alloc) + # this kernel is used to get all the dofs of the grid, which then + # can be validated against the correct dofs (i.e. CPU version). + for cell in CellIterator(dh, mem_alloc) + cid = cellid(cell) + cdofs = celldofs(cell) + for i in 1:n_basefuncs + dofs[Int32(i), cid] = cdofs[Int32(i)] + end + end + return nothing +end + + +function assemble_element_gpu!(Ke, fe, cv, cell) + n_basefuncs = getnbasefunctions(cv) + for qv in Ferrite.QuadratureValuesIterator(cv, getcoordinates(cell)) + ## Get the quadrature weight + dΩ = getdetJdV(qv) + ## Loop over test shape functions + for i in 1:n_basefuncs + δu = shape_value(qv, i) + ∇u = shape_gradient(qv, i) + ## Add contribution to fe + fe[i] += δu * dΩ + #fe_shared[tx,i] += δu * dΩ + ## Loop over trial shape functions + for j in 1:n_basefuncs + ∇δu = shape_gradient(qv, j) + ## Add contribution to Ke + Ke[i, j] += (∇δu ⋅ ∇u) * dΩ + end + end + end + return +end + + +function assemble_element_cpu!(Ke, fe, cellvalues) + n_basefuncs = getnbasefunctions(cellvalues) + # Loop over quadrature points + for q_point in 1:getnquadpoints(cellvalues) + # Get the quadrature weight + dΩ = getdetJdV(cellvalues, q_point) + # Loop over test shape functions + for i in 1:n_basefuncs + δu = shape_value(cellvalues, q_point, i) + ∇δu = shape_gradient(cellvalues, q_point, i) + # Add contribution to fe + fe[i] += δu * dΩ + # Loop over trial shape functions + for j in 1:n_basefuncs + ∇u = shape_gradient(cellvalues, q_point, j) + # Add contribution to Ke + Ke[i, j] += (∇δu ⋅ ∇u) * dΩ + end + end + end + return Ke, fe +end + +function localkefe_kernel!(kes, fes, cv, dh; mem_alloc) + nbasefuncs = getnbasefunctions(cv) + for cell in CellIterator(dh, mem_alloc) + Ke = cellke(cell) + fe = cellfe(cell) + assemble_element_gpu!(Ke, fe, cv, cell) + kes[cellid(cell), :, :] .= Ke + fes[cellid(cell), :] .= fe + end + return nothing +end + +function get_cpu_kefe(dh, cellvalues) + ncells = dh |> get_grid |> getncells + n_basefuncs = getnbasefunctions(cellvalues) + kes = zeros(Float32, ncells, n_basefuncs, n_basefuncs) + fes = zeros(Float32, ncells, n_basefuncs) + for cell in CellIterator(dh) + cid = cellid(cell) + reinit!(cellvalues, cell) + # Compute element contribution + assemble_element_cpu!((@view kes[cid, :, :]), (@view fes[cid, :, :]), cellvalues) + end + return kes |> cu, fes |> cu +end + +@testset "Test shared memory iterators" begin + dh, cellvalues = generate_Bilinear_problem() + n_basefuncs = 
getnbasefunctions(cellvalues) + # 1 Test that dofs for each cell in the grid are correctly computed + ncells = dh |> get_grid |> getncells + dofs = CUDA.fill(Int32(0), n_basefuncs, ncells) + correct_dofs = getalldofs(dh) + init_kernel(BackendCUDA, ncells, n_basefuncs, dof_kernel_kernel!, (dofs, dh, n_basefuncs)) |> launch! + @test all(dofs .≈ correct_dofs) + + # 2. Test that local ke and fe are correctly computed + kes_gpu = CUDA.fill(0.0f0, ncells, n_basefuncs, n_basefuncs) + fes_gpu = CUDA.fill(0.0f0, ncells, n_basefuncs) + init_kernel(BackendCUDA, ncells, n_basefuncs, localkefe_kernel!, (kes_gpu, fes_gpu, cellvalues, dh)) |> launch! + kes_cpu, fes_cpu = get_cpu_kefe(dh, cellvalues) + @test all(abs.(kes_gpu .- kes_cpu) .< 1.0e-3) #TODO: This needs further investigation + @test all(fes_gpu .≈ fes_cpu) +end + + +@testset "Test global memory iterators" begin + dh, cellvalues = generate_Biquadratic_problem() + n_basefuncs = getnbasefunctions(cellvalues) + # 1 Test that dofs for each cell in the grid are correctly computed + ncells = dh |> get_grid |> getncells + dofs = CUDA.fill(Int32(0), n_basefuncs, ncells) + correct_dofs = getalldofs(dh) + init_kernel(BackendCUDA, ncells, n_basefuncs, dof_kernel_kernel!, (dofs, dh, n_basefuncs)) |> launch! + @test all(dofs .≈ correct_dofs) + + # 2. Test that local ke and fe are correctly computed + kes_gpu = CUDA.fill(0.0f0, ncells, n_basefuncs, n_basefuncs) + fes_gpu = CUDA.fill(0.0f0, ncells, n_basefuncs) + init_kernel(BackendCUDA, ncells, n_basefuncs, localkefe_kernel!, (kes_gpu, fes_gpu, cellvalues, dh)) |> launch! + kes_cpu, fes_cpu = get_cpu_kefe(dh, cellvalues) + @test all(abs.(kes_gpu .- kes_cpu) .< 1.0e-1) #TODO: This needs further investigation + @test all(fes_gpu .≈ fes_cpu) +end diff --git a/test/GPU/test_kernellauncher.jl b/test/GPU/test_kernellauncher.jl new file mode 100644 index 0000000000..64e9294d0e --- /dev/null +++ b/test/GPU/test_kernellauncher.jl @@ -0,0 +1,33 @@ +# Sample CUDA Kernel (adding two vectors) +function kernel_add(A, B, C, n; mem_alloc) + i = threadIdx().x + (blockIdx().x - 1) * blockDim().x + if i <= n + C[i] = A[i] + B[i] + end + return +end + +# Helper function to launch the kernel with CUDAKernelLauncher +function test_launch_kernel!(n_cells::Integer, n_basefuncs::Integer, args...) + return init_kernel(BackendCUDA, n_cells, n_basefuncs, kernel_add, args) |> launch! +end + +# Testing for different integer types +@testset "Testing CUDAKernelLauncher with different integer types" begin + # Arrays for testing + N = 10 + A = CUDA.fill(1.0f0, N) + B = CUDA.fill(2.0f0, N) + C = CUDA.fill(0.0f0, N) + + # Test with Int32 + test_launch_kernel!(Int32(N), Int32(2), A, B, C, N) + CUDA.synchronize() + @test all(Array(C) .== 3.0f0) + + # Test with Int64 + fill!(C, 0.0f0) # reset C array + test_launch_kernel!(Int64(N), Int64(2), A, B, C, N) + CUDA.synchronize() + @test all(Array(C) .== 3.0f0) +end diff --git a/test/GPU/test_utils.jl b/test/GPU/test_utils.jl new file mode 100644 index 0000000000..35d1ac5307 --- /dev/null +++ b/test/GPU/test_utils.jl @@ -0,0 +1,47 @@ +function generate_Bilinear_problem() + left = Tensor{1, 2, Float32}((0, -0)) + + right = Tensor{1, 2, Float32}((rand(10.0:100000.0), rand(10.0:100000.0))) + + grid_dims = (rand(1:1000), rand(1:1000)) + + grid = generate_grid(Quadrilateral, grid_dims, left, right) + + ip = Lagrange{RefQuadrilateral, 1}() # define the interpolation function (i.e. 
Bilinear lagrange) + + qr = QuadratureRule{RefQuadrilateral}(Float32, 2) + + cellvalues = CellValues(Float32, qr, ip) + + dh = DofHandler(grid) + + add!(dh, :u, ip) + + close!(dh) + + return dh, cellvalues +end + +function generate_Biquadratic_problem() + left = Tensor{1, 2, Float32}((0, -0)) + + right = Tensor{1, 2, Float32}((rand(10.0:100000.0), rand(10.0:100000.0))) + + grid_dims = (rand(100:1000), rand(100:1000)) # to make sure the problem is big enough to use `CUDAGlobalCellIterator` + + grid = generate_grid(Quadrilateral, grid_dims, left, right) + + ip = Lagrange{RefQuadrilateral, 2}() # define the interpolation function (i.e. Biquadratic lagrange) + + qr = QuadratureRule{RefQuadrilateral}(Float32, 3) # 3x3 quadrature rule + + cellvalues = CellValues(Float32, qr, ip) + + dh = DofHandler(grid) + + add!(dh, :u, ip) + + close!(dh) + + return dh, cellvalues +end diff --git a/test/runtests.jl b/test/runtests.jl index 57cd82d8a7..3eb60d29ac 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -42,7 +42,7 @@ include("blockarrays.jl") include("test_examples.jl") @test all(x -> isdefined(Ferrite, x), names(Ferrite)) # Test that all exported symbols are defined -# # See which is not defined if fails +# See which is not defined if fails # for name in names(Ferrite) # isdefined(Ferrite, name) || @warn "Ferrite.$name is not defined but $name is exported" # end