diff --git a/.bazelversion b/.bazelversion
index 21c8c7b46b..b26a34e470 100644
--- a/.bazelversion
+++ b/.bazelversion
@@ -1 +1 @@
-7.1.1
+7.2.1
diff --git a/.github/ISSUE_TEMPLATE/1_question.md b/.github/ISSUE_TEMPLATE/1_question.md
new file mode 100644
index 0000000000..bee63c246b
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/1_question.md
@@ -0,0 +1,7 @@
+---
+name: Ask a question
+about: Use this template for any questions
+title: ''
+labels: 'question'
+assignees: ''
+---
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/2_bug_report.md b/.github/ISSUE_TEMPLATE/2_bug_report.md
new file mode 100644
index 0000000000..4e135815ca
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/2_bug_report.md
@@ -0,0 +1,32 @@
+---
+name: Report a bug or a performance issue
+about: Use this template to report unexpected behavior
+title: ''
+labels: 'bug'
+assignees: ''
+---
+
+# Summary
+Provide a short summary of the issue.
+See the sections below
+for factors important for the reproduction of an issue.
+
+# Version
+Report the oneTBB version used to reproduce the problem.
+
+# Environment
+Provide any environmental details that you consider significant for reproducing the issue.
+The following information is important:
+* Hardware
+* OS name and version
+* Compiler version
+
+# Observed Behavior
+Document the behavior you observe.
+
+# Expected Behavior
+Document the behavior you expect.
+
+# Steps To Reproduce
+Check that the issue is reproducible with the latest revision
+of the master branch. Include all the steps to reproduce the issue.
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/3_feature_request.md b/.github/ISSUE_TEMPLATE/3_feature_request.md
new file mode 100644
index 0000000000..c4f8cfcbb3
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/3_feature_request.md
@@ -0,0 +1,19 @@
+---
+name: Request a feature
+about: Use this template to request new functionality or a change in the behavior of the library
+title: ''
+labels: 'new feature'
+assignees: ''
+---
+
+# Summary
+Include a short summary of the request.
+
+See the sections below
+for factors important for a feature request.
+
+# Problem Statement
+Describe the problem you want to solve with a reasonable level of detail.
+
+# Preferred Solution
+Provide your ideas regarding problem solutions.
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/4_documentation.md b/.github/ISSUE_TEMPLATE/4_documentation.md
new file mode 100644
index 0000000000..3788d13b89
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/4_documentation.md
@@ -0,0 +1,20 @@
+---
+name: Request a documentation change
+about: Use this template to report a documentation issue or request documentation changes
+title: ''
+labels: 'documentation'
+assignees: ''
+---
+
+# Summary
+Include a short summary of the issue or request.
+See the sections below
+for factors important for a documentation
+issue.
+
+# URLs
+Include pointers to the documents that are impacted.
+
+# Additional Details
+Provide a detailed description of the expected changes in documentation
+and suggestions you have.
\ No newline at end of file
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index caf80fff86..f986d31a40 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -4,8 +4,6 @@ _Add a comprehensive description of proposed changes_
 
 Fixes # - _issue number(s) if exists_
 
-- [ ] - git commit message contains an appropriate signed-off-by string _(see [CONTRIBUTING.md](https://github.com/oneapi-src/oneTBB/blob/master/CONTRIBUTING.md#pull-requests) for details)_
-
 ### Type of change
 
 _Choose one or multiple, leave empty if none of the other choices apply_
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a65de62241..7dbf3c407d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -37,7 +37,7 @@ jobs:
     runs-on: [ubuntu-20.04]
     timeout-minutes: 10
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Run scan
        run: |
          sudo apt update && sudo apt install -y codespell
@@ -47,7 +47,7 @@
    runs-on: [ubuntu-20.04]
    timeout-minutes: 10
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - name: Run scan
        run: |
          command -v clang-format-10
@@ -62,7 +62,7 @@
    runs-on: [ubuntu-22.04]
    timeout-minutes: 10
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - name: Install prerequisites
        run: |
          pip3 install -U Jinja2
@@ -90,7 +90,7 @@
    needs: [documentation]
    steps:
      - name: Checkout gh-pages
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
        with:
          ref: gh-pages
          path: gh-pages
@@ -117,7 +117,7 @@
    if: ${{ github.ref != 'refs/heads/master' }}
    runs-on: [ubuntu-20.04]
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Run check
@@ -137,7 +137,7 @@
    runs-on: [ubuntu-latest]
    timeout-minutes: 15
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - name: Run testing
        run: |
          mkdir build && cd build
@@ -179,7 +179,7 @@
            preview: 'ON'
            cmake_static: -DBUILD_SHARED_LIBS=OFF
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - name: Run testing
        shell: bash
        run: |
@@ -212,7 +212,7 @@
            preview: 'ON'
            cmake_static: -DBUILD_SHARED_LIBS=OFF
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - name: Run testing
        shell: bash
        run: |
@@ -257,7 +257,7 @@
            preview: 'OFF'
            job_name: windows_cl2022_cxx17_relwithdebinfo_preview=OFF
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - name: Run testing
        run: |
          mkdir build
@@ -295,7 +295,7 @@
            build_type: debug
            preview: 'ON'
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - name: Run testing
        shell: bash
        run: |
@@ -321,7 +321,7 @@
            build_type: relwithdebinfo
            preview: 'ON'
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - name: Run testing
        shell: bash
        run: |
@@ -357,7 +357,7 @@
            preview: 'OFF'
            job_name: examples_windows_cl2022_cxx17_relwithdebinfo_preview=OFF
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - name: Run testing
        run: |
          mkdir build
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
new file mode 100644
index 0000000000..7a80c5f0e2
--- /dev/null
+++ b/.github/workflows/codeql.yml
@@ -0,0 +1,86 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: "CodeQL"
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+  schedule:
+    - cron: '0 0 * * 1'
+
+permissions:
+  contents: read
+
+jobs:
+  analyze:
+    name: Analyze (${{ matrix.language }})
+    runs-on: ubuntu-latest
+    # timeout-minutes:
+    permissions:
+      # required for all workflows
+      security-events: write
+      # required to fetch internal or private CodeQL packs
+      packages: read
+      # only required for workflows in private repositories
+      actions: read
+      contents: read
+
+    strategy:
+      fail-fast: false
+      matrix:
+        language: ["cpp", "python"]
+
+    steps:
+      - name: Harden Runner
+        uses: step-security/harden-runner@v2.6.1
+        with:
+          egress-policy: audit
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      # Initializes the CodeQL tools for scanning.
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@v3
+        with:
+          languages: ${{ matrix.language }}
+
+      # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
+      # If this step fails, then you should remove it and run the build manually (see below)
+      - name: Autobuild
+        uses: github/codeql-action/autobuild@v3.24.10
+
+      # If the analyze step fails for one of the languages you are analyzing with
+      # "We were unable to automatically build your code", modify the matrix above
+      # to set the build mode to "manual" for that language. Then modify this step
+      # to build your code.
+      # Command-line programs to run using the OS shell.
+      # See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
+      #- if: matrix.build-mode == 'manual'
+      #  shell: bash
+      #  run: |
+      #    echo 'If you are using a "manual" build mode for one or more of the' \
+      #      'languages you are analyzing, replace this with the commands to build' \
+      #      'your code, for example:'
+      #    echo '  make bootstrap'
+      #    echo '  make release'
+      #    exit 1
+
+      - name: Perform CodeQL Analysis
+        uses: github/codeql-action/analyze@v3
+        with:
+          category: "/language:${{matrix.language}}"
diff --git a/.github/workflows/ossf-scorecard.yml b/.github/workflows/ossf-scorecard.yml
new file mode 100644
index 0000000000..9f45569f8a
--- /dev/null
+++ b/.github/workflows/ossf-scorecard.yml
@@ -0,0 +1,83 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: OSSF Scorecard
+on:
+  # For Branch-Protection check. Only the default branch is supported. See
+  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
+  branch_protection_rule:
+  # To guarantee Maintained check is occasionally updated. See
+  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
+  schedule:
+    - cron: '00 02 * * *'
+  push:
+    branches: [ "master" ]
+
+# Declare default permissions as read only.
+permissions: read-all
+
+jobs:
+  analysis:
+    name: Scorecard analysis
+    runs-on: ubuntu-latest
+    permissions:
+      # Needed to upload the results to code-scanning dashboard.
+      security-events: write
+      # Needed to publish results and get a badge (see publish_results below).
+      id-token: write
+      # Uncomment the permissions below if installing in a private repository.
+      # contents: read
+      # actions: read
+
+    steps:
+      - name: "Checkout code"
+        uses: actions/checkout@v4.1.1
+        with:
+          persist-credentials: false
+
+      - name: "Run analysis"
+        uses: ossf/scorecard-action@v2.3.1
+        with:
+          results_file: results.sarif
+          results_format: sarif
+          # (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
+          # - you want to enable the Branch-Protection check on a *public* repository, or
+          # - you are installing Scorecard on a *private* repository
+          # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action?tab=readme-ov-file#authentication-with-fine-grained-pat-optional.
+          # repo_token: ${{ secrets.SCORECARD_TOKEN }}
+
+          # Public repositories:
+          #   - Publish results to OpenSSF REST API for easy access by consumers
+          #   - Allows the repository to include the Scorecard badge.
+          #   - See https://github.com/ossf/scorecard-action#publishing-results.
+          # For private repositories:
+          #   - `publish_results` will always be set to `false`, regardless
+          #     of the value entered here.
+          publish_results: true
+
+      # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
+      # format to the repository Actions tab.
+      #- name: "Upload artifact"
+      #  uses: actions/upload-artifact@97a0fba1372883ab732affbe8f94b823f91727db # v3.pre.node20
+      #  with:
+      #    name: SARIF file
+      #    path: results.sarif
+      #    retention-days: 5
+
+      # Upload the results to GitHub's code scanning dashboard (optional).
+      # Commenting out will disable upload of results to your repo's Code Scanning dashboard
+      #- name: "Upload to code-scanning"
+      #  uses: github/codeql-action/upload-sarif@1b1aada464948af03b950897e5eb522f92603cc2 # v3.24.9
+      #  with:
+      #    sarif_file: results.sarif
diff --git a/BUILD.bazel b/BUILD.bazel
index 34f98eba10..9073f4640d 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -117,6 +117,47 @@ cc_library(
     ],
 )
 
+cc_test(
+    name = "test_mutex",
+    srcs = [
+        "test/tbb/test_mutex.cpp",
+        "test/tbb/test_mutex.h"
+    ] + glob([
+        "test/common/*.h",
+    ]),
+    includes = ["test"],
+    deps = [
+        ":tbb",
+    ],
+)
+
+cc_test(
+    name = "test_parallel_for",
+    srcs = [
+        "test/tbb/test_parallel_for.cpp",
+        "test/tbb/test_partitioner.h"
+    ] + glob([
+        "test/common/*.h",
+    ]),
+    includes = ["test"],
+    deps = [
+        ":tbb",
+    ],
+)
+
+cc_test(
+    name = "test_parallel_reduce",
+    srcs = [
+        "test/tbb/test_parallel_reduce.cpp",
+    ] + glob([
+        "test/common/*.h",
+    ]),
+    includes = ["test"],
+    deps = [
+        ":tbb",
+    ],
+)
+
 cc_test(
     name = "test_task",
     srcs = [
diff --git a/Bazel.md b/Bazel.md
index 996a3b2eb5..09a630a72b 100644
--- a/Bazel.md
+++ b/Bazel.md
@@ -19,6 +19,8 @@ The standard Bazel approach to handling third-party libraries is static linking.
 
 ## Using oneTBB as a dependency
 
+### Traditional WORKSPACE approach
+
 This example demonstrates how to use oneTBB as a dependency within a Bazel project.
 
 The following file structure is assumed:
@@ -78,6 +80,16 @@ The expected output of this program is the current version of oneTBB.
 
 Switch to the folder with the files created earlier and run the binary with `bazel run //:Demo`.
 
+### Bzlmod
+
+If you use Bzlmod, you can fetch oneTBB with the [Bazel Central Registry](https://registry.bazel.build/).
+
+Add the following line to your `MODULE.bazel` file:
+
+```bazel
+bazel_dep(name = "onetbb", version = "2021.13.0")
+```
+
 ## Build oneTBB using Bazel
 
 Run ```bazel build //...``` in the oneTBB root directory.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 19232a9920..811a3a5549 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,7 +49,7 @@ string(REGEX REPLACE ".*#define TBB_VERSION_MINOR ([0-9]+).*" "\1" _tbb_ver_min "${_tbb_version_info}")
 string(REGEX REPLACE ".*#define TBB_VERSION_PATCH ([0-9]+).*" "\1" _tbb_ver_patch "${_tbb_version_info}")
 string(REGEX REPLACE ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\1" TBB_INTERFACE_VERSION "${_tbb_version_info}")
 string(REGEX REPLACE ".*#define __TBB_BINARY_VERSION ([0-9]+).*" "\1" TBB_BINARY_VERSION "${_tbb_version_info}")
-set(TBB_BINARY_MINOR_VERSION ${_tbb_ver_minor})
+string(REGEX REPLACE "..(..)." "\1" TBB_BINARY_MINOR_VERSION "${TBB_INTERFACE_VERSION}")
 
 set(TBBMALLOC_BINARY_VERSION 2)
 set(TBBBIND_BINARY_VERSION 3)
@@ -107,6 +107,9 @@ option(TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH "Disable HWLOC automatic search by pkg
 option(TBB_ENABLE_IPO "Enable Interprocedural Optimization (IPO) during the compilation" ON)
 option(TBB_FUZZ_TESTING "Enable fuzz testing" OFF)
 option(TBB_INSTALL "Enable installation" ON)
+if(LINUX)
+option(TBB_LINUX_SEPARATE_DBG "Enable separation of the debug symbols during the build" OFF)
+endif()
 if(APPLE)
 option(TBB_BUILD_APPLE_FRAMEWORKS "Build as Apple Frameworks" OFF)
 endif()
diff --git a/CODEOWNERS b/CODEOWNERS
new file mode 100644
index 0000000000..78105ac7e8
--- /dev/null
+++ b/CODEOWNERS
@@ -0,0 +1,27 @@
+# Where component owners are known, add them here.
+
+/oneTBB/src/tbb/ @pavelkumbrasev
+/oneTBB/src/tbb/ @dnmokhov
+/oneTBB/src/tbb/ @JhaShweta1
+/oneTBB/src/tbb/ @sarathnandu
+/oneTBB/include/oneapi/tbb/parallel_* @pavelkumbrasev
+/oneTBB/include/oneapi/tbb/concurrent_* @kboyarinov
+/oneTBB/include/oneapi/tbb/flow_graph* @kboyarinov
+/oneTBB/include/oneapi/tbb/flow_graph* @aleksei-fedotov
+/oneTBB/include/oneapi/tbb/detail/_flow_graph* @kboyarinov
+/oneTBB/include/oneapi/tbb/detail/_flow_graph* @aleksei-fedotov
+/oneTBB/include/oneapi/tbb/detail/_concurrent* @kboyarinov
+/oneTBB/src/doc @aepanchi
+/oneTBB/src/tbbbind/ @isaevil
+/oneTBB/src/tbbmalloc/ @lplewa
+/oneTBB/src/tbbmalloc_proxy/ @lplewa
+/oneTBB/cmake/ @isaevil
+/oneTBB/*CMakeLists.txt @isaevil
+/oneTBB/python/ @sarathnandu
+/oneTBB/python/ @isaevil
+
+# Bazel build related files.
+/oneTBB/.bazelversion @Vertexwahn
+/oneTBB/Bazel.md @Vertexwahn
+/oneTBB/BUILD.bazel @Vertexwahn
+/oneTBB/MODULE.bazel @Vertexwahn
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000000..c169707396
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,134 @@
+
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, caste, color, religion, or sexual
+identity and orientation.
+
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment for our
+community include:
+
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+  and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the overall
+  community
+
+Examples of unacceptable behavior include:
+
+* The use of sexualized language or imagery, and sexual attention or advances of
+  any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email address,
+  without their explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Enforcement Responsibilities
+
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official email address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+oneTBBCodeOfConduct At intel DOT com.
+All complaints will be reviewed and investigated promptly and fairly.
+
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+
+## Enforcement Guidelines
+
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series of
+actions.
+
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or permanent
+ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior, harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction within the
+community.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.1, available at
+[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
+
+Community Impact Guidelines were inspired by
+[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
+
+For answers to common questions about this code of conduct, see the FAQ at
+[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
+[https://www.contributor-covenant.org/translations][translations].
+
+[homepage]: https://www.contributor-covenant.org
+[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
+[Mozilla CoC]: https://github.com/mozilla/diversity
+[FAQ]: https://www.contributor-covenant.org/faq
+[translations]: https://www.contributor-covenant.org/translations
+
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3048b21199..b2b6a968cd 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -21,9 +21,7 @@ As an open source project, we welcome community contributions to oneAPI Threading
 
 Licensing is very important to open source projects. It helps ensure the software continues to be available under the terms that the author desired. The oneTBB project uses the [Apache 2.0 License](https://github.com/oneapi-src/oneTBB/blob/master/LICENSE.txt), a permissive open source license that allows you to freely use, modify, and distribute your own products that include Apache 2.0 licensed software. By contributing to the oneTBB project, you agree to the license and copyright terms therein and release your own contributions under these terms.
 
-Some imported or reused components within oneTBB use other licenses, as described in [third-party-programs.txt](https://github.com/oneapi-src/oneTBB/blob/master/third-party-programs.txt). By carefully reviewing potential contributions and enforcing a [Developer Certification of Origin (DCO)](https://developercertificate.org/) for contributed code, we can ensure that the community can develop products with oneTBB without concerns over patent or copyright issues.
-
-The DCO is an attestation attached to every contribution made by every developer. In the commit message of the contribution, (described later), the developer simply adds a Signed-off-by statement and thereby agrees to the DCO.
+Some imported or reused components within oneTBB use other licenses, as described in [third-party-programs.txt](https://github.com/oneapi-src/oneTBB/blob/master/third-party-programs.txt). By carefully reviewing potential contributions, we can ensure that the community can develop products with oneTBB without concerns over patent or copyright issues.
 
 ## Prerequisites
 
@@ -32,12 +30,6 @@ As a contributor, you’ll want to be familiar with the oneTBB project and the r
 
 ## Pull Requests
 
 You can find all [open oneTBB pull requests](https://github.com/oneapi-src/oneTBB/pulls) on GitHub.
-
-No anonymous contributions are accepted. The name in the commit message Signed-off-by line and your email must match the change authorship information. Make sure your .gitconfig is set up correctly so you can use `git commit -s` for signing your patches:
-
-`git config --global user.name "Taylor Developer"`
-
-`git config --global user.email taylor.developer@company.com`
 
 ### Before contributing changes directly to the oneTBB repository
diff --git a/MODULE.bazel b/MODULE.bazel
index cc6698f0de..063bc2f468 100644
--- a/MODULE.bazel
+++ b/MODULE.bazel
@@ -21,4 +21,4 @@ module(
     compatibility_level = 1,
 )
 
-bazel_dep(name = "platforms", version = "0.0.9")
+bazel_dep(name = "platforms", version = "0.0.10")
diff --git a/MODULE.bazel.lock b/MODULE.bazel.lock
new file mode 100644
index 0000000000..06f9098032
--- /dev/null
+++ b/MODULE.bazel.lock
@@ -0,0 +1,65 @@
+{
+  "lockFileVersion": 11,
+  "registryFileHashes": {
+    "https://bcr.bazel.build/bazel_registry.json": "8a28e4aff06ee60aed2a8c281907fb8bcbf3b753c91fb5a5c57da3215d5b3497",
+    "https://bcr.bazel.build/modules/abseil-cpp/20210324.2/MODULE.bazel": "7cd0312e064fde87c8d1cd79ba06c876bd23630c83466e9500321be55c96ace2",
+    "https://bcr.bazel.build/modules/abseil-cpp/20211102.0/MODULE.bazel": "70390338f7a5106231d20620712f7cccb659cd0e9d073d1991c038eb9fc57589",
+    "https://bcr.bazel.build/modules/abseil-cpp/20211102.0/source.json": "7e3a9adf473e9af076ae485ed649d5641ad50ec5c11718103f34de03170d94ad",
+    "https://bcr.bazel.build/modules/apple_support/1.5.0/MODULE.bazel": "50341a62efbc483e8a2a6aec30994a58749bd7b885e18dd96aa8c33031e558ef",
+    "https://bcr.bazel.build/modules/apple_support/1.5.0/source.json": "eb98a7627c0bc486b57f598ad8da50f6625d974c8f723e9ea71bd39f709c9862",
+    "https://bcr.bazel.build/modules/bazel_features/1.11.0/MODULE.bazel": "f9382337dd5a474c3b7d334c2f83e50b6eaedc284253334cf823044a26de03e8",
+    "https://bcr.bazel.build/modules/bazel_features/1.11.0/source.json": "c9320aa53cd1c441d24bd6b716da087ad7e4ff0d9742a9884587596edfe53015",
+    "https://bcr.bazel.build/modules/bazel_skylib/1.0.3/MODULE.bazel": "bcb0fd896384802d1ad283b4e4eb4d718eebd8cb820b0a2c3a347fb971afd9d8",
+    "https://bcr.bazel.build/modules/bazel_skylib/1.2.1/MODULE.bazel": "f35baf9da0efe45fa3da1696ae906eea3d615ad41e2e3def4aeb4e8bc0ef9a7a",
+    "https://bcr.bazel.build/modules/bazel_skylib/1.3.0/MODULE.bazel": "20228b92868bf5cfc41bda7afc8a8ba2a543201851de39d990ec957b513579c5",
+    "https://bcr.bazel.build/modules/bazel_skylib/1.6.1/MODULE.bazel": "8fdee2dbaace6c252131c00e1de4b165dc65af02ea278476187765e1a617b917",
+    "https://bcr.bazel.build/modules/bazel_skylib/1.6.1/source.json": "082ed5f9837901fada8c68c2f3ddc958bb22b6d654f71dd73f3df30d45d4b749",
+    "https://bcr.bazel.build/modules/buildozer/7.1.2/MODULE.bazel": "2e8dd40ede9c454042645fd8d8d0cd1527966aa5c919de86661e62953cd73d84",
+    "https://bcr.bazel.build/modules/buildozer/7.1.2/source.json": "c9028a501d2db85793a6996205c8de120944f50a0d570438fcae0457a5f9d1f8",
+    "https://bcr.bazel.build/modules/googletest/1.11.0/MODULE.bazel": "3a83f095183f66345ca86aa13c58b59f9f94a2f81999c093d4eeaa2d262d12f4",
+    "https://bcr.bazel.build/modules/googletest/1.11.0/source.json": "c73d9ef4268c91bd0c1cd88f1f9dfa08e814b1dbe89b5f594a9f08ba0244d206",
+    "https://bcr.bazel.build/modules/platforms/0.0.10/MODULE.bazel": "8cb8efaf200bdeb2150d93e162c40f388529a25852b332cec879373771e48ed5",
+    "https://bcr.bazel.build/modules/platforms/0.0.10/source.json": "f22828ff4cf021a6b577f1bf6341cb9dcd7965092a439f64fc1bb3b7a5ae4bd5",
+    "https://bcr.bazel.build/modules/platforms/0.0.4/MODULE.bazel": "9b328e31ee156f53f3c416a64f8491f7eb731742655a47c9eec4703a71644aee",
+    "https://bcr.bazel.build/modules/platforms/0.0.5/MODULE.bazel": "5733b54ea419d5eaf7997054bb55f6a1d0b5ff8aedf0176fef9eea44f3acda37",
+    "https://bcr.bazel.build/modules/platforms/0.0.6/MODULE.bazel": "ad6eeef431dc52aefd2d77ed20a4b353f8ebf0f4ecdd26a807d2da5aa8cd0615",
+    "https://bcr.bazel.build/modules/platforms/0.0.7/MODULE.bazel": "72fd4a0ede9ee5c021f6a8dd92b503e089f46c227ba2813ff183b71616034814",
+    "https://bcr.bazel.build/modules/platforms/0.0.9/MODULE.bazel": "4a87a60c927b56ddd67db50c89acaa62f4ce2a1d2149ccb63ffd871d5ce29ebc",
+    "https://bcr.bazel.build/modules/protobuf/21.7/MODULE.bazel": "a5a29bb89544f9b97edce05642fac225a808b5b7be74038ea3640fae2f8e66a7",
+    "https://bcr.bazel.build/modules/protobuf/21.7/source.json": "bbe500720421e582ff2d18b0802464205138c06056f443184de39fbb8187b09b",
+    "https://bcr.bazel.build/modules/protobuf/3.19.0/MODULE.bazel": "6b5fbb433f760a99a22b18b6850ed5784ef0e9928a72668b66e4d7ccd47db9b0",
+    "https://bcr.bazel.build/modules/protobuf/3.19.6/MODULE.bazel": "9233edc5e1f2ee276a60de3eaa47ac4132302ef9643238f23128fea53ea12858",
+    "https://bcr.bazel.build/modules/rules_cc/0.0.1/MODULE.bazel": "cb2aa0747f84c6c3a78dad4e2049c154f08ab9d166b1273835a8174940365647",
+    "https://bcr.bazel.build/modules/rules_cc/0.0.2/MODULE.bazel": "6915987c90970493ab97393024c156ea8fb9f3bea953b2f3ec05c34f19b5695c",
+    "https://bcr.bazel.build/modules/rules_cc/0.0.8/MODULE.bazel": "964c85c82cfeb6f3855e6a07054fdb159aced38e99a5eecf7bce9d53990afa3e",
+    "https://bcr.bazel.build/modules/rules_cc/0.0.9/MODULE.bazel": "836e76439f354b89afe6a911a7adf59a6b2518fafb174483ad78a2a2fde7b1c5",
+    "https://bcr.bazel.build/modules/rules_cc/0.0.9/source.json": "1f1ba6fea244b616de4a554a0f4983c91a9301640c8fe0dd1d410254115c8430",
+    "https://bcr.bazel.build/modules/rules_java/4.0.0/MODULE.bazel": "5a78a7ae82cd1a33cef56dc578c7d2a46ed0dca12643ee45edbb8417899e6f74",
+    "https://bcr.bazel.build/modules/rules_java/7.6.1/MODULE.bazel": "2f14b7e8a1aa2f67ae92bc69d1ec0fa8d9f827c4e17ff5e5f02e91caa3b2d0fe",
+    "https://bcr.bazel.build/modules/rules_java/7.6.1/source.json": "8f3f3076554e1558e8e468b2232991c510ecbcbed9e6f8c06ac31c93bcf38362",
+    "https://bcr.bazel.build/modules/rules_jvm_external/4.4.2/MODULE.bazel": "a56b85e418c83eb1839819f0b515c431010160383306d13ec21959ac412d2fe7",
+    "https://bcr.bazel.build/modules/rules_jvm_external/4.4.2/source.json": "a075731e1b46bc8425098512d038d416e966ab19684a10a34f4741295642fc35",
+    "https://bcr.bazel.build/modules/rules_license/0.0.3/MODULE.bazel": "627e9ab0247f7d1e05736b59dbb1b6871373de5ad31c3011880b4133cafd4bd0",
+    "https://bcr.bazel.build/modules/rules_license/0.0.7/MODULE.bazel": "088fbeb0b6a419005b89cf93fe62d9517c0a2b8bb56af3244af65ecfe37e7d5d",
+    "https://bcr.bazel.build/modules/rules_license/0.0.7/source.json": "355cc5737a0f294e560d52b1b7a6492d4fff2caf0bef1a315df5a298fca2d34a",
+    "https://bcr.bazel.build/modules/rules_pkg/0.7.0/MODULE.bazel": "df99f03fc7934a4737122518bb87e667e62d780b610910f0447665a7e2be62dc",
+    "https://bcr.bazel.build/modules/rules_pkg/0.7.0/source.json": "c2557066e0c0342223ba592510ad3d812d4963b9024831f7f66fd0584dd8c66c",
+    "https://bcr.bazel.build/modules/rules_proto/4.0.0/MODULE.bazel": "a7a7b6ce9bee418c1a760b3d84f83a299ad6952f9903c67f19e4edd964894e06",
+    "https://bcr.bazel.build/modules/rules_proto/5.3.0-21.7/MODULE.bazel": "e8dff86b0971688790ae75528fe1813f71809b5afd57facb44dad9e8eca631b7",
+    "https://bcr.bazel.build/modules/rules_proto/5.3.0-21.7/source.json": "d57902c052424dfda0e71646cb12668d39c4620ee0544294d9d941e7d12bc3a9",
+    "https://bcr.bazel.build/modules/rules_python/0.10.2/MODULE.bazel": "cc82bc96f2997baa545ab3ce73f196d040ffb8756fd2d66125a530031cd90e5f",
+    "https://bcr.bazel.build/modules/rules_python/0.22.1/MODULE.bazel": "26114f0c0b5e93018c0c066d6673f1a2c3737c7e90af95eff30cfee38d0bbac7",
+    "https://bcr.bazel.build/modules/rules_python/0.22.1/source.json": "57226905e783bae7c37c2dd662be078728e48fa28ee4324a7eabcafb5a43d014",
+    "https://bcr.bazel.build/modules/rules_python/0.4.0/MODULE.bazel": "9208ee05fd48bf09ac60ed269791cf17fb343db56c8226a720fbb1cdf467166c",
+    "https://bcr.bazel.build/modules/stardoc/0.5.1/MODULE.bazel": "1a05d92974d0c122f5ccf09291442580317cdd859f07a8655f1db9a60374f9f8",
+    "https://bcr.bazel.build/modules/stardoc/0.5.1/source.json": "a96f95e02123320aa015b956f29c00cb818fa891ef823d55148e1a362caacf29",
+    "https://bcr.bazel.build/modules/upb/0.0.0-20220923-a547704/MODULE.bazel": "7298990c00040a0e2f121f6c32544bab27d4452f80d9ce51349b1a28f3005c43",
+    "https://bcr.bazel.build/modules/upb/0.0.0-20220923-a547704/source.json": "f1ef7d3f9e0e26d4b23d1c39b5f5de71f584dd7d1b4ef83d9bbba6ec7a6a6459",
+    "https://bcr.bazel.build/modules/zlib/1.2.11/MODULE.bazel": "07b389abc85fdbca459b69e2ec656ae5622873af3f845e1c9d80fe179f3effa0",
+    "https://bcr.bazel.build/modules/zlib/1.2.12/MODULE.bazel": "3b1a8834ada2a883674be8cbd36ede1b6ec481477ada359cd2d3ddc562340b27",
+    "https://bcr.bazel.build/modules/zlib/1.3/MODULE.bazel": "6a9c02f19a24dcedb05572b2381446e27c272cd383aed11d41d99da9e3167a72",
+    "https://bcr.bazel.build/modules/zlib/1.3/source.json": "b6b43d0737af846022636e6e255fd4a96fee0d34f08f3830e6e0bac51465c37c"
+  },
+  "selectedYankedVersions": {},
+  "moduleExtensions": {}
+}
diff --git a/README.md b/README.md
index f2bc0a0afa..2e7c2e81ba 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,8 @@
-# oneAPI Threading Building Blocks
+# oneAPI Threading Building Blocks (oneTBB)
 [![Apache License Version 2.0](https://img.shields.io/badge/license-Apache_2.0-green.svg)](LICENSE.txt) [![oneTBB CI](https://github.com/oneapi-src/oneTBB/actions/workflows/ci.yml/badge.svg)](https://github.com/oneapi-src/oneTBB/actions/workflows/ci.yml?query=branch%3Amaster)
+[![Join the community on GitHub Discussions](https://badgen.net/badge/join%20the%20discussion/on%20github/blue?icon=github)](https://github.com/oneapi-src/oneTBB/discussions)
+[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/9125/badge)](https://www.bestpractices.dev/projects/9125)
+[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/oneapi-src/oneTBB/badge)](https://securityscorecards.dev/viewer/?uri=github.com/oneapi-src/oneTBB)
 
 oneTBB is a flexible C++ library that simplifies the work of adding parallelism
 to complex applications, even if you are not a threading expert.
@@ -18,7 +21,7 @@ The library differs from typical threading packages in the following ways:
 
 Refer to oneTBB [examples](examples) and [samples](https://github.com/oneapi-src/oneAPI-samples/tree/master/Libraries/oneTBB) to see how you can use the library.
 
-oneTBB is a part of [oneAPI](https://oneapi.io). The current branch implements version 1.1 of oneAPI Specification.
+oneTBB is a part of the [UXL Foundation](http://www.uxlfoundation.org) and is an implementation of [oneAPI specification](https://oneapi.io).
 
 > **_NOTE:_** Threading Building Blocks (TBB) is now called oneAPI Threading Building Blocks (oneTBB) to highlight that the tool is a part of the oneAPI ecosystem.
@@ -39,6 +42,14 @@ See [Release Notes](RELEASE_NOTES.md) and [System Requirements](SYSTEM_REQUIREMENTS.md).
 
 ## Installation
 See [Installation from Sources](INSTALL.md) to learn how to install oneTBB.
 
+## Governance
+
+The oneTBB project is governed by the UXL Foundation.
+You can get involved in this project in the following ways:
+* Join the [Open Source and Specification Working Group](https://github.com/uxlfoundation/foundation/tree/main?tab=readme-ov-file#working-groups) meetings.
+* Join the mailing lists for the [UXL Foundation](https://lists.uxlfoundation.org/g/main/subgroups) to receive the meeting schedule and latest updates.
+* Contribute to the oneTBB project or the oneTBB specification. Read [CONTRIBUTING](./CONTRIBUTING.md) for more information.
+
 ## Support
 See our [documentation](./SUPPORT.md) to learn how to request help.
@@ -46,6 +57,8 @@
 
 We welcome community contributions, so check our [Contributing Guidelines](CONTRIBUTING.md) to learn more.
 
+Use GitHub Issues for feature requests, bug reports, and minor inquiries. For broader questions and development-related discussions, use GitHub Discussions.
+
 ## License
 oneAPI Threading Building Blocks is licensed under [Apache License, Version 2.0](LICENSE.txt). By its terms, contributions submitted to the project are also done under that license.
diff --git a/WASM_Support.md b/WASM_Support.md
index 8c2f6c1af9..6306620d7c 100644
--- a/WASM_Support.md
+++ b/WASM_Support.md
@@ -16,7 +16,7 @@
 
 # WASM Support
 
-oneTBB extends its capabilities by offering robust support for ``WASM``.
+oneTBB extends its capabilities by offering robust support for ``WASM`` (see the ``Limitations`` section).
 
 ``WASM`` stands for WebAssembly, a low-level binary format for executing code in web browsers.
 It is designed to be a portable target for compilers and efficient to parse and execute.
@@ -58,3 +58,24 @@ To run tests, use:
 ```
 ctest
 ```
 
+# Limitations
+
+You can successfully build your application with oneTBB using WASM, but you may not achieve optimal performance immediately. This is due to a limitation with nested Web Workers: a Web Worker cannot schedule another worker without help from a browser thread. This can lead to unexpected performance outcomes, such as the application running in serial.
+Find more information in the [issue](https://github.com/emscripten-core/emscripten/discussions/21963) in the Emscripten repository.
+To work around this issue, try one of the following ways:
+1. **Recommended Solution: Use the ``-sPROXY_TO_PTHREAD`` Flag**.
+This flag splits the initial thread into a browser thread and a main thread (proxied by a Web Worker), effectively resolving the issue as the browser thread is always present in the event loop and can participate in Web Workers scheduling. Refer to the [Emscripten documentation](https://emscripten.org/docs/porting/pthreads.html) for more details about ``-sPROXY_TO_PTHREAD`` since using this flag may require refactoring the code.
+2. **Alternative Solution: Warm Up the oneTBB Thread Pool**
+Initialize the oneTBB thread pool before making the first call to oneTBB. This approach forces the browser thread to participate in Web Workers scheduling.
+```cpp
+    int num_threads = tbb::this_task_arena::max_concurrency();
+    std::atomic<int> barrier{num_threads};
+    tbb::parallel_for(0, num_threads, [&barrier] (int) {
+        barrier--;
+        while (barrier > 0) {
+            // Send browser thread to event loop
+            std::this_thread::yield();
+        }
+    }, tbb::static_partitioner{});
+```
+> **_NOTE:_** Be aware that it might cause delays on the browser side.
diff --git a/cmake/README.md b/cmake/README.md
index aa811b0fc0..3a357218d5 100644
--- a/cmake/README.md
+++ b/cmake/README.md
@@ -10,6 +10,7 @@ TBB_SANITIZE:STRING - Sanitizer parameter, passed to compiler/linker
 TBB_SIGNTOOL:FILEPATH - Tool for digital signing, used in post-install step for libraries if provided.
 TBB_SIGNTOOL_ARGS:STRING - Additional arguments for TBB_SIGNTOOL, used if TBB_SIGNTOOL is set.
 TBB_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) build (ON by default)
+TBB_FIND_PACKAGE - Enable search for external oneTBB using find_package instead of building from sources (OFF by default)
 TBBMALLOC_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) memory allocator build (ON by default)
 TBBMALLOC_PROXY_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) memory allocator proxy build (requires TBBMALLOC_BUILD. ON by default)
 TBB4PY_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) Python module build (OFF by default)
@@ -187,7 +188,14 @@ cmake --build . --target test # currently does not work on Windows* OS
 ```
 
 ## Installation
-See [Installation from Sources](../INSTALL.md) to learn how to install oneTBB.
+See [Installation from Sources](../INSTALL.md) to learn how to install oneTBB.
+
+To install oneTBB from the release packages, use the following commands:
+```bash
+tar -xvf oneapi-tbb-xxx.xx.x-*.tgz
+source env/vars.sh
+```
+
 
 ## Sanitizers - Configure, Build, and Run
diff --git a/cmake/compilers/Clang.cmake b/cmake/compilers/Clang.cmake
index f56b5fba0f..dcd66634f3 100644
--- a/cmake/compilers/Clang.cmake
+++ b/cmake/compilers/Clang.cmake
@@ -13,12 +13,16 @@
 # limitations under the License.
 
 if (EMSCRIPTEN)
-    set(TBB_EMSCRIPTEN 1)
-    set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -fexceptions)
-    set(TBB_TEST_LINK_FLAGS ${TBB_COMMON_LINK_FLAGS} -fexceptions -sINITIAL_MEMORY=65536000 -sALLOW_MEMORY_GROWTH=1 -sEXIT_RUNTIME=1)
-    if (NOT EMSCRIPTEN_WITHOUT_PTHREAD)
-        set_property(TARGET Threads::Threads PROPERTY INTERFACE_LINK_LIBRARIES "-pthread")
-    endif()
+    set(TBB_EMSCRIPTEN 1)
+    set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -fexceptions)
+    set(TBB_TEST_LINK_FLAGS ${TBB_COMMON_LINK_FLAGS} -fexceptions -sINITIAL_MEMORY=65536000 -sALLOW_MEMORY_GROWTH=1 -sMALLOC=mimalloc -sEXIT_RUNTIME=1)
+    if (NOT EMSCRIPTEN_WITHOUT_PTHREAD)
+        set_property(TARGET Threads::Threads PROPERTY INTERFACE_LINK_LIBRARIES "-pthread")
+    endif()
+    set(TBB_EMSCRIPTEN_STACK_SIZE 65536)
+    set(TBB_LIB_COMPILE_FLAGS -D__TBB_EMSCRIPTEN_STACK_SIZE=${TBB_EMSCRIPTEN_STACK_SIZE})
+    set(TBB_TEST_LINK_FLAGS ${TBB_TEST_LINK_FLAGS} -sTOTAL_STACK=${TBB_EMSCRIPTEN_STACK_SIZE})
+    unset(TBB_EMSCRIPTEN_STACK_SIZE)
 endif()
 
 if (MINGW)
diff --git a/cmake/compilers/GNU.cmake b/cmake/compilers/GNU.cmake
index 6fd8d9808d..cf6d8bdbca 100644
--- a/cmake/compilers/GNU.cmake
+++ b/cmake/compilers/GNU.cmake
@@ -35,8 +35,39 @@ if (NOT CMAKE_GENERATOR MATCHES "Ninja" AND NOT CMAKE_CXX_DEPENDS_USE_COMPILER)
     set(TBB_MMD_FLAG -MMD)
 endif()
 
+
+# Binutils < 2.31.1 do not support the tpause instruction. Compiling with
+# a modern version of GCC (which supports it) while relying on an outdated
+# assembler will result in an error reporting "no such instruction: tpause".
+# The following code invokes the GNU assembler to extract the version number
+# and convert it to an integer that can be used in the C++ code to compare
+# against, and conditionally disable the __TBB_WAITPKG_INTRINSICS_PRESENT
+# macro if the version is incompatible. Binutils only report the version in the
+# MAJOR.MINOR format, therefore the version checked is >=2.32 (instead of
+# >=2.31.1). Capturing the output in CMake can be done like below. The version
+# information is written to either stdout or stderr. To not make any
+# assumptions, both are captured.
+execute_process(
+    COMMAND ${CMAKE_COMMAND} -E env "LANG=C" ${CMAKE_CXX_COMPILER} -xc -c /dev/null -Wa,-v -o/dev/null
+    OUTPUT_VARIABLE ASSEMBLER_VERSION_LINE_OUT
+    ERROR_VARIABLE ASSEMBLER_VERSION_LINE_ERR
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    ERROR_STRIP_TRAILING_WHITESPACE
+)
+set(ASSEMBLER_VERSION_LINE ${ASSEMBLER_VERSION_LINE_OUT}${ASSEMBLER_VERSION_LINE_ERR})
+string(REGEX REPLACE ".*GNU assembler version ([0-9]+)\\.([0-9]+).*" "\\1" _tbb_gnu_asm_major_version "${ASSEMBLER_VERSION_LINE}")
+string(REGEX REPLACE ".*GNU assembler version ([0-9]+)\\.([0-9]+).*" "\\2" _tbb_gnu_asm_minor_version "${ASSEMBLER_VERSION_LINE}")
+unset(ASSEMBLER_VERSION_LINE_OUT)
+unset(ASSEMBLER_VERSION_LINE_ERR)
+unset(ASSEMBLER_VERSION_LINE)
+message(TRACE "Extracted GNU assembler version: major=${_tbb_gnu_asm_major_version} minor=${_tbb_gnu_asm_minor_version}")
+
+math(EXPR _tbb_gnu_asm_version_number "${_tbb_gnu_asm_major_version} * 1000 + ${_tbb_gnu_asm_minor_version}")
+set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} "-D__TBB_GNU_ASM_VERSION=${_tbb_gnu_asm_version_number}")
+message(STATUS "GNU Assembler version: ${_tbb_gnu_asm_major_version}.${_tbb_gnu_asm_minor_version} (${_tbb_gnu_asm_version_number})")
+
 # Enable Intel(R) Transactional Synchronization Extensions (-mrtm) and WAITPKG instructions support (-mwaitpkg) on relevant processors
-if (CMAKE_SYSTEM_PROCESSOR MATCHES "(AMD64|amd64|i.86|x86)")
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "(AMD64|amd64|i.86|x86)" AND NOT EMSCRIPTEN)
     set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -mrtm $<$>,$>>:-mwaitpkg>)
 endif()
@@ -51,6 +82,9 @@ if (NOT ${CMAKE_CXX_COMPILER_ID} STREQUAL Intel)
     # gcc 6.0 and later have -flifetime-dse option that controls elimination of stores done outside the object lifetime
     set(TBB_DSE_FLAG $<$>:-flifetime-dse=1>)
     set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$>:-fstack-clash-protection>)
+
+    # Suppress GCC 12.x-13.x warning here that to_wait_node(n)->my_is_in_list might have size 0
+    set(TBB_COMMON_LINK_FLAGS ${TBB_COMMON_LINK_FLAGS} $<$>,$>:-Wno-stringop-overflow>)
 endif()
 
 # Workaround for heavy tests and too many symbols in debug (rellocation truncated to fit: R_MIPS_CALL16)
diff --git a/cmake/compilers/IntelLLVM.cmake b/cmake/compilers/IntelLLVM.cmake
index a9ebb3e670..b514378164 100644
--- a/cmake/compilers/IntelLLVM.cmake
+++ b/cmake/compilers/IntelLLVM.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023 Intel Corporation
+# Copyright (c) 2020-2024 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -20,6 +20,9 @@ if (WIN32)
 else()
     include(${CMAKE_CURRENT_LIST_DIR}/Clang.cmake)
     set(TBB_IPO_COMPILE_FLAGS $<$>:-ipo>)
+    # "--exclude-libs,ALL" is used to avoid accidental exporting of symbols
+    # from statically linked libraries
+    set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} -static-intel -Wl,--exclude-libs,ALL)
     set(TBB_OPENMP_FLAG -qopenmp)
 endif()
 set(TBB_IPO_LINK_FLAGS ${TBB_IPO_LINK_FLAGS} ${TBB_IPO_COMPILE_FLAGS})
diff --git a/doc/conf.py b/doc/conf.py
index 19da0a4caf..a0ef593b9b 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -29,7 +29,7 @@
     project = u'Intel® oneAPI Threading Building Blocks (oneTBB)'
 else:
     project = u'oneTBB'
-copyright = u'2023, Intel Corporation'
+copyright = u'Intel Corporation'
 author = u'Intel'
 
 # The short X.Y version
@@ -304,7 +304,7 @@
 
 # -- Options for intersphinx extension ---------------------------------------
 
 # Example configuration for intersphinx: refer to the Python standard library.
-intersphinx_mapping = {'https://docs.python.org/': None}
+intersphinx_mapping = {'python': ('https://docs.python.org/3', None)}
 
 # -- Options for todo extension ----------------------------------------------
diff --git a/doc/main/reference/custom_mutex_chmap.rst b/doc/main/reference/custom_mutex_chmap.rst
index 152320fd65..acf502e66d 100644
--- a/doc/main/reference/custom_mutex_chmap.rst
+++ b/doc/main/reference/custom_mutex_chmap.rst
@@ -50,7 +50,7 @@ Type requirements
 -----------------
 
 The type of the mutex passed as a template argument for ``concurrent_hash_map`` should
-meet the requirements of `ReaderWriterMutex `_.
+meet the requirements of `ReaderWriterMutex `_.
 It should also provide the following API:
 
 .. cpp:function:: bool ReaderWriterMutex::scoped_lock::is_writer() const;
diff --git a/doc/main/reference/parallel_for_each_semantics.rst b/doc/main/reference/parallel_for_each_semantics.rst
index 1f8815b3b3..c007066b3b 100644
--- a/doc/main/reference/parallel_for_each_semantics.rst
+++ b/doc/main/reference/parallel_for_each_semantics.rst
@@ -10,7 +10,7 @@ parallel_for_each Body semantics and requirements
 Description
 ***********
 
-This page clarifies `ParallelForEachBody `_
+This page clarifies `ParallelForEachBody `_
 named requirements for ``tbb::parallel_for_each`` algorithm specification.
 
 .. code:: cpp
diff --git a/doc/main/reference/parallel_sort_ranges_extension.rst b/doc/main/reference/parallel_sort_ranges_extension.rst
index 52f2283a17..cad65b54b0 100644
--- a/doc/main/reference/parallel_sort_ranges_extension.rst
+++ b/doc/main/reference/parallel_sort_ranges_extension.rst
@@ -10,7 +10,7 @@ parallel_sort ranges interface extension
 Description
 ***********
 
-|full_name| implementation extends the `oneapi::tbb::parallel_sort specification `_
+|full_name| implementation extends the `oneapi::tbb::parallel_sort specification `_
 with overloads that takes the container by forwarding reference.
diff --git a/doc/main/reference/reference.rst b/doc/main/reference/reference.rst
index 833a50ee70..c8ba0af944 100644
--- a/doc/main/reference/reference.rst
+++ b/doc/main/reference/reference.rst
@@ -3,13 +3,13 @@
 |short_name| API Reference
 ==========================
 
-For oneTBB API Reference, refer to `oneAPI Specification `_. The current supported
+For oneTBB API Reference, refer to `oneAPI Specification `_. The current supported
 version of oneAPI Specification is 1.0.
 
 Specification extensions
 ************************
 
-|full_name| implements the `oneTBB specification `_.
+|full_name| implements the `oneTBB specification `_.
 This document provides additional details or restrictions where necessary.
 It also describes features that are not included in the oneTBB specification.
diff --git a/doc/main/reference/rvalue_reduce.rst b/doc/main/reference/rvalue_reduce.rst
index 53880952aa..69d480d465 100644
--- a/doc/main/reference/rvalue_reduce.rst
+++ b/doc/main/reference/rvalue_reduce.rst
@@ -10,8 +10,8 @@ Parallel Reduction for rvalues
 Description
 ***********
 
-|full_name| implementation extends the `ParallelReduceFunc `_ and
-`ParallelReduceReduction `_
+|full_name| implementation extends the `ParallelReduceFunc `_ and
+`ParallelReduceReduction `_
 to optimize operating with ``rvalues`` using functional form of ``tbb::parallel_reduce``
 and ``tbb::parallel_deterministic_reduce`` algorithms.
 
 API
@@ -33,8 +33,8 @@ or
 
 .. cpp:function:: Value Func::operator()(const Range& range, const Value& x) const
 
-    Accumulates the result for a subrange, starting with initial value ``x``. The ``Range`` type must meet the `Range requirements _`.
-    The ``Value`` type must be the same as a corresponding template parameter for the `parallel_reduce algorithm `_.
+    Accumulates the result for a subrange, starting with initial value ``x``. The ``Range`` type must meet the `Range requirements _`.
+    The ``Value`` type must be the same as a corresponding template parameter for the `parallel_reduce algorithm `_.
 
 If both ``rvalue`` and ``lvalue`` forms are provided, the ``rvalue`` is preferred.
@@ -47,7 +47,7 @@
 
 .. cpp:function:: Value Reduction::operator()(const Value& x, const Value& y) const
 
-    Combines the ``x`` and ``y`` results. The ``Value`` type must be the same as a corresponding template parameter for the `parallel_reduce algorithm `_.
+    Combines the ``x`` and ``y`` results. The ``Value`` type must be the same as a corresponding template parameter for the `parallel_reduce algorithm `_.
 
 If both ``rvalue`` and ``lvalue`` forms are provided, the ``rvalue`` is preferred.
@@ -83,7 +83,7 @@ Example
 
 .. rubric:: See also
 
-* `oneapi::tbb::parallel_reduce specification `_
-* `oneapi::tbb::parallel_deterministic_reduce specification `_
-* `ParallelReduceFunc specification `_
-* `ParallelReduceReduction specification `_
+* `oneapi::tbb::parallel_reduce specification `_
+* `oneapi::tbb::parallel_deterministic_reduce specification `_
+* `ParallelReduceFunc specification `_
+* `ParallelReduceReduction specification `_
diff --git a/doc/main/reference/task_group_extensions.rst b/doc/main/reference/task_group_extensions.rst
index 10d3980161..47795f9574 100644
--- a/doc/main/reference/task_group_extensions.rst
+++ b/doc/main/reference/task_group_extensions.rst
@@ -13,7 +13,7 @@ task_group extensions
 Description
 ***********
 
-|full_name| implementation extends the `tbb::task_group specification `_ with the requirements for a user-provided function object.
+|full_name| implementation extends the `tbb::task_group specification `_ with the requirements for a user-provided function object.
 
 
 API
@@ -83,7 +83,7 @@
 
 As an optimization hint, ``F`` might return a ``task_handle``, which task object can be executed next.
 
 .. rubric:: See also
 
-* `oneapi::tbb::task_group specification `_
-* `oneapi::tbb::task_group_context specification `_
-* `oneapi::tbb::task_group_status specification `_
-* `oneapi::tbb::task_handle class `_
+* `oneapi::tbb::task_group specification `_
+* `oneapi::tbb::task_group_context specification `_
+* `oneapi::tbb::task_group_status specification `_
+* `oneapi::tbb::task_handle class `_
diff --git a/doc/main/reference/type_specified_message_keys.rst b/doc/main/reference/type_specified_message_keys.rst
index 3b1dbc56fa..a50cd7f434 100644
--- a/doc/main/reference/type_specified_message_keys.rst
+++ b/doc/main/reference/type_specified_message_keys.rst
@@ -66,4 +66,4 @@ lookup and used in place of the default implementation.
 
 See Also
 ********
 
-`join_node Specification `_
+`join_node Specification `_
diff --git a/doc/main/tbb_userguide/Constraints.rst b/doc/main/tbb_userguide/Constraints.rst
index d37ce12028..1928fe8eeb 100644
--- a/doc/main/tbb_userguide/Constraints.rst
+++ b/doc/main/tbb_userguide/Constraints.rst
@@ -4,7 +4,7 @@ Constrained APIs
 ================
 
 Starting from C++20, most of |full_name| APIs are constrained to
-enforce `named requirements `_ on
+enforce `named requirements `_ on
 template arguments types.
 The violations of these requirements are detected at a compile time during the template instantiation.
diff --git a/doc/main/tbb_userguide/How_Task_Scheduler_Works.rst b/doc/main/tbb_userguide/How_Task_Scheduler_Works.rst
index 5ad1670baa..744794fc07 100644
--- a/doc/main/tbb_userguide/How_Task_Scheduler_Works.rst
+++ b/doc/main/tbb_userguide/How_Task_Scheduler_Works.rst
@@ -7,7 +7,7 @@ How Task Scheduler Works
 
 While the task scheduler is not bound to any particular type of parallelism, it was designed to work
 efficiently for fork-join parallelism with lots of forks. This type of parallelism is typical for parallel algorithms such as `oneapi::tbb::parallel_for
-`_.
+`_.
 
 Let's consider the mapping of fork-join parallelism on the task scheduler in more detail.
diff --git a/doc/main/tbb_userguide/Migration_Guide/Task_Scheduler_Init.rst b/doc/main/tbb_userguide/Migration_Guide/Task_Scheduler_Init.rst
index aa8658acf8..6acdb272eb 100644
--- a/doc/main/tbb_userguide/Migration_Guide/Task_Scheduler_Init.rst
+++ b/doc/main/tbb_userguide/Migration_Guide/Task_Scheduler_Init.rst
@@ -14,27 +14,27 @@ Querying the default number of threads
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 * `oneapi::tbb::info::default_concurrency()
-  `_
+  `_
   returns the maximum concurrency that will be created by *default* in implicit or explicit ``task_arena``.
 
 * `oneapi::tbb::this_task_arena::max_concurrency()
-  `_
+  `_
   returns the maximum number of threads available for the parallel algorithms within the current
   context (or *default* if an implicit arena is not initialized)
 
 * `oneapi::tbb::global_control::active_value(tbb::global_control::max_allowed_parallelism)
-  `_
+  `_
   returns the current limit of the thread pool (or *default* if oneTBB scheduler is not initialized)
 
 Setting the maximum concurrency
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 * `task_arena(/* max_concurrency */)
-  `_
+  `_
   limits the maximum concurrency of the parallel algorithm running inside ``task_arena``
 
 * `tbb::global_control(tbb::global_control::max_allowed_parallelism, /* max_concurrency */)
-  `_
+  `_
  limits the total number of oneTBB worker threads
 
 Examples
@@ -116,7 +116,7 @@ The limited parallelism:
 
 Setting thread stack size
 ---------------------------------------
 
 Use `oneapi::tbb::global_control(oneapi::tbb::global_control::thread_stack_size, /* stack_size */)
-`_
+`_
 to set the stack size for oneTBB worker threads:
 
 .. code:: cpp
@@ -141,7 +141,7 @@ to set the stack size for oneTBB worker threads:
 
 Terminating oneTBB scheduler
 ---------------------------------------
 
-`task_scheduler_handle `_
+`task_scheduler_handle `_
 allows waiting for oneTBB worker threads completion:
 
 .. code:: cpp
diff --git a/examples/task_group/sudoku/README.md b/examples/task_group/sudoku/README.md
index 0e3ef499ea..9f59bbb446 100644
--- a/examples/task_group/sudoku/README.md
+++ b/examples/task_group/sudoku/README.md
@@ -1,4 +1,4 @@
-# Fractal sample
+# Sudoku sample
 This directory contains an example that finds all solutions to a Sudoku board. It uses a straightforward state-space search algorithm that exhibits OR-parallelism. It can be optionally run until it obtains just the first solution. The point of the example is to teach how to use the `task_group` interface.
 
 ## Building the example
@@ -11,9 +11,8 @@ cmake --build .
 ```
 
 ## Running the sample
 ### Predefined make targets
-* `make run_fractal` - executes the example with predefined parameters.
-* `make perf_run_fractal` - executes the example with suggested parameters to measure the oneTBB performance.
-* `make light_test_fractal` - executes the example with suggested parameters to reduce execution time.
+* `make run_sudoku` - executes the example with predefined parameters.
+* `make perf_run_sudoku` - executes the example with suggested parameters to measure the oneTBB performance.
 
 ### Application parameters
 Usage:
diff --git a/examples/test_all/fibonacci/README.md b/examples/test_all/fibonacci/README.md
index 3d1d795df8..f65edcece7 100644
--- a/examples/test_all/fibonacci/README.md
+++ b/examples/test_all/fibonacci/README.md
@@ -1,4 +1,4 @@
-# Fractal sample
+# Fibonacci sample
 This directory contains an example that computes Fibonacci numbers in several different ways. The purpose of the example is to exercise every include file and class in Intel® oneAPI Threading Building Blocks. Most of the computations are deliberately silly and not expected to show any speedup on multiprocessors.
 
 ## Building the example
@@ -11,9 +11,7 @@ cmake --build .
 ```
 
 ## Running the sample
 ### Predefined make targets
-* `make run_fractal` - executes the example with predefined parameters.
-* `make perf_run_fractal` - executes the example with suggested parameters to measure the oneTBB performance.
-* `make light_test_fractal` - executes the example with suggested parameters to reduce execution time.
+* `make run_fibonacci` - executes the example with predefined parameters.
### Application parameters Usage: diff --git a/include/oneapi/tbb/collaborative_call_once.h b/include/oneapi/tbb/collaborative_call_once.h index db082f891a..18e3bbb245 100644 --- a/include/oneapi/tbb/collaborative_call_once.h +++ b/include/oneapi/tbb/collaborative_call_once.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2021 Intel Corporation + Copyright (c) 2021-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -32,6 +32,27 @@ namespace d1 { #pragma warning (disable: 4324) #endif +template +class collaborative_call_stack_task : public task { + const F& m_func; + wait_context& m_wait_ctx; + + void finalize() { + m_wait_ctx.release(); + } + task* execute(d1::execution_data&) override { + task* res = d2::task_ptr_or_nullptr(m_func); + finalize(); + return res; + } + task* cancel(d1::execution_data&) override { + finalize(); + return nullptr; + } +public: + collaborative_call_stack_task(const F& f, wait_context& wctx) : m_func(f), m_wait_ctx(wctx) {} +}; + constexpr std::uintptr_t collaborative_once_max_references = max_nfs_size; constexpr std::uintptr_t collaborative_once_references_mask = collaborative_once_max_references-1; @@ -103,7 +124,7 @@ class alignas(max_nfs_size) collaborative_once_runner : no_copy { task_group_context context{ task_group_context::bound, task_group_context::default_traits | task_group_context::concurrent_wait }; - function_stack_task t{ std::forward(f), m_storage.m_wait_context }; + collaborative_call_stack_task t{ std::forward(f), m_storage.m_wait_context }; // Set the ready flag after entering the execute body to prevent // moonlighting threads from occupying all slots inside the arena. @@ -151,7 +172,7 @@ class collaborative_once_flag : no_copy { spin_wait_until_eq(m_state, expected); } while (!m_state.compare_exchange_strong(expected, desired)); } - + template void do_collaborative_call_once(Fn&& f) { std::uintptr_t expected = m_state.load(std::memory_order_acquire); diff --git a/include/oneapi/tbb/concurrent_unordered_map.h b/include/oneapi/tbb/concurrent_unordered_map.h index 336425cc8f..9cade0a94e 100644 --- a/include/oneapi/tbb/concurrent_unordered_map.h +++ b/include/oneapi/tbb/concurrent_unordered_map.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -24,14 +24,14 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { template struct concurrent_unordered_map_traits { using value_type = std::pair; using key_type = Key; using allocator_type = Allocator; - using hash_compare_type = hash_compare; + using hash_compare_type = d1::hash_compare; static constexpr bool allow_multimapping = AllowMultimapping; static constexpr const key_type& get_key( const value_type& value ) { @@ -399,13 +399,13 @@ void swap( concurrent_unordered_multimap& lhs lhs.swap(rhs); } -} // namespace d1 +} // namespace d2 } // namespace detail inline namespace v1 { -using detail::d1::concurrent_unordered_map; -using detail::d1::concurrent_unordered_multimap; +using detail::d2::concurrent_unordered_map; +using detail::d2::concurrent_unordered_multimap; using detail::split; } // inline namespace v1 diff --git a/include/oneapi/tbb/concurrent_unordered_set.h b/include/oneapi/tbb/concurrent_unordered_set.h index c135b92222..b7e4b4cafc 100644 --- a/include/oneapi/tbb/concurrent_unordered_set.h +++ b/include/oneapi/tbb/concurrent_unordered_set.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -23,14 +23,14 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { template struct concurrent_unordered_set_traits { using key_type = Key; using value_type = key_type; using allocator_type = Allocator; - using hash_compare_type = hash_compare; + using hash_compare_type = d1::hash_compare; static constexpr bool allow_multimapping = AllowMultimapping; static constexpr const key_type& get_key( const value_type& value ) { @@ -318,13 +318,13 @@ void swap( concurrent_unordered_multiset& lhs, lhs.swap(rhs); } -} // namespace d1 +} // namespace d2 } // namespace detail inline namespace v1 { -using detail::d1::concurrent_unordered_set; -using detail::d1::concurrent_unordered_multiset; +using detail::d2::concurrent_unordered_set; +using detail::d2::concurrent_unordered_multiset; using detail::split; } // inline namespace v1 diff --git a/include/oneapi/tbb/detail/_concurrent_unordered_base.h b/include/oneapi/tbb/detail/_concurrent_unordered_base.h index 408292086a..85f54d0a57 100644 --- a/include/oneapi/tbb/detail/_concurrent_unordered_base.h +++ b/include/oneapi/tbb/detail/_concurrent_unordered_base.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -46,7 +46,7 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { template class concurrent_unordered_base; @@ -171,7 +171,7 @@ class value_node : public list_node value_node( sokey_type ord_key ) : base_type(ord_key) {} ~value_node() {} value_type* storage() { - return reinterpret_cast(&my_value); + return &my_value; } value_type& value() { @@ -179,8 +179,9 @@ class value_node : public list_node } private: - using aligned_storage_type = typename std::aligned_storage::type; - aligned_storage_type my_value; + union { + value_type my_value; + }; }; // class value_node template @@ -237,7 +238,7 @@ class concurrent_unordered_base { template using is_transparent = dependent_bool, T>; public: - using node_type = node_handle; + using node_type = d1::node_handle; explicit concurrent_unordered_base( size_type bucket_count, const hasher& hash = hasher(), const key_equal& equal = key_equal(), const allocator_type& alloc = allocator_type() ) @@ -441,7 +442,7 @@ class concurrent_unordered_base { std::pair insert( node_type&& nh ) { if (!nh.empty()) { - value_node_ptr insert_node = node_handle_accessor::get_node_ptr(nh); + value_node_ptr insert_node = d1::node_handle_accessor::get_node_ptr(nh); auto init_node = [&insert_node]( sokey_type order_key )->value_node_ptr { insert_node->init(order_key); return insert_node; @@ -451,7 +452,7 @@ class concurrent_unordered_base { // If the insertion succeeded - set node handle to the empty state __TBB_ASSERT(insert_result.remaining_node == nullptr, "internal_insert_node should not return the remaining node if the insertion succeeded"); - node_handle_accessor::deactivate(nh); + d1::node_handle_accessor::deactivate(nh); } return { iterator(insert_result.node_with_equal_key), insert_result.inserted }; } @@ -521,12 +522,12 @@ class concurrent_unordered_base { node_type unsafe_extract( const_iterator pos ) { internal_extract(pos.get_node_ptr()); - return node_handle_accessor::construct(pos.get_node_ptr()); + return d1::node_handle_accessor::construct(pos.get_node_ptr()); } node_type unsafe_extract( iterator pos ) { internal_extract(pos.get_node_ptr()); - return node_handle_accessor::construct(pos.get_node_ptr()); + return d1::node_handle_accessor::construct(pos.get_node_ptr()); } node_type unsafe_extract( const key_type& key ) { @@ -787,11 +788,11 @@ class concurrent_unordered_base { static constexpr size_type pointers_per_embedded_table = sizeof(size_type) * 8 - 1; class unordered_segment_table - : public segment_table, allocator_type, unordered_segment_table, pointers_per_embedded_table> + : public d1::segment_table, allocator_type, unordered_segment_table, pointers_per_embedded_table> { using self_type = unordered_segment_table; using atomic_node_ptr = std::atomic; - using base_type = segment_table, allocator_type, unordered_segment_table, pointers_per_embedded_table>; + using base_type = d1::segment_table, allocator_type, unordered_segment_table, pointers_per_embedded_table>; using segment_type = typename base_type::segment_type; using base_allocator_type = typename base_type::allocator_type; @@ -921,7 +922,7 @@ class concurrent_unordered_base { node_allocator_traits::deallocate(dummy_node_allocator, node, 1); } else { // GCC 11.1 issues a warning here that incorrect destructor might be called for dummy_nodes - #if (__TBB_GCC_VERSION >= 110100 && __TBB_GCC_VERSION < 140000 ) && !__clang__ && !__INTEL_COMPILER + #if (__TBB_GCC_VERSION >= 110100 && __TBB_GCC_VERSION < 150000 ) && !__clang__ && !__INTEL_COMPILER volatile #endif value_node_ptr 
val_node = static_cast(node); @@ -1212,7 +1213,7 @@ class concurrent_unordered_base { // Node handle with curr cannot be used directly in insert call, because // the destructor of node_type will destroy curr - node_type curr_node = node_handle_accessor::construct(curr); + node_type curr_node = d1::node_handle_accessor::construct(curr); // If the insertion fails - return ownership of the node to the source if (!insert(std::move(curr_node)).second) { @@ -1230,7 +1231,7 @@ class concurrent_unordered_base { curr->set_next(next_node); source_prev->set_next(curr); source_prev = curr; - node_handle_accessor::deactivate(curr_node); + d1::node_handle_accessor::deactivate(curr_node); } else { source.my_size.fetch_sub(1, std::memory_order_relaxed); } @@ -1507,7 +1508,7 @@ bool operator!=( const concurrent_unordered_base& lhs, #pragma warning(pop) // warning 4127 is back #endif -} // namespace d1 +} // namespace d2 } // namespace detail } // namespace tbb diff --git a/include/oneapi/tbb/detail/_config.h b/include/oneapi/tbb/detail/_config.h index d6705e154c..e676b1558b 100644 --- a/include/oneapi/tbb/detail/_config.h +++ b/include/oneapi/tbb/detail/_config.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -335,7 +335,7 @@ #define __TBB_TSX_INTRINSICS_PRESENT (__RTM__ || __INTEL_COMPILER || (_MSC_VER>=1700 && (__TBB_x86_64 || __TBB_x86_32))) -#define __TBB_WAITPKG_INTRINSICS_PRESENT ((__INTEL_COMPILER >= 1900 || __TBB_GCC_VERSION >= 110000 || __TBB_CLANG_VERSION >= 120000) \ +#define __TBB_WAITPKG_INTRINSICS_PRESENT ((__INTEL_COMPILER >= 1900 || (__TBB_GCC_VERSION >= 110000 && __TBB_GNU_ASM_VERSION >= 2032) || __TBB_CLANG_VERSION >= 120000) \ && (_WIN32 || _WIN64 || __unix__ || __APPLE__) && (__TBB_x86_32 || __TBB_x86_64) && !__ANDROID__) /** Internal TBB features & modes **/ @@ -521,6 +521,11 @@ #define __TBB_PREVIEW_FLOW_GRAPH_NODE_SET (TBB_PREVIEW_FLOW_GRAPH_FEATURES) #endif +#ifndef __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +#define __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT (TBB_PREVIEW_FLOW_GRAPH_FEATURES \ + || TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT) +#endif + #if TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS #define __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS 1 #endif diff --git a/include/oneapi/tbb/detail/_flow_graph_body_impl.h b/include/oneapi/tbb/detail/_flow_graph_body_impl.h index 8ac11211f6..21da06ce03 100644 --- a/include/oneapi/tbb/detail/_flow_graph_body_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_body_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #error Do not #include this internal file directly; use public TBB headers instead. #endif -// included in namespace tbb::detail::d1 (in flow_graph.h) +// included in namespace tbb::detail::d2 (in flow_graph.h) typedef std::uint64_t tag_value; @@ -53,7 +53,7 @@ namespace graph_policy_namespace { // K == type of field used for key-matching. Each tag-matching port will be provided // functor that, given an object accepted by the port, will return the /// field of type K being used for matching. 
- template::type > > + template::type > > __TBB_requires(tbb::detail::hash_compare) struct key_matching { typedef K key_type; @@ -77,7 +77,7 @@ template< typename Output > class input_body : no_assign { public: virtual ~input_body() {} - virtual Output operator()(flow_control& fc) = 0; + virtual Output operator()(d1::flow_control& fc) = 0; virtual input_body* clone() = 0; }; @@ -86,7 +86,7 @@ template< typename Output, typename Body> class input_body_leaf : public input_body { public: input_body_leaf( const Body &_body ) : body(_body) { } - Output operator()(flow_control& fc) override { return body(fc); } + Output operator()(d1::flow_control& fc) override { return body(fc); } input_body_leaf* clone() override { return new input_body_leaf< Output, Body >(body); } @@ -249,12 +249,12 @@ template< typename NodeType > class forward_task_bypass : public graph_task { NodeType &my_node; public: - forward_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n + forward_task_bypass( graph& g, d1::small_object_allocator& allocator, NodeType &n , node_priority_t node_priority = no_priority ) : graph_task(g, allocator, node_priority), my_node(n) {} - task* execute(execution_data& ed) override { + d1::task* execute(d1::execution_data& ed) override { graph_task* next_task = my_node.forward_task(); if (SUCCESSFULLY_ENQUEUED == next_task) next_task = nullptr; @@ -264,7 +264,7 @@ class forward_task_bypass : public graph_task { return next_task; } - task* cancel(execution_data& ed) override { + d1::task* cancel(d1::execution_data& ed) override { finalize(ed); return nullptr; } @@ -272,29 +272,57 @@ class forward_task_bypass : public graph_task { //! A task that calls a node's apply_body_bypass function, passing in an input of type Input // return the task* unless it is SUCCESSFULLY_ENQUEUED, in which case return nullptr -template< typename NodeType, typename Input > -class apply_body_task_bypass : public graph_task { +template< typename NodeType, typename Input, typename BaseTaskType = graph_task> +class apply_body_task_bypass + : public BaseTaskType +{ NodeType &my_node; Input my_input; + + using check_metainfo = std::is_same; + using without_metainfo = std::true_type; + using with_metainfo = std::false_type; + + graph_task* call_apply_body_bypass_impl(without_metainfo) { + return my_node.apply_body_bypass(my_input + __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* call_apply_body_bypass_impl(with_metainfo) { + return my_node.apply_body_bypass(my_input, message_metainfo{this->get_msg_wait_context_vertices()}); + } +#endif + + graph_task* call_apply_body_bypass() { + return call_apply_body_bypass_impl(check_metainfo{}); + } + public: +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + apply_body_task_bypass( graph& g, d1::small_object_allocator& allocator, NodeType &n, const Input &i, + node_priority_t node_priority, Metainfo&& metainfo ) + : BaseTaskType(g, allocator, node_priority, std::forward(metainfo).waiters()) + , my_node(n), my_input(i) {} +#endif - apply_body_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n, const Input &i - , node_priority_t node_priority = no_priority - ) : graph_task(g, allocator, node_priority), - my_node(n), my_input(i) {} + apply_body_task_bypass( graph& g, d1::small_object_allocator& allocator, NodeType& n, const Input& i, + node_priority_t node_priority = no_priority ) + : BaseTaskType(g, allocator, node_priority), my_node(n), my_input(i) {} - task* 
execute(execution_data& ed) override { - graph_task* next_task = my_node.apply_body_bypass( my_input ); + d1::task* execute(d1::execution_data& ed) override { + graph_task* next_task = call_apply_body_bypass(); if (SUCCESSFULLY_ENQUEUED == next_task) next_task = nullptr; else if (next_task) next_task = prioritize_task(my_node.graph_reference(), *next_task); - finalize(ed); + BaseTaskType::template finalize(ed); return next_task; } - task* cancel(execution_data& ed) override { - finalize(ed); + d1::task* cancel(d1::execution_data& ed) override { + BaseTaskType::template finalize(ed); return nullptr; } }; @@ -304,10 +332,10 @@ template< typename NodeType > class input_node_task_bypass : public graph_task { NodeType &my_node; public: - input_node_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n ) + input_node_task_bypass( graph& g, d1::small_object_allocator& allocator, NodeType &n ) : graph_task(g, allocator), my_node(n) {} - task* execute(execution_data& ed) override { + d1::task* execute(d1::execution_data& ed) override { graph_task* next_task = my_node.apply_body_bypass( ); if (SUCCESSFULLY_ENQUEUED == next_task) next_task = nullptr; @@ -317,7 +345,7 @@ class input_node_task_bypass : public graph_task { return next_task; } - task* cancel(execution_data& ed) override { + d1::task* cancel(d1::execution_data& ed) override { finalize(ed); return nullptr; } @@ -343,6 +371,15 @@ class threshold_regulatormy_graph; } @@ -361,7 +398,14 @@ class threshold_regulator : public continue_receiver, no_ T *my_node; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + // Intentionally ignore the metainformation + // If there are more items associated with the passed metainfo to be processed, + // they should be stored in the buffer before the limiter_node + graph_task* execute(const message_metainfo&) override { +#else graph_task* execute() override { +#endif return my_node->decrement_counter( 1 ); } diff --git a/include/oneapi/tbb/detail/_flow_graph_cache_impl.h b/include/oneapi/tbb/detail/_flow_graph_cache_impl.h index 059f198055..647f3dc1b6 100644 --- a/include/oneapi/tbb/detail/_flow_graph_cache_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_cache_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #error Do not #include this internal file directly; use public TBB headers instead. #endif -// included in namespace tbb::detail::d1 (in flow_graph.h) +// included in namespace tbb::detail::d2 (in flow_graph.h) //! A node_cache maintains a std::queue of elements of type T. Each operation is protected by a lock.
template< typename T, typename M=spin_mutex > @@ -98,9 +98,12 @@ class predecessor_cache : public node_cache< sender, M > { // Do not work with the passed pointer here as it may not be fully initialized yet } - bool get_item( output_type& v ) { +private: + bool get_item_impl( output_type& v + __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo* metainfo_ptr = nullptr) ) + { - bool msg = false; + bool successful_get = false; do { predecessor_type *src; @@ -113,18 +116,35 @@ class predecessor_cache : public node_cache< sender, M > { } // Try to get from this sender - msg = src->try_get( v ); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (metainfo_ptr) { + successful_get = src->try_get( v, *metainfo_ptr ); + } else +#endif + { + successful_get = src->try_get( v ); + } - if (msg == false) { + if (successful_get == false) { // Relinquish ownership of the edge register_successor(*src, *my_owner); } else { // Retain ownership of the edge this->add(*src); } - } while ( msg == false ); - return msg; + } while ( successful_get == false ); + return successful_get; } +public: + bool get_item( output_type& v ) { + return get_item_impl(v); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool get_item( output_type& v, message_metainfo& metainfo ) { + return get_item_impl(v, &metainfo); + } +#endif // If we are removing arcs (rf_clear_edges), call clear() rather than reset(). void reset() { @@ -157,8 +177,9 @@ class reservable_predecessor_cache : public predecessor_cache< T, M > { // Do not work with the passed pointer here as it may not be fully initialized yet } - bool try_reserve( output_type &v ) { - bool msg = false; +private: + bool try_reserve_impl( output_type &v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo* metainfo) ) { + bool successful_reserve = false; do { predecessor_type* pred = nullptr; @@ -172,9 +193,16 @@ class reservable_predecessor_cache : public predecessor_cache< T, M > { } // Try to get from this sender - msg = pred->try_reserve( v ); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (metainfo) { + successful_reserve = pred->try_reserve( v, *metainfo ); + } else +#endif + { + successful_reserve = pred->try_reserve( v ); + } - if (msg == false) { + if (successful_reserve == false) { typename mutex_type::scoped_lock lock(this->my_mutex); // Relinquish ownership of the edge register_successor( *pred, *this->my_owner ); @@ -183,11 +211,21 @@ class reservable_predecessor_cache : public predecessor_cache< T, M > { // Retain ownership of the edge this->add( *pred); } - } while ( msg == false ); + } while ( successful_reserve == false ); - return msg; + return successful_reserve; + } +public: + bool try_reserve( output_type& v ) { + return try_reserve_impl(v __TBB_FLOW_GRAPH_METAINFO_ARG(nullptr)); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_reserve( output_type& v, message_metainfo& metainfo ) { + return try_reserve_impl(v, &metainfo); + } +#endif + bool try_release() { reserved_src.load(std::memory_order_relaxed)->try_release(); reserved_src.store(nullptr, std::memory_order_relaxed); @@ -268,6 +306,9 @@ class successor_cache : no_copy { } virtual graph_task* try_put_task( const T& t ) = 0; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + virtual graph_task* try_put_task( const T& t, const message_metainfo& metainfo ) = 0; +#endif }; // successor_cache //! 
An abstract cache of successors, specialized to continue_msg @@ -327,6 +368,9 @@ class successor_cache< continue_msg, M > : no_copy { } virtual graph_task* try_put_task( const continue_msg& t ) = 0; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + virtual graph_task* try_put_task( const continue_msg& t, const message_metainfo& metainfo ) = 0; +#endif }; // successor_cache< continue_msg > //! A cache of successors that are broadcast to @@ -336,19 +380,12 @@ class broadcast_cache : public successor_cache { typedef M mutex_type; typedef typename successor_cache::successors_type successors_type; -public: - - broadcast_cache( typename base_type::owner_type* owner ): base_type(owner) { - // Do not work with the passed pointer here as it may not be fully initialized yet - } - - // as above, but call try_put_task instead, and return the last task we received (if any) - graph_task* try_put_task( const T &t ) override { + graph_task* try_put_task_impl( const T& t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo) ) { graph_task * last_task = nullptr; typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true); typename successors_type::iterator i = this->my_successors.begin(); while ( i != this->my_successors.end() ) { - graph_task *new_task = (*i)->try_put_task(t); + graph_task *new_task = (*i)->try_put_task(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); // workaround for icc bug graph& graph_ref = (*i)->graph_reference(); last_task = combine_tasks(graph_ref, last_task, new_task); // enqueue if necessary @@ -365,6 +402,21 @@ class broadcast_cache : public successor_cache { } return last_task; } +public: + + broadcast_cache( typename base_type::owner_type* owner ): base_type(owner) { + // Do not work with the passed pointer here as it may not be fully initialized yet + } + + graph_task* try_put_task( const T &t ) override { + return try_put_task_impl(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task( const T &t, const message_metainfo& metainfo ) override { + return try_put_task_impl(t, metainfo); + } +#endif // call try_put_task and return list of received tasks bool gather_successful_try_puts( const T &t, graph_task_list& tasks ) { @@ -411,11 +463,15 @@ class round_robin_cache : public successor_cache { return this->my_successors.size(); } - graph_task* try_put_task( const T &t ) override { +private: + + graph_task* try_put_task_impl( const T &t + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo) ) + { typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true); typename successors_type::iterator i = this->my_successors.begin(); while ( i != this->my_successors.end() ) { - graph_task* new_task = (*i)->try_put_task(t); + graph_task* new_task = (*i)->try_put_task(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); if ( new_task ) { return new_task; } else { @@ -429,6 +485,17 @@ class round_robin_cache : public successor_cache { } return nullptr; } + +public: + graph_task* try_put_task(const T& t) override { + return try_put_task_impl(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task( const T& t, const message_metainfo& metainfo ) override { + return try_put_task_impl(t, metainfo); + } +#endif }; #endif // __TBB__flow_graph_cache_impl_H diff --git a/include/oneapi/tbb/detail/_flow_graph_impl.h b/include/oneapi/tbb/detail/_flow_graph_impl.h index 8207667f37..19e00a8ef1 100644 --- 
a/include/oneapi/tbb/detail/_flow_graph_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -30,7 +30,7 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { class graph_task; static graph_task* const SUCCESSFULLY_ENQUEUED = (graph_task*)-1; @@ -123,32 +123,98 @@ void enqueue_in_graph_arena(graph &g, graph_task& arena_task); class graph; //! Base class for tasks generated by graph nodes. -class graph_task : public task { +class graph_task : public d1::task { public: - graph_task(graph& g, small_object_allocator& allocator - , node_priority_t node_priority = no_priority - ) - : my_graph(g) - , priority(node_priority) - , my_allocator(allocator) - {} + graph_task(graph& g, d1::small_object_allocator& allocator, + node_priority_t node_priority = no_priority); + graph& my_graph; // graph instance the task belongs to // TODO revamp: rename to my_priority node_priority_t priority; template - void destruct_and_deallocate(const execution_data& ed); + void destruct_and_deallocate(const d1::execution_data& ed); protected: template - void finalize(const execution_data& ed); + void finalize(const d1::execution_data& ed); private: // To organize task_list graph_task* my_next{ nullptr }; - small_object_allocator my_allocator; + d1::small_object_allocator my_allocator; + d1::wait_tree_vertex_interface* my_reference_vertex; // TODO revamp: elaborate internal interfaces to avoid friends declarations friend class graph_task_list; friend graph_task* prioritize_task(graph& g, graph_task& gt); }; +inline bool is_this_thread_in_graph_arena(graph& g); + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +class trackable_messages_graph_task : public graph_task { +public: + trackable_messages_graph_task(graph& g, d1::small_object_allocator& allocator, + node_priority_t node_priority, + const std::forward_list& msg_waiters) + : graph_task(g, allocator, node_priority) + , my_msg_wait_context_vertices(msg_waiters) + { + auto last_iterator = my_msg_reference_vertices.cbefore_begin(); + + for (auto& msg_waiter : my_msg_wait_context_vertices) { + // If the task is created by the thread outside the graph arena, the lifetime of the thread reference vertex + may be shorter than the lifetime of the task, so the thread reference vertex approach cannot be used + and the task should be associated with the msg wait context itself + d1::wait_tree_vertex_interface* ref_vertex = is_this_thread_in_graph_arena(g) ?
+ r1::get_thread_reference_vertex(msg_waiter) : + msg_waiter; + last_iterator = my_msg_reference_vertices.emplace_after(last_iterator, + ref_vertex); + ref_vertex->reserve(1); + } + } + + trackable_messages_graph_task(graph& g, d1::small_object_allocator& allocator, + node_priority_t node_priority, + std::forward_list&& msg_waiters) + : graph_task(g, allocator, node_priority) + , my_msg_wait_context_vertices(std::move(msg_waiters)) + { + } + + const std::forward_list get_msg_wait_context_vertices() const { + return my_msg_wait_context_vertices; + } + +protected: + template + void finalize(const d1::execution_data& ed) { + auto wait_context_vertices = std::move(my_msg_wait_context_vertices); + auto msg_reference_vertices = std::move(my_msg_reference_vertices); + graph_task::finalize(ed); + + // If there are no thread reference vertices associated with the task, + // then this task was created by transferring the ownership from another metainfo + // instance (e.g. while taking from the buffer) + if (msg_reference_vertices.empty()) { + for (auto& msg_waiter : wait_context_vertices) { + msg_waiter->release(1); + } + } else { + for (auto& msg_waiter : msg_reference_vertices) { + msg_waiter->release(1); + } + } + } +private: + // Each task that holds information about a single message's wait_contexts should hold two lists + // The first one is wait_contexts associated with the message itself. They are needed + // to be able to broadcast the list of wait_contexts to the node successors while executing the task. + // The second list is a list of reference vertices for each wait_context_vertex in the first list + // to support the distributed reference counting schema + std::forward_list my_msg_wait_context_vertices; + std::forward_list my_msg_reference_vertices; +}; // class trackable_messages_graph_task +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + struct graph_task_comparator { bool operator()(const graph_task* left, const graph_task* right) { return left->priority < right->priority; @@ -157,18 +223,18 @@ typedef tbb::concurrent_priority_queue graph_task_priority_queue_t; -class priority_task_selector : public task { +class priority_task_selector : public d1::task { public: - priority_task_selector(graph_task_priority_queue_t& priority_queue, small_object_allocator& allocator) + priority_task_selector(graph_task_priority_queue_t& priority_queue, d1::small_object_allocator& allocator) : my_priority_queue(priority_queue), my_allocator(allocator), my_task() {} - task* execute(execution_data& ed) override { + task* execute(d1::execution_data& ed) override { next_task(); __TBB_ASSERT(my_task, nullptr); task* t_next = my_task->execute(ed); my_allocator.delete_object(this, ed); return t_next; } - task* cancel(execution_data& ed) override { + task* cancel(d1::execution_data& ed) override { if (!my_task) { next_task(); } @@ -190,7 +256,7 @@ class priority_task_selector : public task { } graph_task_priority_queue_t& my_priority_queue; - small_object_allocator my_allocator; + d1::small_object_allocator my_allocator; graph_task* my_task; }; @@ -281,7 +347,7 @@ class graph : no_copy, public graph_proxy { caught_exception = false; try_call([this] { my_task_arena->execute([this] { - wait(my_wait_context, *my_context); + wait(my_wait_context_vertex.get_context(), *my_context); }); cancelled = my_context->is_group_execution_cancelled(); }).on_exception([this] { @@ -332,7 +398,7 @@ class graph : no_copy, public graph_proxy { bool exception_thrown() { return caught_exception; }
private: - wait_context my_wait_context; + d1::wait_context_vertex my_wait_context_vertex; task_group_context *my_context; bool own_context; bool cancelled; @@ -349,19 +415,25 @@ class graph : no_copy, public graph_proxy { graph_task_priority_queue_t my_priority_queue; + d1::wait_context_vertex& get_wait_context_vertex() { return my_wait_context_vertex; } + friend void activate_graph(graph& g); friend void deactivate_graph(graph& g); friend bool is_graph_active(graph& g); + friend bool is_this_thread_in_graph_arena(graph& g); friend graph_task* prioritize_task(graph& g, graph_task& arena_task); friend void spawn_in_graph_arena(graph& g, graph_task& arena_task); friend void enqueue_in_graph_arena(graph &g, graph_task& arena_task); - friend class task_arena_base; + friend class d1::task_arena_base; + friend class graph_task; + template + friend class receiver; }; // class graph template -inline void graph_task::destruct_and_deallocate(const execution_data& ed) { +inline void graph_task::destruct_and_deallocate(const d1::execution_data& ed) { auto allocator = my_allocator; // TODO: investigate if direct call of derived destructor gives any benefits. this->~graph_task(); @@ -369,10 +441,27 @@ inline void graph_task::destruct_and_deallocate(const execution_data& ed) { } template -inline void graph_task::finalize(const execution_data& ed) { - graph& g = my_graph; +inline void graph_task::finalize(const d1::execution_data& ed) { + d1::wait_tree_vertex_interface* reference_vertex = my_reference_vertex; destruct_and_deallocate(ed); - g.release_wait(); + reference_vertex->release(); +} + +inline graph_task::graph_task(graph& g, d1::small_object_allocator& allocator, + node_priority_t node_priority) + : my_graph(g) + , priority(node_priority) + , my_allocator(allocator) +{ + // If the task is created by the thread outside the graph arena, the lifetime of the thread reference vertex + may be shorter than the lifetime of the task, so the thread reference vertex approach cannot be used + and the task should be associated with the graph wait context itself + // TODO: consider how reference counting can be improved for such a use case. Most common example is the async_node + d1::wait_context_vertex* graph_wait_context_vertex = &my_graph.get_wait_context_vertex(); + my_reference_vertex = is_this_thread_in_graph_arena(g) ? r1::get_thread_reference_vertex(graph_wait_context_vertex) + : graph_wait_context_vertex; + __TBB_ASSERT(my_reference_vertex, nullptr); + my_reference_vertex->reserve(); } //******************************************************************************** @@ -424,15 +513,20 @@ inline bool is_graph_active(graph& g) { return g.my_is_active; } +inline bool is_this_thread_in_graph_arena(graph& g) { + __TBB_ASSERT(g.my_task_arena && g.my_task_arena->is_active(), nullptr); + return r1::execution_slot(*g.my_task_arena) != d1::slot_id(-1); +} + inline graph_task* prioritize_task(graph& g, graph_task& gt) { if( no_priority == gt.priority ) return &gt; //! Non-preemptive priority pattern. The original task is submitted as a work item to the //! priority queue, and a new critical task is created to take and execute a work item with - //! the highest known priority. The reference counting responsibility is transferred (via - //! allocate_continuation) to the new task. - task* critical_task = gt.my_allocator.new_object(g.my_priority_queue, gt.my_allocator); + //! the highest known priority. The reference counting responsibility is transferred to + //! the new task.
+ d1::task* critical_task = gt.my_allocator.new_object(g.my_priority_queue, gt.my_allocator); __TBB_ASSERT( critical_task, "bad_alloc?" ); g.my_priority_queue.push(&gt); using tbb::detail::d1::submit; @@ -443,7 +537,7 @@ inline graph_task* prioritize_task(graph& g, graph_task& gt) { //! Spawns a task inside graph arena inline void spawn_in_graph_arena(graph& g, graph_task& arena_task) { if (is_graph_active(g)) { - task* gt = prioritize_task(g, arena_task); + d1::task* gt = prioritize_task(g, arena_task); if( !gt ) return; @@ -464,12 +558,12 @@ inline void enqueue_in_graph_arena(graph &g, graph_task& arena_task) { __TBB_ASSERT( g.my_task_arena && g.my_task_arena->is_active(), "Is graph's arena initialized and active?" ); // TODO revamp: decide on the approach that does not postpone critical task - if( task* gt = prioritize_task(g, arena_task) ) + if( d1::task* gt = prioritize_task(g, arena_task) ) submit( *gt, *g.my_task_arena, *g.my_context, /*as_critical=*/false); } } -} // namespace d1 +} // namespace d2 } // namespace detail } // namespace tbb diff --git a/include/oneapi/tbb/detail/_flow_graph_indexer_impl.h b/include/oneapi/tbb/detail/_flow_graph_indexer_impl.h index f4f55a6c7a..a743310079 100644 --- a/include/oneapi/tbb/detail/_flow_graph_indexer_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_indexer_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #error Do not #include this internal file directly; use public TBB headers instead. #endif -// included in namespace tbb::detail::d1 +// included in namespace tbb::detail::d2 #include "_flow_graph_types_impl.h" @@ -31,9 +31,9 @@ // successor.
template - graph_task* do_try_put(const T &v, void *p) { + graph_task* do_try_put(const T &v, void *p __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { typename IndexerNodeBaseType::output_type o(K, v); - return reinterpret_cast(p)->try_put_task(&o); + return reinterpret_cast(p)->try_put_task(&o __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } template @@ -41,7 +41,7 @@ template static inline void set_indexer_node_pointer(PortTuple &my_input, IndexerNodeBaseType *p, graph& g) { typedef typename std::tuple_element::type T; - graph_task* (*indexer_node_put_task)(const T&, void *) = do_try_put; + auto indexer_node_put_task = do_try_put; std::get(my_input).set_up(p, indexer_node_put_task, g); indexer_helper::template set_indexer_node_pointer(my_input, p, g); } @@ -52,7 +52,7 @@ template static inline void set_indexer_node_pointer(PortTuple &my_input, IndexerNodeBaseType *p, graph& g) { typedef typename std::tuple_element<0, TupleTypes>::type T; - graph_task* (*indexer_node_put_task)(const T&, void *) = do_try_put; + auto indexer_node_put_task = do_try_put; std::get<0>(my_input).set_up(p, indexer_node_put_task, g); } }; @@ -61,7 +61,8 @@ class indexer_input_port : public receiver { private: void* my_indexer_ptr; - typedef graph_task* (* forward_function_ptr)(T const &, void* ); + typedef graph_task* (* forward_function_ptr)(T const &, void* + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo&)); forward_function_ptr my_try_put_task; graph* my_graph; public: @@ -76,9 +77,15 @@ template friend class broadcast_cache; template friend class round_robin_cache; graph_task* try_put_task(const T &v) override { - return my_try_put_task(v, my_indexer_ptr); + return my_try_put_task(v, my_indexer_ptr __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T& v, const message_metainfo& metainfo) override { + return my_try_put_task(v, my_indexer_ptr, metainfo); + } +#endif + graph& graph_reference() const override { return *my_graph; } @@ -118,7 +125,7 @@ }; typedef indexer_node_base class_type; - class indexer_node_base_operation : public aggregated_operation { + class indexer_node_base_operation : public d1::aggregated_operation { public: char type; union { @@ -126,15 +133,23 @@ successor_type *my_succ; graph_task* bypass_t; }; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo const* metainfo; +#endif indexer_node_base_operation(const output_type* e, op_type t) : - type(char(t)), my_arg(e) {} + type(char(t)), my_arg(e) __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(nullptr)) + {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + indexer_node_base_operation(const output_type* e, op_type t, const message_metainfo& info) + : type(char(t)), my_arg(e), metainfo(&info) {} +#endif indexer_node_base_operation(const successor_type &s, op_type t) : type(char(t)), my_succ(const_cast(&s)) {} }; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator my_aggregator; void handle_operations(indexer_node_base_operation* op_list) { indexer_node_base_operation *current; @@ -153,7 +168,8 @@ current->status.store( SUCCEEDED, std::memory_order_release); break; case try__put_task: { - current->bypass_t = my_successors.try_put_task(*(current->my_arg)); + current->bypass_t = my_successors.try_put_task(*(current->my_arg) + __TBB_FLOW_GRAPH_METAINFO_ARG(*(current->metainfo))); 
current->status.store( SUCCEEDED, std::memory_order_release); // return of try_put_task actual return value } break; @@ -186,8 +202,11 @@ return op_data.status == SUCCEEDED; } - graph_task* try_put_task(output_type const *v) { // not a virtual method in this class - indexer_node_base_operation op_data(v, try__put_task); + // not a virtual method in this class + graph_task* try_put_task(output_type const *v + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { + indexer_node_base_operation op_data(v, try__put_task __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); my_aggregator.execute(&op_data); return op_data.bypass_t; } diff --git a/include/oneapi/tbb/detail/_flow_graph_item_buffer_impl.h b/include/oneapi/tbb/detail/_flow_graph_item_buffer_impl.h index 423033b1d5..cf7c54b852 100644 --- a/include/oneapi/tbb/detail/_flow_graph_item_buffer_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_item_buffer_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -37,8 +37,14 @@ class item_buffer { typedef T item_type; enum buffer_item_state { no_item=0, has_item=1, reserved_item=2 }; protected: + struct aligned_space_item { + item_type item; + buffer_item_state state; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo metainfo; +#endif + }; typedef size_t size_type; - typedef std::pair aligned_space_item; typedef aligned_space buffer_item_type; typedef typename allocator_traits::template rebind_alloc allocator_type; buffer_item_type *my_array; @@ -49,45 +55,89 @@ class item_buffer { bool buffer_empty() const { return my_head == my_tail; } - aligned_space_item &item(size_type i) { - __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->second))%alignment_of::value), nullptr); - __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->first))%alignment_of::value), nullptr); + aligned_space_item &element(size_type i) { + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->state))%alignment_of::value), nullptr); + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->item))%alignment_of::value), nullptr); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->metainfo))%alignment_of::value), nullptr); +#endif return *my_array[i & (my_array_size - 1) ].begin(); } - const aligned_space_item &item(size_type i) const { - __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->second))%alignment_of::value), nullptr); - __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->first))%alignment_of::value), nullptr); + const aligned_space_item &element(size_type i) const { + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->state))%alignment_of::value), nullptr); + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->item))%alignment_of::value), nullptr); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->metainfo))%alignment_of::value), nullptr); +#endif return *my_array[i & (my_array_size-1)].begin(); } - bool my_item_valid(size_type i) const { return (i < my_tail) && (i >= my_head) && (item(i).second != no_item); } + bool my_item_valid(size_type i) const { return (i < my_tail) && (i >= my_head) && (element(i).state != no_item); } #if TBB_USE_ASSERT - bool my_item_reserved(size_type 
i) const { return item(i).second == reserved_item; } + bool my_item_reserved(size_type i) const { return element(i).state == reserved_item; } #endif // object management in buffer const item_type &get_my_item(size_t i) const { __TBB_ASSERT(my_item_valid(i),"attempt to get invalid item"); - item_type* itm = const_cast(reinterpret_cast(&item(i).first)); - return *itm; + return element(i).item; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo& get_my_metainfo(size_t i) { + __TBB_ASSERT(my_item_valid(i), "attempt to get invalid item"); + return element(i).metainfo; + } +#endif + // may be called with an empty slot or a slot that has already been constructed into. - void set_my_item(size_t i, const item_type &o) { - if(item(i).second != no_item) { + void set_my_item(size_t i, const item_type &o + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { + if(element(i).state != no_item) { destroy_item(i); } - new(&(item(i).first)) item_type(o); - item(i).second = has_item; + new(&(element(i).item)) item_type(o); + element(i).state = has_item; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + new(&element(i).metainfo) message_metainfo(metainfo); + + for (auto& waiter : metainfo.waiters()) { + waiter->reserve(1); + } +#endif + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + void set_my_item(size_t i, const item_type& o, message_metainfo&& metainfo) { + if(element(i).state != no_item) { + destroy_item(i); + } + + new(&(element(i).item)) item_type(o); + new(&element(i).metainfo) message_metainfo(std::move(metainfo)); + // Skipping the reservation on metainfo.waiters since the ownership + // is moving from metainfo to the cache + element(i).state = has_item; } +#endif // destructively-fetch an object from the buffer +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + void fetch_item(size_t i, item_type& o, message_metainfo& metainfo) { + __TBB_ASSERT(my_item_valid(i), "Trying to fetch an empty slot"); + o = get_my_item(i); // could have std::move assign semantics + metainfo = std::move(get_my_metainfo(i)); + destroy_item(i); + } +#else void fetch_item(size_t i, item_type &o) { __TBB_ASSERT(my_item_valid(i), "Trying to fetch an empty slot"); o = get_my_item(i); // could have std::move assign semantics destroy_item(i); } +#endif // move an existing item from one slot to another. The moved-to slot must be unoccupied, // the moved-from slot must exist and not be reserved. Afterward, the from slot will be empty, @@ -95,12 +145,22 @@ class item_buffer { void move_item(size_t to, size_t from) { __TBB_ASSERT(!my_item_valid(to), "Trying to move to a non-empty slot"); __TBB_ASSERT(my_item_valid(from), "Trying to move from an empty slot"); - set_my_item(to, get_my_item(from)); // could have std::move semantics + // could have std::move semantics + set_my_item(to, get_my_item(from) __TBB_FLOW_GRAPH_METAINFO_ARG(get_my_metainfo(from))); destroy_item(from); - } // put an item in an empty slot.
Return true if successful, else false +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + bool place_item(size_t here, const item_type &me, Metainfo&& metainfo) { +#if !TBB_DEPRECATED_SEQUENCER_DUPLICATES + if(my_item_valid(here)) return false; +#endif + set_my_item(here, me, std::forward(metainfo)); + return true; + } +#else bool place_item(size_t here, const item_type &me) { #if !TBB_DEPRECATED_SEQUENCER_DUPLICATES if(my_item_valid(here)) return false; @@ -108,19 +168,36 @@ class item_buffer { set_my_item(here, me); return true; } +#endif // could be implemented with std::move semantics void swap_items(size_t i, size_t j) { __TBB_ASSERT(my_item_valid(i) && my_item_valid(j), "attempt to swap invalid item(s)"); item_type temp = get_my_item(i); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo temp_metainfo = get_my_metainfo(i); + set_my_item(i, get_my_item(j), get_my_metainfo(j)); + set_my_item(j, temp, temp_metainfo); +#else set_my_item(i, get_my_item(j)); set_my_item(j, temp); +#endif } void destroy_item(size_type i) { __TBB_ASSERT(my_item_valid(i), "destruction of invalid item"); - item(i).first.~item_type(); - item(i).second = no_item; + + auto& e = element(i); + e.item.~item_type(); + e.state = no_item; + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (auto& msg_waiter : e.metainfo.waiters()) { + msg_waiter->release(1); + } + + e.metainfo.~message_metainfo(); +#endif } // returns the front element const item_type& front() const { @@ -130,6 +207,14 @@ class item_buffer { return get_my_item(my_head); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + const message_metainfo& front_metainfo() const + { + __TBB_ASSERT(my_item_valid(my_head), "attempt to fetch head non-item"); + return element(my_head).metainfo; + } +#endif + // returns the back element const item_type& back() const { @@ -137,9 +222,23 @@ return get_my_item(my_tail - 1); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + const message_metainfo& back_metainfo() const { + __TBB_ASSERT(my_item_valid(my_tail - 1), "attempt to fetch tail non-item"); + return element(my_tail - 1).metainfo; + } +#endif + // following methods are for reservation of the front of a buffer.
- void reserve_item(size_type i) { __TBB_ASSERT(my_item_valid(i) && !my_item_reserved(i), "item cannot be reserved"); item(i).second = reserved_item; } - void release_item(size_type i) { __TBB_ASSERT(my_item_reserved(i), "item is not reserved"); item(i).second = has_item; } + void reserve_item(size_type i) { + __TBB_ASSERT(my_item_valid(i) && !my_item_reserved(i), "item cannot be reserved"); + element(i).state = reserved_item; + } + + void release_item(size_type i) { + __TBB_ASSERT(my_item_reserved(i), "item is not reserved"); + element(i).state = has_item; + } void destroy_front() { destroy_item(my_head); ++my_head; } void destroy_back() { destroy_item(my_tail-1); --my_tail; } @@ -163,14 +262,18 @@ class item_buffer { buffer_item_type* new_array = allocator_type().allocate(new_size); // initialize validity to "no" - for( size_type i=0; isecond = no_item; } + for( size_type i=0; istate = no_item; } for( size_type i=my_head; ifirst); + char *new_space = (char *)&(new_array[i&(new_size-1)].begin()->item); (void)new(new_space) item_type(get_my_item(i)); - new_array[i&(new_size-1)].begin()->second = item(i).second; + new_array[i&(new_size-1)].begin()->state = element(i).state; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + char* meta_space = (char *)&(new_array[i&(new_size-1)].begin()->metainfo); + ::new(meta_space) message_metainfo(std::move(element(i).metainfo)); +#endif } } @@ -180,33 +283,61 @@ class item_buffer { my_array_size = new_size; } - bool push_back(item_type &v) { - if(buffer_full()) { + bool push_back(item_type& v + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { + if (buffer_full()) { grow_my_array(size() + 1); } - set_my_item(my_tail, v); + set_my_item(my_tail, v __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); ++my_tail; return true; } - bool pop_back(item_type &v) { - if (!my_item_valid(my_tail-1)) { + bool pop_back(item_type& v + __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& metainfo)) + { + if (!my_item_valid(my_tail - 1)) { return false; } - v = this->back(); + auto& e = element(my_tail - 1); + v = e.item; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + metainfo = std::move(e.metainfo); +#endif + destroy_back(); return true; } - bool pop_front(item_type &v) { - if(!my_item_valid(my_head)) { + bool pop_front(item_type& v + __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& metainfo)) + { + if (!my_item_valid(my_head)) { return false; } - v = this->front(); + auto& e = element(my_head); + v = e.item; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + metainfo = std::move(e.metainfo); +#endif + destroy_front(); return true; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool pop_back(item_type& v) { + message_metainfo metainfo; + return pop_back(v, metainfo); + } + + bool pop_front(item_type& v) { + message_metainfo metainfo; + return pop_front(v, metainfo); + } +#endif + // This is used both for reset and for grow_my_array. In the case of grow_my_array // we want to retain the values of the head and tail. 
void clean_up_buffer(bool reset_pointers) { @@ -261,6 +392,18 @@ class reservable_item_buffer : public item_buffer { return true; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool reserve_front(T& v, message_metainfo& metainfo) { + if (my_reserved || !my_item_valid(this->my_head)) return false; + my_reserved = true; + // reserving the head + v = this->front(); + metainfo = this->front_metainfo(); + this->reserve_item(this->my_head); + return true; + } +#endif + void consume_front() { __TBB_ASSERT(my_reserved, "Attempt to consume a non-reserved item"); this->destroy_front(); diff --git a/include/oneapi/tbb/detail/_flow_graph_join_impl.h b/include/oneapi/tbb/detail/_flow_graph_join_impl.h index 5515421ede..8bca9a2c41 100644 --- a/include/oneapi/tbb/detail/_flow_graph_join_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_join_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #error Do not #include this internal file directly; use public TBB headers instead. #endif -// included into namespace tbb::detail::d1 +// included into namespace tbb::detail::d2 struct forwarding_base : no_assign { forwarding_base(graph &g) : graph_ref(g) {} @@ -89,17 +89,49 @@ return true; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static inline bool reserve(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { + message_metainfo element_metainfo; + if (!std::get(my_input).reserve(std::get(out), element_metainfo)) return false; + if (!join_helper::reserve(my_input, out, metainfo)) { + release_my_reservation(my_input); + return false; + } + metainfo.merge(element_metainfo); + return true; + + } +#endif + template static inline bool get_my_item( InputTuple &my_input, OutputTuple &out) { bool res = std::get(my_input).get_item(std::get(out) ); // may fail return join_helper::get_my_item(my_input, out) && res; // do get on other inputs before returning } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static inline bool get_my_item(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { + message_metainfo element_metainfo; + bool res = std::get(my_input).get_item(std::get(out), element_metainfo); + metainfo.merge(element_metainfo); + return join_helper::get_my_item(my_input, out, metainfo) && res; + } +#endif + template static inline bool get_items(InputTuple &my_input, OutputTuple &out) { return get_my_item(my_input, out); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static inline bool get_items(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { + return get_my_item(my_input, out, metainfo); + } +#endif + template static inline void reset_my_port(InputTuple &my_input) { join_helper::reset_my_port(my_input); @@ -163,16 +195,43 @@ return std::get<0>( my_input ).reserve( std::get<0>( out ) ); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static inline bool reserve(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { + message_metainfo element_metainfo; + bool result = std::get<0>(my_input).reserve(std::get<0>(out), element_metainfo); + metainfo.merge(element_metainfo); + return result; + } +#endif + template static inline bool get_my_item( InputTuple &my_input, OutputTuple &out) { return std::get<0>(my_input).get_item(std::get<0>(out)); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT 
+ template + static inline bool get_my_item(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { + message_metainfo element_metainfo; + bool res = std::get<0>(my_input).get_item(std::get<0>(out), element_metainfo); + metainfo.merge(element_metainfo); + return res; + } +#endif + template static inline bool get_items(InputTuple &my_input, OutputTuple &out) { return get_my_item(my_input, out); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static inline bool get_items(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { + return get_my_item(my_input, out, metainfo); + } +#endif + template static inline void reset_my_port(InputTuple &my_input) { std::get<0>(my_input).reset_port(); @@ -216,23 +275,31 @@ }; typedef reserving_port class_type; - class reserving_port_operation : public aggregated_operation { + class reserving_port_operation : public d1::aggregated_operation { public: char type; union { T *my_arg; predecessor_type *my_pred; }; - reserving_port_operation(const T& e, op_type t) : - type(char(t)), my_arg(const_cast(&e)) {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo; +#endif + reserving_port_operation(const T& e, op_type t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& info)) : + type(char(t)), my_arg(const_cast(&e)) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(&info)) {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + reserving_port_operation(const T& e, op_type t) + : type(char(t)), my_arg(const_cast(&e)), metainfo(nullptr) {} +#endif reserving_port_operation(const predecessor_type &s, op_type t) : type(char(t)), my_pred(const_cast(&s)) {} reserving_port_operation(op_type t) : type(char(t)) {} }; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator my_aggregator; void handle_operations(reserving_port_operation* op_list) { reserving_port_operation *current; @@ -262,14 +329,26 @@ if ( reserved ) { current->status.store( FAILED, std::memory_order_release); } - else if ( my_predecessors.try_reserve( *(current->my_arg) ) ) { - reserved = true; - current->status.store( SUCCEEDED, std::memory_order_release); - } else { - if ( my_predecessors.empty() ) { - my_join->increment_port_count(); + else { + bool reserve_result = false; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (current->metainfo) { + reserve_result = my_predecessors.try_reserve(*(current->my_arg), + *(current->metainfo)); + } else +#endif + { + reserve_result = my_predecessors.try_reserve(*(current->my_arg)); + } + if (reserve_result) { + reserved = true; + current->status.store( SUCCEEDED, std::memory_order_release); + } else { + if ( my_predecessors.empty() ) { + my_join->increment_port_count(); + } + current->status.store( FAILED, std::memory_order_release); } - current->status.store( FAILED, std::memory_order_release); } break; case rel_res: @@ -294,6 +373,10 @@ return nullptr; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T&, const message_metainfo&) override { return nullptr; } +#endif + graph& graph_reference() const override { return my_join->graph_ref; } @@ -333,6 +416,14 @@ return op_data.status == SUCCEEDED; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool reserve( T& v, message_metainfo& metainfo ) { + reserving_port_operation op_data(v, res_item, metainfo); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } 
+#endif + //! Release the port void release( ) { reserving_port_operation op_data(rel_res); @@ -376,31 +467,42 @@ enum op_type { get__item, res_port, try__put_task }; - class queueing_port_operation : public aggregated_operation { + class queueing_port_operation : public d1::aggregated_operation { public: char type; T my_val; T* my_arg; graph_task* bypass_t; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo; +#endif // constructor for value parameter - queueing_port_operation(const T& e, op_type t) : - type(char(t)), my_val(e), my_arg(nullptr) + queueing_port_operation(const T& e, op_type t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& info)) + : type(char(t)), my_val(e), my_arg(nullptr) , bypass_t(nullptr) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(const_cast(&info))) {} // constructor for pointer parameter - queueing_port_operation(const T* p, op_type t) : + queueing_port_operation(const T* p, op_type t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& info)) : type(char(t)), my_arg(const_cast(p)) , bypass_t(nullptr) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(&info)) + {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + queueing_port_operation(const T* p, op_type t) + : type(char(t)), my_arg(const_cast(p)), bypass_t(nullptr), metainfo(nullptr) {} +#endif // constructor with no parameter queueing_port_operation(op_type t) : type(char(t)), my_arg(nullptr) , bypass_t(nullptr) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(nullptr)) {} }; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator my_aggregator; void handle_operations(queueing_port_operation* op_list) { queueing_port_operation *current; @@ -412,7 +514,12 @@ case try__put_task: { graph_task* rtask = nullptr; was_empty = this->buffer_empty(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(current->metainfo, nullptr); + this->push_back(current->my_val, *(current->metainfo)); +#else this->push_back(current->my_val); +#endif if (was_empty) rtask = my_join->decrement_port_count(false); else rtask = SUCCESSFULLY_ENQUEUED; @@ -424,6 +531,11 @@ if(!this->buffer_empty()) { __TBB_ASSERT(current->my_arg, nullptr); *(current->my_arg) = this->front(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (current->metainfo) { + *(current->metainfo) = this->front_metainfo(); + } +#endif current->status.store( SUCCEEDED, std::memory_order_release); } else { @@ -447,14 +559,27 @@ template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; - graph_task* try_put_task(const T &v) override { - queueing_port_operation op_data(v, try__put_task); + + private: + graph_task* try_put_task_impl(const T& v __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { + queueing_port_operation op_data(v, try__put_task __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); my_aggregator.execute(&op_data); __TBB_ASSERT(op_data.status == SUCCEEDED || !op_data.bypass_t, "inconsistent return from aggregator"); if(!op_data.bypass_t) return SUCCESSFULLY_ENQUEUED; return op_data.bypass_t; } + protected: + graph_task* try_put_task(const T &v) override { + return try_put_task_impl(v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T& v, const message_metainfo& metainfo) override { + return try_put_task_impl(v, 
metainfo); + } +#endif + graph& graph_reference() const override { return my_join->graph_ref; } @@ -481,6 +606,14 @@ return op_data.status == SUCCEEDED; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool get_item( T& v, message_metainfo& metainfo ) { + queueing_port_operation op_data(&v, get__item, metainfo); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } +#endif + // reset_port is called when item is accepted by successor, but // is initiated by join_node. void reset_port() { @@ -517,13 +650,23 @@ const K& operator()(const table_item_type& v) { return v.my_key; } }; + template + struct key_matching_port_base { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + using type = metainfo_hash_buffer; +#else + using type = hash_buffer; +#endif + }; + // the ports can have only one template parameter. We wrap the types needed in // a traits type template< class TraitsType > class key_matching_port : public receiver, - public hash_buffer< typename TraitsType::K, typename TraitsType::T, typename TraitsType::TtoK, - typename TraitsType::KHash > { + public key_matching_port_base< typename TraitsType::K, typename TraitsType::T, typename TraitsType::TtoK, + typename TraitsType::KHash >::type + { public: typedef TraitsType traits; typedef key_matching_port class_type; @@ -533,7 +676,7 @@ typedef typename receiver::predecessor_type predecessor_type; typedef typename TraitsType::TtoK type_to_key_func_type; typedef typename TraitsType::KHash hash_compare_type; - typedef hash_buffer< key_type, input_type, type_to_key_func_type, hash_compare_type > buffer_type; + typedef typename key_matching_port_base::type buffer_type; private: // ----------- Aggregator ------------ @@ -541,24 +684,33 @@ enum op_type { try__put, get__item, res_port }; - class key_matching_port_operation : public aggregated_operation { + class key_matching_port_operation : public d1::aggregated_operation { public: char type; input_type my_val; input_type *my_arg; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo = nullptr; +#endif // constructor for value parameter - key_matching_port_operation(const input_type& e, op_type t) : - type(char(t)), my_val(e), my_arg(nullptr) {} + key_matching_port_operation(const input_type& e, op_type t + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& info)) + : type(char(t)), my_val(e), my_arg(nullptr) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(const_cast(&info))) {} + // constructor for pointer parameter - key_matching_port_operation(const input_type* p, op_type t) : - type(char(t)), my_arg(const_cast(p)) {} + key_matching_port_operation(const input_type* p, op_type t + __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& info)) + : type(char(t)), my_arg(const_cast(p)) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(&info)) {} + // constructor with no parameter key_matching_port_operation(op_type t) : type(char(t)), my_arg(nullptr) {} }; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator my_aggregator; void handle_operations(key_matching_port_operation* op_list) { key_matching_port_operation *current; @@ -567,18 +719,35 @@ op_list = op_list->next; switch(current->type) { case try__put: { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(current->metainfo, nullptr); + bool was_inserted = this->insert_with_key(current->my_val, *(current->metainfo)); +#else bool was_inserted = 
this->insert_with_key(current->my_val); +#endif // return failure if a duplicate insertion occurs current->status.store( was_inserted ? SUCCEEDED : FAILED, std::memory_order_release); } break; - case get__item: + case get__item: { // use current_key from FE for item __TBB_ASSERT(current->my_arg, nullptr); - if(!this->find_with_key(my_join->current_key, *(current->my_arg))) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(current->metainfo, nullptr); + bool find_result = this->find_with_key(my_join->current_key, *(current->my_arg), + *(current->metainfo)); +#else + bool find_result = this->find_with_key(my_join->current_key, *(current->my_arg)); +#endif +#if TBB_USE_DEBUG + if (!find_result) { __TBB_ASSERT(false, "Failed to find item corresponding to current_key."); } +#else + tbb::detail::suppress_unused_warning(find_result); +#endif current->status.store( SUCCEEDED, std::memory_order_release); + } break; case res_port: // use current_key from FE for item @@ -593,17 +762,28 @@ template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; - graph_task* try_put_task(const input_type& v) override { - key_matching_port_operation op_data(v, try__put); + private: + graph_task* try_put_task_impl(const input_type& v __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { + key_matching_port_operation op_data(v, try__put __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); graph_task* rtask = nullptr; my_aggregator.execute(&op_data); if(op_data.status == SUCCEEDED) { - rtask = my_join->increment_key_count((*(this->get_key_func()))(v)); // may spawn + rtask = my_join->increment_key_count((*(this->get_key_func()))(v)); // may spawn // rtask has to reflect the return status of the try_put if(!rtask) rtask = SUCCESSFULLY_ENQUEUED; } return rtask; } + protected: + graph_task* try_put_task(const input_type& v) override { + return try_put_task_impl(v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const input_type& v, const message_metainfo& metainfo) override { + return try_put_task_impl(v, metainfo); + } +#endif graph& graph_reference() const override { return my_join->graph_ref; @@ -640,6 +820,15 @@ return op_data.status == SUCCEEDED; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool get_item( input_type& v, message_metainfo& metainfo ) { + // aggregator uses current_key from FE for Key + key_matching_port_operation op_data(&v, get__item, metainfo); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } +#endif + // reset_port is called when item is accepted by successor, but // is initiated by join_node. 
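// Aside: the ports above attach a message_metainfo to each element and merge
// the per-port metainfo when a tuple is assembled. A minimal self-contained
// sketch of that merge semantics, assuming metainfo is essentially a list of
// wait-tree vertices (wait_vertex stands in for d1::wait_tree_vertex_interface;
// the real type differs in detail):
#include <forward_list>

struct wait_vertex {
    void reserve(unsigned n) { /* add n references held by in-flight messages */ }
    void release(unsigned n) { /* drop n references; may unblock the waiter */ }
};

class metainfo_sketch {
    std::forward_list<wait_vertex*> my_waiters;
public:
    const std::forward_list<wait_vertex*>& waiters() const { return my_waiters; }
    bool empty() const { return my_waiters.empty(); }
    // Merging keeps the waiters of both messages, so a tuple built from N
    // inputs holds every try_put_and_wait() caller until it is consumed.
    void merge(const metainfo_sketch& other) {
        my_waiters.insert_after(my_waiters.before_begin(),
                                other.my_waiters.begin(), other.my_waiters.end());
    }
};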
void reset_port() { @@ -695,10 +884,9 @@ graph_task* decrement_port_count() override { if(ports_with_no_inputs.fetch_sub(1) == 1) { if(is_graph_active(this->graph_ref)) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass task_type; graph_task* t = allocator.new_object(graph_ref, allocator, *my_node); - graph_ref.reserve_wait(); spawn_in_graph_arena(this->graph_ref, *t); } } @@ -726,6 +914,13 @@ return join_helper::reserve(my_inputs, out); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_to_make_tuple(output_type &out, message_metainfo& metainfo) { + if (ports_with_no_inputs) return false; + return join_helper::reserve(my_inputs, out, metainfo); + } +#endif + void tuple_accepted() { join_helper::consume_reservations(my_inputs); } @@ -768,10 +963,9 @@ { if(ports_with_no_items.fetch_sub(1) == 1) { if(is_graph_active(this->graph_ref)) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass task_type; graph_task* t = allocator.new_object(graph_ref, allocator, *my_node); - graph_ref.reserve_wait(); if( !handle_task ) return t; spawn_in_graph_arena(this->graph_ref, *t); @@ -800,6 +994,13 @@ return join_helper::get_items(my_inputs, out); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_to_make_tuple(output_type &out, message_metainfo& metainfo) { + if(ports_with_no_items) return false; + return join_helper::get_items(my_inputs, out, metainfo); + } +#endif + void tuple_accepted() { reset_port_count(); join_helper::reset_ports(my_inputs); @@ -854,23 +1055,30 @@ enum op_type { res_count, inc_count, may_succeed, try_make }; typedef join_node_FE, InputTuple, OutputTuple> class_type; - class key_matching_FE_operation : public aggregated_operation { + class key_matching_FE_operation : public d1::aggregated_operation { public: char type; unref_key_type my_val; output_type* my_output; graph_task* bypass_t; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo = nullptr; +#endif // constructor for value parameter key_matching_FE_operation(const unref_key_type& e , op_type t) : type(char(t)), my_val(e), my_output(nullptr), bypass_t(nullptr) {} key_matching_FE_operation(output_type *p, op_type t) : type(char(t)), my_output(p), bypass_t(nullptr) {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + key_matching_FE_operation(output_type *p, op_type t, message_metainfo& info) + : type(char(t)), my_output(p), bypass_t(nullptr), metainfo(&info) {} +#endif // constructor with no parameter key_matching_FE_operation(op_type t) : type(char(t)), my_output(nullptr), bypass_t(nullptr) {} }; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator my_aggregator; // called from aggregator, so serialized // returns a task pointer if the a task would have been enqueued but we asked that @@ -881,13 +1089,15 @@ bool do_fwd = this->buffer_empty() && is_graph_active(this->graph_ref); this->current_key = t; this->delete_with_key(this->current_key); // remove the key - if(join_helper::get_items(my_inputs, l_out)) { // <== call back - this->push_back(l_out); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo metainfo; +#endif + if(join_helper::get_items(my_inputs, l_out __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo))) { // <== call back + this->push_back(l_out __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); if(do_fwd) { // we 
enqueue if receiving an item from predecessor, not if successor asks for item - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass task_type; rtask = allocator.new_object(this->graph_ref, allocator, *my_node); - this->graph_ref.reserve_wait(); do_fwd = false; } // retire the input values @@ -937,6 +1147,11 @@ } else { *(current->my_output) = this->front(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (current->metainfo) { + *(current->metainfo) = this->front_metainfo(); + } +#endif current->status.store( SUCCEEDED, std::memory_order_release); } break; @@ -1010,6 +1225,14 @@ return op_data.status == SUCCEEDED; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_to_make_tuple(output_type &out, message_metainfo& metainfo) { + key_matching_FE_operation op_data(&out, try_make, metainfo); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } +#endif + void tuple_accepted() { reset_port_count(); // reset current_key after ports reset. } @@ -1044,7 +1267,7 @@ }; typedef join_node_base class_type; - class join_node_base_operation : public aggregated_operation { + class join_node_base_operation : public d1::aggregated_operation { public: char type; union { @@ -1052,17 +1275,25 @@ successor_type *my_succ; }; graph_task* bypass_t; - join_node_base_operation(const output_type& e, op_type t) : type(char(t)), - my_arg(const_cast(&e)), bypass_t(nullptr) {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo; +#endif + join_node_base_operation(const output_type& e, op_type t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& info)) + : type(char(t)), my_arg(const_cast(&e)), bypass_t(nullptr) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(&info)) {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + join_node_base_operation(const output_type& e, op_type t) + : type(char(t)), my_arg(const_cast(&e)), bypass_t(nullptr), metainfo(nullptr) {} +#endif join_node_base_operation(const successor_type &s, op_type t) : type(char(t)), my_succ(const_cast(&s)), bypass_t(nullptr) {} join_node_base_operation(op_type t) : type(char(t)), bypass_t(nullptr) {} }; - typedef aggregating_functor handler_type; - friend class aggregating_functor; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; bool forwarder_busy; - aggregator my_aggregator; + d1::aggregator my_aggregator; void handle_operations(join_node_base_operation* op_list) { join_node_base_operation *current; @@ -1073,10 +1304,9 @@ case reg_succ: { my_successors.register_successor(*(current->my_succ)); if(tuple_build_may_succeed() && !forwarder_busy && is_graph_active(my_graph)) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass< join_node_base > task_type; graph_task* t = allocator.new_object(my_graph, allocator, *this); - my_graph.reserve_wait(); spawn_in_graph_arena(my_graph, *t); forwarder_busy = true; } @@ -1089,7 +1319,26 @@ break; case try__get: if(tuple_build_may_succeed()) { - if(try_to_make_tuple(*(current->my_arg))) { + bool make_tuple_result = false; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (current->metainfo) { + make_tuple_result = try_to_make_tuple(*(current->my_arg), *(current->metainfo)); + } else +#endif + { + make_tuple_result = try_to_make_tuple(*(current->my_arg)); + } + if(make_tuple_result) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (current->metainfo) { + // Since elements would be removed from queues while calling to 
tuple_accepted + // together with corresponding message_metainfo objects + // we need to prolong the wait until the successor would create a task for removed elements + for (auto waiter : current->metainfo->waiters()) { + waiter->reserve(1); + } + } +#endif tuple_accepted(); current->status.store( SUCCEEDED, std::memory_order_release); } @@ -1110,9 +1359,14 @@ // them from the input ports after forwarding is complete? if(tuple_build_may_succeed()) { // checks output queue of FE do { - build_succeeded = try_to_make_tuple(out); // fetch front_end of queue +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo metainfo; +#endif + // fetch front_end of queue + build_succeeded = try_to_make_tuple(out __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); if(build_succeeded) { - graph_task *new_task = my_successors.try_put_task(out); + graph_task *new_task = + my_successors.try_put_task(out __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); last_task = combine_tasks(my_graph, last_task, new_task); if(new_task) { tuple_accepted(); @@ -1175,6 +1429,14 @@ return op_data.status == SUCCEEDED; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_get( output_type &v, message_metainfo& metainfo) override { + join_node_base_operation op_data(v, try__get, metainfo); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } +#endif + protected: void reset_node(reset_flags f) override { input_ports_type::reset(f); diff --git a/include/oneapi/tbb/detail/_flow_graph_node_impl.h b/include/oneapi/tbb/detail/_flow_graph_node_impl.h index b79c53ddbf..336cb069c6 100644 --- a/include/oneapi/tbb/detail/_flow_graph_node_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_node_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -34,6 +34,12 @@ class function_input_queue : public item_buffer { return this->item_buffer::front(); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + const message_metainfo& front_metainfo() const { + return this->item_buffer::front_metainfo(); + } +#endif + void pop() { this->destroy_front(); } @@ -41,6 +47,12 @@ class function_input_queue : public item_buffer { bool push( T& t ) { return this->push_back( t ); } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool push( T& t, const message_metainfo& metainfo ) { + return this->push_back(t, metainfo); + } +#endif }; //! Input and scheduling for a function node that takes a type Input as input @@ -87,11 +99,14 @@ class function_input_base : public receiver, no_assign { } graph_task* try_put_task( const input_type& t) override { - if ( my_is_no_throw ) - return try_put_task_impl(t, has_policy()); - else - return try_put_task_impl(t, std::false_type()); + return try_put_task_base(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task( const input_type& t, const message_metainfo& metainfo ) override { + return try_put_task_base(t, metainfo); } +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT //! Adds src to the list of cached predecessors. 
bool register_predecessor( predecessor_type &src ) override { @@ -148,9 +163,12 @@ class function_input_base : public receiver, no_assign { private: friend class apply_body_task_bypass< class_type, input_type >; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + friend class apply_body_task_bypass< class_type, input_type, trackable_messages_graph_task >; +#endif friend class forward_task_bypass< class_type >; - class operation_type : public aggregated_operation< operation_type > { + class operation_type : public d1::aggregated_operation< operation_type > { public: char type; union { @@ -158,31 +176,49 @@ class function_input_base : public receiver, no_assign { predecessor_type *r; }; graph_task* bypass_t; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo; +#endif operation_type(const input_type& e, op_type t) : - type(char(t)), elem(const_cast(&e)), bypass_t(nullptr) {} + type(char(t)), elem(const_cast(&e)), bypass_t(nullptr) +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + , metainfo(nullptr) +#endif + {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + operation_type(const input_type& e, op_type t, const message_metainfo& info) : + type(char(t)), elem(const_cast(&e)), bypass_t(nullptr), + metainfo(const_cast(&info)) {} +#endif operation_type(op_type t) : type(char(t)), r(nullptr), bypass_t(nullptr) {} }; bool forwarder_busy; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator< handler_type, operation_type > my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator< handler_type, operation_type > my_aggregator; graph_task* perform_queued_requests() { graph_task* new_task = nullptr; if(my_queue) { if(!my_queue->empty()) { ++my_concurrency; - new_task = create_body_task(my_queue->front()); + // TODO: consider removing metainfo from the queue using move semantics to avoid + // ref counter increase + new_task = create_body_task(my_queue->front() + __TBB_FLOW_GRAPH_METAINFO_ARG(my_queue->front_metainfo())); my_queue->pop(); } } else { input_type i; - if(my_predecessors.get_item(i)) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo metainfo; +#endif + if(my_predecessors.get_item(i __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo))) { ++my_concurrency; - new_task = create_body_task(i); + new_task = create_body_task(i __TBB_FLOW_GRAPH_METAINFO_ARG(std::move(metainfo))); } } return new_task; @@ -233,10 +269,13 @@ class function_input_base : public receiver, no_assign { __TBB_ASSERT(my_max_concurrency != 0, nullptr); if (my_concurrency < my_max_concurrency) { ++my_concurrency; - graph_task * new_task = create_body_task(*(op->elem)); + graph_task* new_task = create_body_task(*(op->elem) + __TBB_FLOW_GRAPH_METAINFO_ARG(*(op->metainfo))); op->bypass_t = new_task; op->status.store(SUCCEEDED, std::memory_order_release); - } else if ( my_queue && my_queue->push(*(op->elem)) ) { + } else if ( my_queue && my_queue->push(*(op->elem) + __TBB_FLOW_GRAPH_METAINFO_ARG(*(op->metainfo))) ) + { op->bypass_t = SUCCESSFULLY_ENQUEUED; op->status.store(SUCCEEDED, std::memory_order_release); } else { @@ -258,8 +297,10 @@ class function_input_base : public receiver, no_assign { } } - graph_task* internal_try_put_bypass( const input_type& t ) { - operation_type op_data(t, tryput_bypass); + graph_task* internal_try_put_bypass( const input_type& t + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { + operation_type op_data(t, tryput_bypass 
__TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); my_aggregator.execute(&op_data); if( op_data.status == SUCCEEDED ) { return op_data.bypass_t; @@ -267,43 +308,75 @@ class function_input_base : public receiver, no_assign { return nullptr; } - graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::true_type ) { + graph_task* try_put_task_base(const input_type& t + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { + if ( my_is_no_throw ) + return try_put_task_impl(t, has_policy() + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); + else + return try_put_task_impl(t, std::false_type() + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); + } + + graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::true_type + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { if( my_max_concurrency == 0 ) { - return apply_body_bypass(t); + return apply_body_bypass(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } else { operation_type check_op(t, occupy_concurrency); my_aggregator.execute(&check_op); if( check_op.status == SUCCEEDED ) { - return apply_body_bypass(t); + return apply_body_bypass(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } - return internal_try_put_bypass(t); + return internal_try_put_bypass(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } } - graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::false_type ) { + graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::false_type + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { if( my_max_concurrency == 0 ) { - return create_body_task(t); + return create_body_task(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } else { - return internal_try_put_bypass(t); + return internal_try_put_bypass(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } } //! Applies the body to the provided input // then decides if more work is available - graph_task* apply_body_bypass( const input_type &i ) { - return static_cast(this)->apply_body_impl_bypass(i); + graph_task* apply_body_bypass( const input_type &i + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + + { + return static_cast(this)->apply_body_impl_bypass(i __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } //! 
allocates a task to apply a body - graph_task* create_body_task( const input_type &input ) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + graph_task* create_body_task( const input_type &input, Metainfo&& metainfo ) +#else + graph_task* create_body_task( const input_type &input ) +#endif + { if (!is_graph_active(my_graph_ref)) { return nullptr; } // TODO revamp: extract helper for common graph task allocation part - small_object_allocator allocator{}; - typedef apply_body_task_bypass task_type; - graph_task* t = allocator.new_object( my_graph_ref, allocator, *this, input, my_priority ); - graph_reference().reserve_wait(); + d1::small_object_allocator allocator{}; + graph_task* t = nullptr; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (!metainfo.empty()) { + using task_type = apply_body_task_bypass; + t = allocator.new_object(my_graph_ref, allocator, *this, input, my_priority, std::forward(metainfo)); + } else +#endif + { + using task_type = apply_body_task_bypass; + t = allocator.new_object(my_graph_ref, allocator, *this, input, my_priority); + } return t; } @@ -327,10 +400,9 @@ class function_input_base : public receiver, no_assign { if (!is_graph_active(my_graph_ref)) { return nullptr; } - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass task_type; graph_task* t = allocator.new_object( graph_reference(), allocator, *this, my_priority ); - graph_reference().reserve_wait(); return t; } @@ -398,7 +470,9 @@ class function_input : public function_input_base::emit_this(g,t,p); } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static graph_task* emit_this(graph& g, const TupleType& t, PortsType& p, + const message_metainfo& metainfo) + { + // TODO: consider to collect all the tasks in task_list and spawn them all at once + graph_task* last_task = std::get(p).try_put_task(std::get(t), metainfo); + check_task_and_spawn(g, last_task); + return emit_element::emit_this(g, t, p, metainfo); + } +#endif }; template<> @@ -588,6 +676,17 @@ struct emit_element<1> { check_task_and_spawn(g, last_task); return SUCCESSFULLY_ENQUEUED; } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static graph_task* emit_this(graph& g, const TupleType& t, PortsType& ports, + const message_metainfo& metainfo) + { + graph_task* last_task = std::get<0>(ports).try_put_task(std::get<0>(t), metainfo); + check_task_and_spawn(g, last_task); + return SUCCESSFULLY_ENQUEUED; + } +#endif }; //! Implements methods for an executable node that takes continue_msg as input @@ -654,18 +753,25 @@ class continue_input : public continue_receiver { virtual broadcast_cache &successors() = 0; friend class apply_body_task_bypass< class_type, continue_msg >; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + friend class apply_body_task_bypass< class_type, continue_msg, trackable_messages_graph_task >; +#endif //! 
Applies the body to the provided input - graph_task* apply_body_bypass( input_type ) { + graph_task* apply_body_bypass( input_type __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo) ) { // There is an extra copied needed to capture the // body execution without the try_put fgt_begin_body( my_body ); output_type v = (*my_body)( continue_msg() ); fgt_end_body( my_body ); - return successors().try_put_task( v ); + return successors().try_put_task( v __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo) ); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* execute(const message_metainfo& metainfo) override { +#else graph_task* execute() override { +#endif if(!is_graph_active(my_graph_ref)) { return nullptr; } @@ -677,13 +783,21 @@ class continue_input : public continue_receiver { #if _MSC_VER && !__INTEL_COMPILER #pragma warning (pop) #endif - return apply_body_bypass( continue_msg() ); + return apply_body_bypass( continue_msg() __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo) ); } else { - small_object_allocator allocator{}; - typedef apply_body_task_bypass task_type; - graph_task* t = allocator.new_object( graph_reference(), allocator, *this, continue_msg(), my_priority ); - graph_reference().reserve_wait(); + d1::small_object_allocator allocator{}; + graph_task* t = nullptr; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (!metainfo.empty()) { + using task_type = apply_body_task_bypass; + t = allocator.new_object( graph_reference(), allocator, *this, continue_msg(), my_priority, metainfo ); + } else +#endif + { + using task_type = apply_body_task_bypass; + t = allocator.new_object( graph_reference(), allocator, *this, continue_msg(), my_priority ); + } return t; } } @@ -755,6 +869,12 @@ class multifunction_output : public function_output { return my_successors.try_put_task(i); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const output_type& i, const message_metainfo& metainfo) { + return my_successors.try_put_task(i, metainfo); + } +#endif + template friend struct emit_element; }; // multifunction_output diff --git a/include/oneapi/tbb/detail/_flow_graph_node_set_impl.h b/include/oneapi/tbb/detail/_flow_graph_node_set_impl.h index ce867121f9..8440bd7008 100644 --- a/include/oneapi/tbb/detail/_flow_graph_node_set_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_node_set_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2021 Intel Corporation + Copyright (c) 2020-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #error Do not #include this internal file directly; use public TBB headers instead. #endif -// Included in namespace tbb::detail::d1 (in flow_graph.h) +// Included in namespace tbb::detail::d2 (in flow_graph.h) #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET // Visual Studio 2019 reports an error while calling predecessor_selector::get and successor_selector::get diff --git a/include/oneapi/tbb/detail/_flow_graph_nodes_deduction.h b/include/oneapi/tbb/detail/_flow_graph_nodes_deduction.h index 8c20993795..47ecfb2a84 100644 --- a/include/oneapi/tbb/detail/_flow_graph_nodes_deduction.h +++ b/include/oneapi/tbb/detail/_flow_graph_nodes_deduction.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
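// Aside: the deduction helpers changed below specialize a trait on
// decltype(&Body::operator()) so that class template argument deduction can
// recover a node's Input and Output types from a lambda. A reduced sketch of
// the technique (callable_io and input_of are illustrative names, much
// simpler than the real body_types machinery):
template <typename Input, typename Output>
struct declare_io { using input_type = Input; using output_type = Output; };

template <typename Callable>
struct callable_io; // primary template: not a recognized callable

template <typename B, typename Input, typename Output>
struct callable_io<Output (B::*)(const Input&) const> : declare_io<Input, Output> {};

template <typename Body>
using input_of = typename callable_io<decltype(&Body::operator())>::input_type;

// For `auto body = [](const int& x) { return x * 2.0; };`,
// input_of<decltype(body)> is int, which a deduction guide can then use.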
@@ -21,7 +21,7 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { template struct declare_body_types { @@ -51,10 +51,10 @@ template struct body_types : declare_body_types {}; template -struct body_types : declare_body_types {}; +struct body_types : declare_body_types {}; template -struct body_types : declare_body_types {}; +struct body_types : declare_body_types {}; template struct body_types : declare_body_types {}; @@ -63,7 +63,7 @@ template struct body_types : declare_body_types {}; template -struct body_types : declare_body_types {}; +struct body_types : declare_body_types {}; template using input_t = typename body_types::input_type; @@ -100,7 +100,7 @@ decltype(decide_on_operator_overload(std::declval())) decide_on_callable_t template input_node(GraphOrSet&&, Body) ->input_node(0))>>; - + #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template @@ -268,7 +268,7 @@ template write_once_node(const NodeSet&) ->write_once_node>; #endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET -} // namespace d1 +} // namespace d2 } // namespace detail } // namespace tbb diff --git a/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h b/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h index 0d9de17654..0f7c0d174f 100644 --- a/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -30,32 +30,88 @@ // elements in the table are a simple list; we need pointer to next element to // traverse the chain -template -struct buffer_element_type { - // the second parameter below is void * because we can't forward-declare the type - // itself, so we just reinterpret_cast below. 
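// Aside: the following hunk replaces the anonymous aligned storage with a
// small element type that owns construction and destruction of the stored
// value. A compilable sketch of that raw-storage technique, independent of
// the TBB types (chain_element is an illustrative name):
#include <new>
#include <utility>

template <typename V>
struct chain_element {
    alignas(V) unsigned char storage[sizeof(V)];
    chain_element* next = nullptr;

    V* value_ptr() { return reinterpret_cast<V*>(storage); }

    void create(const V& v) { ::new (static_cast<void*>(storage)) V(v); }
    // Used when rehashing: move the value out of another element.
    void create(chain_element&& other) {
        ::new (static_cast<void*>(storage)) V(std::move(*other.value_ptr()));
    }
    // Storage is reused through a free list, so destruction is explicit.
    void destroy() { value_ptr()->~V(); }
};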
- typedef typename aligned_pair::type type; + +template +struct hash_buffer_element : public aligned_pair { + using key_type = Key; + using value_type = ValueType; + + value_type* get_value_ptr() { return reinterpret_cast(this->first); } + hash_buffer_element* get_next() { return reinterpret_cast(this->second); } + void set_next(hash_buffer_element* new_next) { this->second = reinterpret_cast(new_next); } + + void create_element(const value_type& v) { + ::new(this->first) value_type(v); + } + + void create_element(hash_buffer_element&& other) { + ::new(this->first) value_type(std::move(*other.get_value_ptr())); + } + + void destroy_element() { + get_value_ptr()->~value_type(); + } +}; + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +template +struct metainfo_hash_buffer_element : public aligned_triple { + using key_type = Key; + using value_type = ValueType; + + value_type* get_value_ptr() { return reinterpret_cast(this->first); } + metainfo_hash_buffer_element* get_next() { + return reinterpret_cast(this->second); + } + void set_next(metainfo_hash_buffer_element* new_next) { this->second = reinterpret_cast(new_next); } + message_metainfo& get_metainfo() { return this->third; } + + void create_element(const value_type& v, const message_metainfo& metainfo) { + __TBB_ASSERT(this->third.empty(), nullptr); + ::new(this->first) value_type(v); + this->third = metainfo; + + for (auto waiter : metainfo.waiters()) { + waiter->reserve(1); + } + } + + void create_element(metainfo_hash_buffer_element&& other) { + __TBB_ASSERT(this->third.empty(), nullptr); + ::new(this->first) value_type(std::move(*other.get_value_ptr())); + this->third = std::move(other.get_metainfo()); + } + + void destroy_element() { + get_value_ptr()->~value_type(); + + for (auto waiter : get_metainfo().waiters()) { + waiter->release(1); + } + get_metainfo() = message_metainfo{}; + } }; +#endif template < - typename Key, // type of key within ValueType - typename ValueType, + typename ElementType, typename ValueToKey, // abstract method that returns "const Key" or "const Key&" given ValueType typename HashCompare, // has hash and equal - typename Allocator=tbb::cache_aligned_allocator< typename aligned_pair::type > + typename Allocator=tbb::cache_aligned_allocator > -class hash_buffer : public HashCompare { +class hash_buffer_impl : public HashCompare { public: static const size_t INITIAL_SIZE = 8; // initial size of the hash pointer table - typedef ValueType value_type; - typedef typename buffer_element_type< value_type >::type element_type; + typedef typename ElementType::key_type key_type; + typedef typename ElementType::value_type value_type; + typedef ElementType element_type; typedef value_type *pointer_type; typedef element_type *list_array_type; // array we manage manually typedef list_array_type *pointer_array_type; typedef typename std::allocator_traits::template rebind_alloc pointer_array_allocator_type; typedef typename std::allocator_traits::template rebind_alloc elements_array_allocator; - typedef typename std::decay::type Knoref; + typedef typename std::decay::type Knoref; private: ValueToKey *my_key; @@ -69,9 +125,9 @@ class hash_buffer : public HashCompare { void set_up_free_list( element_type **p_free_list, list_array_type la, size_t sz) { for(size_t i=0; i < sz - 1; ++i ) { // construct free list - la[i].second = &(la[i+1]); + la[i].set_next(&(la[i + 1])); } - la[sz-1].second = nullptr; + la[sz - 1].set_next(nullptr); *p_free_list = (element_type *)&(la[0]); } @@ -101,15 +157,18 @@ class hash_buffer : public 
HashCompare { { DoCleanup my_cleanup(new_pointer_array, new_elements_array, new_size); new_elements_array = elements_array_allocator().allocate(my_size); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (std::size_t i = 0; i < my_size; ++i) { + ::new(new_elements_array + i) element_type(); + } +#endif new_pointer_array = pointer_array_allocator_type().allocate(new_size); for(size_t i=0; i < new_size; ++i) new_pointer_array[i] = nullptr; set_up_free_list(&new_free_list, new_elements_array, my_size ); for(size_t i=0; i < my_size; ++i) { - for( element_type* op = pointer_array[i]; op; op = (element_type *)(op->second)) { - value_type *ov = reinterpret_cast(&(op->first)); - // could have std::move semantics - internal_insert_with_key(new_pointer_array, new_size, new_free_list, *ov); + for( element_type* op = pointer_array[i]; op; op = (element_type *)(op->get_next())) { + internal_insert_with_key(new_pointer_array, new_size, new_free_list, std::move(*op)); } } my_cleanup.my_pa = nullptr; @@ -126,15 +185,26 @@ class hash_buffer : public HashCompare { // v should have perfect forwarding if std::move implemented. // we use this method to move elements in grow_array, so can't use class fields + template + const value_type& get_value_from_pack(const Value& value, const Args&...) { + return value; + } + + template + const value_type& get_value_from_pack(Element&& element) { + return *(element.get_value_ptr()); + } + + template void internal_insert_with_key( element_type **p_pointer_array, size_t p_sz, list_array_type &p_free_list, - const value_type &v) { + Args&&... args) { size_t l_mask = p_sz-1; __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); - size_t h = this->hash(tbb::detail::invoke(*my_key, v)) & l_mask; + size_t h = this->hash(tbb::detail::invoke(*my_key, get_value_from_pack(args...))) & l_mask; __TBB_ASSERT(p_free_list, "Error: free list not set up."); - element_type* my_elem = p_free_list; p_free_list = (element_type *)(p_free_list->second); - (void) new(&(my_elem->first)) value_type(v); - my_elem->second = p_pointer_array[h]; + element_type* my_elem = p_free_list; p_free_list = (element_type *)(p_free_list->get_next()); + my_elem->create_element(std::forward(args)...); + my_elem->set_next(p_pointer_array[h]); p_pointer_array[h] = my_elem; } @@ -142,6 +212,11 @@ class hash_buffer : public HashCompare { pointer_array = pointer_array_allocator_type().allocate(my_size); for(size_t i = 0; i < my_size; ++i) pointer_array[i] = nullptr; elements_array = elements_array_allocator().allocate(my_size / 2); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (std::size_t i = 0; i < my_size / 2; ++i) { + ::new(elements_array + i) element_type(); + } +#endif set_up_free_list(&free_list, elements_array, my_size / 2); } @@ -151,13 +226,8 @@ class hash_buffer : public HashCompare { for(size_t i = 0; i < sz; ++i ) { element_type *p_next; for( element_type *p = pa[i]; p; p = p_next) { - p_next = (element_type *)p->second; - // TODO revamp: make sure type casting is correct. - void* ptr = (void*)(p->first); -#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER - suppress_unused_warning(ptr); -#endif - ((value_type*)ptr)->~value_type(); + p_next = p->get_next(); + p->destroy_element(); } } pointer_array_allocator_type().deallocate(pa, sz); @@ -166,6 +236,11 @@ class hash_buffer : public HashCompare { // Separate test (if allocation of pa throws, el may be allocated. // but no elements will be constructed.) 
if(el) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (std::size_t i = 0; i < sz / 2; ++i) { + (el + i)->~element_type(); + } +#endif elements_array_allocator().deallocate(el, sz / 2); el = nullptr; } @@ -174,17 +249,17 @@ class hash_buffer : public HashCompare { } public: - hash_buffer() : my_key(nullptr), my_size(INITIAL_SIZE), nelements(0) { + hash_buffer_impl() : my_key(nullptr), my_size(INITIAL_SIZE), nelements(0) { internal_initialize_buffer(); } - ~hash_buffer() { + ~hash_buffer_impl() { internal_free_buffer(pointer_array, elements_array, my_size, nelements); delete my_key; my_key = nullptr; } - hash_buffer(const hash_buffer&) = delete; - hash_buffer& operator=(const hash_buffer&) = delete; + hash_buffer_impl(const hash_buffer_impl&) = delete; + hash_buffer_impl& operator=(const hash_buffer_impl&) = delete; void reset() { internal_free_buffer(pointer_array, elements_array, my_size, nelements); @@ -197,34 +272,41 @@ class hash_buffer : public HashCompare { // pointer is used to clone() ValueToKey* get_key_func() { return my_key; } - bool insert_with_key(const value_type &v) { - pointer_type p = nullptr; + template + bool insert_with_key(const value_type &v, Args&&... args) { + element_type* p = nullptr; __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); - if(find_ref_with_key(tbb::detail::invoke(*my_key, v), p)) { - p->~value_type(); - (void) new(p) value_type(v); // copy-construct into the space + if(find_element_ref_with_key(tbb::detail::invoke(*my_key, v), p)) { + p->destroy_element(); + p->create_element(v, std::forward(args)...); return false; } ++nelements; if(nelements*2 > my_size) grow_array(); - internal_insert_with_key(pointer_array, my_size, free_list, v); + internal_insert_with_key(pointer_array, my_size, free_list, v, std::forward(args)...); return true; } - // returns true and sets v to array element if found, else returns false. - bool find_ref_with_key(const Knoref& k, pointer_type &v) { + bool find_element_ref_with_key(const Knoref& k, element_type*& v) { size_t i = this->hash(k) & mask(); - for(element_type* p = pointer_array[i]; p; p = (element_type *)(p->second)) { - pointer_type pv = reinterpret_cast(&(p->first)); + for(element_type* p = pointer_array[i]; p; p = (element_type *)(p->get_next())) { __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); - if(this->equal(tbb::detail::invoke(*my_key, *pv), k)) { - v = pv; + if(this->equal(tbb::detail::invoke(*my_key, *p->get_value_ptr()), k)) { + v = p; return true; } } return false; } + // returns true and sets v to array element if found, else returns false. 
+ bool find_ref_with_key(const Knoref& k, pointer_type &v) { + element_type* element_ptr = nullptr; + bool res = find_element_ref_with_key(k, element_ptr); + v = element_ptr->get_value_ptr(); + return res; + } + bool find_with_key( const Knoref& k, value_type &v) { value_type *p; if(find_ref_with_key(k, p)) { @@ -238,14 +320,14 @@ class hash_buffer : public HashCompare { void delete_with_key(const Knoref& k) { size_t h = this->hash(k) & mask(); element_type* prev = nullptr; - for(element_type* p = pointer_array[h]; p; prev = p, p = (element_type *)(p->second)) { - value_type *vp = reinterpret_cast(&(p->first)); + for(element_type* p = pointer_array[h]; p; prev = p, p = (element_type *)(p->get_next())) { + value_type *vp = p->get_value_ptr(); __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); if(this->equal(tbb::detail::invoke(*my_key, *vp), k)) { - vp->~value_type(); - if(prev) prev->second = p->second; - else pointer_array[h] = (element_type *)(p->second); - p->second = free_list; + p->destroy_element(); + if(prev) prev->set_next(p->get_next()); + else pointer_array[h] = (element_type *)(p->get_next()); + p->set_next(free_list); free_list = p; --nelements; return; @@ -254,4 +336,45 @@ class hash_buffer : public HashCompare { __TBB_ASSERT(false, "key not found for delete"); } }; + +template + < + typename Key, // type of key within ValueType + typename ValueType, + typename ValueToKey, // abstract method that returns "const Key" or "const Key&" given ValueType + typename HashCompare, // has hash and equal + typename Allocator=tbb::cache_aligned_allocator> + > +using hash_buffer = hash_buffer_impl, + ValueToKey, HashCompare, Allocator>; + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +template + < + typename Key, // type of key within ValueType + typename ValueType, + typename ValueToKey, // abstract method that returns "const Key" or "const Key&" given ValueType + typename HashCompare, // has hash and equal + typename Allocator=tbb::cache_aligned_allocator> + > +struct metainfo_hash_buffer : public hash_buffer_impl, + ValueToKey, HashCompare, Allocator> +{ +private: + using base_type = hash_buffer_impl, + ValueToKey, HashCompare, Allocator>; +public: + bool find_with_key(const typename base_type::Knoref& k, + typename base_type::value_type& v, message_metainfo& metainfo) + { + typename base_type::element_type* p = nullptr; + bool result = this->find_element_ref_with_key(k, p); + if (result) { + v = *(p->get_value_ptr()); + metainfo = p->get_metainfo(); + } + return result; + } +}; +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT #endif // __TBB__flow_graph_hash_buffer_impl_H diff --git a/include/oneapi/tbb/detail/_flow_graph_trace_impl.h b/include/oneapi/tbb/detail/_flow_graph_trace_impl.h index a161dd0362..74ebf08456 100644 --- a/include/oneapi/tbb/detail/_flow_graph_trace_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_trace_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
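// Aside: hash_buffer_impl above is a separately chained hash table whose
// bucket count stays a power of two, so the bucket index is a bit mask
// rather than a modulo. A minimal sketch of that addressing scheme
// (simplified; the real buffer also manages a free list and rehashing):
#include <cstddef>
#include <functional>
#include <vector>

template <typename K, typename V>
class masked_chain_map {
    struct node { K key; V value; node* next; };
    std::vector<node*> buckets = std::vector<node*>(8, nullptr); // 8 == INITIAL_SIZE

    std::size_t mask() const { return buckets.size() - 1; }
    std::size_t index(const K& k) const { return std::hash<K>{}(k) & mask(); }

public:
    void insert(const K& k, const V& v) {
        std::size_t h = index(k);
        buckets[h] = new node{k, v, buckets[h]}; // push at chain head
    }
    V* find(const K& k) {
        for (node* p = buckets[index(k)]; p; p = p->next)
            if (p->key == k) return &p->value;
        return nullptr;
    }
    ~masked_chain_map() {
        for (node* p : buckets)
            while (p) { node* n = p->next; delete p; p = n; }
    }
};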
@@ -24,7 +24,7 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { template< typename T > class sender; template< typename T > class receiver; @@ -44,29 +44,29 @@ template< typename T > class receiver; static inline void fgt_alias_port(void *node, void *p, bool visible) { if(visible) - itt_relation_add( ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_NODE ); + itt_relation_add( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_NODE ); else - itt_relation_add( ITT_DOMAIN_FLOW, p, FLOW_NODE, __itt_relation_is_child_of, node, FLOW_NODE ); + itt_relation_add( d1::ITT_DOMAIN_FLOW, p, FLOW_NODE, __itt_relation_is_child_of, node, FLOW_NODE ); } static inline void fgt_composite ( void* codeptr, void *node, void *graph ) { - itt_make_task_group( ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_COMPOSITE_NODE ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_COMPOSITE_NODE ); suppress_unused_warning( codeptr ); #if __TBB_FLOW_TRACE_CODEPTR if (codeptr != nullptr) { - register_node_addr(ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr); + register_node_addr(d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr); } #endif } static inline void fgt_internal_alias_input_port( void *node, void *p, string_resource_index name_index ) { - itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index ); - itt_relation_add( ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_INPUT_PORT ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index ); + itt_relation_add( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_INPUT_PORT ); } static inline void fgt_internal_alias_output_port( void *node, void *p, string_resource_index name_index ) { - itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index ); - itt_relation_add( ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_OUTPUT_PORT ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index ); + itt_relation_add( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_OUTPUT_PORT ); } template @@ -109,15 +109,15 @@ struct fgt_internal_output_alias_helper { }; static inline void fgt_internal_create_input_port( void *node, void *p, string_resource_index name_index ) { - itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index ); } static inline void fgt_internal_create_output_port( void* codeptr, void *node, void *p, string_resource_index name_index ) { - itt_make_task_group(ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index); + itt_make_task_group(d1::ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index); suppress_unused_warning( codeptr ); #if __TBB_FLOW_TRACE_CODEPTR if (codeptr != nullptr) { - register_node_addr(ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr); + register_node_addr(d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr); } #endif } @@ -167,40 +167,40 @@ struct fgt_internal_output_helper { template< typename NodeType > void fgt_multioutput_node_desc( const NodeType *node, const char *desc ) { void *addr = (void *)( static_cast< receiver< typename NodeType::input_type > * >(const_cast< NodeType *>(node)) ); - itt_metadata_str_add( ITT_DOMAIN_FLOW, 
addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); + itt_metadata_str_add( d1::ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); } template< typename NodeType > void fgt_multiinput_multioutput_node_desc( const NodeType *node, const char *desc ) { void *addr = const_cast(node); - itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); + itt_metadata_str_add( d1::ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); } template< typename NodeType > static inline void fgt_node_desc( const NodeType *node, const char *desc ) { void *addr = (void *)( static_cast< sender< typename NodeType::output_type > * >(const_cast< NodeType *>(node)) ); - itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); + itt_metadata_str_add( d1::ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); } static inline void fgt_graph_desc( const void *g, const char *desc ) { void *addr = const_cast< void *>(g); - itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_GRAPH, FLOW_OBJECT_NAME, desc ); + itt_metadata_str_add( d1::ITT_DOMAIN_FLOW, addr, FLOW_GRAPH, FLOW_OBJECT_NAME, desc ); } static inline void fgt_body( void *node, void *body ) { - itt_relation_add( ITT_DOMAIN_FLOW, body, FLOW_BODY, __itt_relation_is_child_of, node, FLOW_NODE ); + itt_relation_add( d1::ITT_DOMAIN_FLOW, body, FLOW_BODY, __itt_relation_is_child_of, node, FLOW_NODE ); } template< int N, typename PortsTuple > static inline void fgt_multioutput_node(void* codeptr, string_resource_index t, void *g, void *input_port, PortsTuple &ports ) { - itt_make_task_group( ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t ); fgt_internal_create_input_port( input_port, input_port, FLOW_INPUT_PORT_0 ); fgt_internal_output_helper::register_port(codeptr, input_port, ports ); } template< int N, typename PortsTuple > static inline void fgt_multioutput_node_with_body( void* codeptr, string_resource_index t, void *g, void *input_port, PortsTuple &ports, void *body ) { - itt_make_task_group( ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t ); fgt_internal_create_input_port( input_port, input_port, FLOW_INPUT_PORT_0 ); fgt_internal_output_helper::register_port( codeptr, input_port, ports ); fgt_body( input_port, body ); @@ -208,28 +208,28 @@ static inline void fgt_multioutput_node_with_body( void* codeptr, string_resourc template< int N, typename PortsTuple > static inline void fgt_multiinput_node( void* codeptr, string_resource_index t, void *g, PortsTuple &ports, void *output_port) { - itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); fgt_internal_create_output_port( codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 ); fgt_internal_input_helper::register_port( output_port, ports ); } static inline void fgt_multiinput_multioutput_node( void* codeptr, string_resource_index t, void *n, void *g ) { - itt_make_task_group( ITT_DOMAIN_FLOW, n, FLOW_NODE, g, FLOW_GRAPH, t ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, n, FLOW_NODE, g, FLOW_GRAPH, t ); suppress_unused_warning( codeptr ); #if __TBB_FLOW_TRACE_CODEPTR if (codeptr != nullptr) { - register_node_addr(ITT_DOMAIN_FLOW, n, FLOW_NODE, CODE_ADDRESS, &codeptr); + register_node_addr(d1::ITT_DOMAIN_FLOW, n, FLOW_NODE, CODE_ADDRESS, &codeptr); } #endif } 
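// Aside: fgt_begin_body()/fgt_end_body() below bracket a body execution for
// profiling tools, and call sites pair them by hand. A hypothetical RAII
// wrapper illustrating the pairing these hooks assume (the real code calls
// the hooks directly, not through a guard):
struct body_trace_guard {
    void* body;
    explicit body_trace_guard(void* b) : body(b) { /* fgt_begin_body(body); */ }
    ~body_trace_guard() { /* fgt_end_body(body); */ }
};
// usage: { body_trace_guard guard(my_body); output = (*my_body)(input); }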
static inline void fgt_node( void* codeptr, string_resource_index t, void *g, void *output_port ) { - itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); fgt_internal_create_output_port( codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 ); } static void fgt_node_with_body( void* codeptr, string_resource_index t, void *g, void *output_port, void *body ) { - itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); fgt_internal_create_output_port(codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 ); fgt_body( output_port, body ); } @@ -251,47 +251,47 @@ static inline void fgt_node( void* codeptr, string_resource_index t, void *g, v } static inline void fgt_make_edge( void *output_port, void *input_port ) { - itt_relation_add( ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_predecessor_to, input_port, FLOW_INPUT_PORT); + itt_relation_add( d1::ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_predecessor_to, input_port, FLOW_INPUT_PORT); } static inline void fgt_remove_edge( void *output_port, void *input_port ) { - itt_relation_add( ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_sibling_of, input_port, FLOW_INPUT_PORT); + itt_relation_add( d1::ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_sibling_of, input_port, FLOW_INPUT_PORT); } static inline void fgt_graph( void *g ) { - itt_make_task_group( ITT_DOMAIN_FLOW, g, FLOW_GRAPH, nullptr, FLOW_NULL, FLOW_GRAPH ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, g, FLOW_GRAPH, nullptr, FLOW_NULL, FLOW_GRAPH ); } static inline void fgt_begin_body( void *body ) { - itt_task_begin( ITT_DOMAIN_FLOW, body, FLOW_BODY, nullptr, FLOW_NULL, FLOW_BODY ); + itt_task_begin( d1::ITT_DOMAIN_FLOW, body, FLOW_BODY, nullptr, FLOW_NULL, FLOW_BODY ); } static inline void fgt_end_body( void * ) { - itt_task_end( ITT_DOMAIN_FLOW ); + itt_task_end( d1::ITT_DOMAIN_FLOW ); } static inline void fgt_async_try_put_begin( void *node, void *port ) { - itt_task_begin( ITT_DOMAIN_FLOW, port, FLOW_OUTPUT_PORT, node, FLOW_NODE, FLOW_OUTPUT_PORT ); + itt_task_begin( d1::ITT_DOMAIN_FLOW, port, FLOW_OUTPUT_PORT, node, FLOW_NODE, FLOW_OUTPUT_PORT ); } static inline void fgt_async_try_put_end( void *, void * ) { - itt_task_end( ITT_DOMAIN_FLOW ); + itt_task_end( d1::ITT_DOMAIN_FLOW ); } static inline void fgt_async_reserve( void *node, void *graph ) { - itt_region_begin( ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_NULL ); + itt_region_begin( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_NULL ); } static inline void fgt_async_commit( void *node, void * /*graph*/) { - itt_region_end( ITT_DOMAIN_FLOW, node, FLOW_NODE ); + itt_region_end( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE ); } static inline void fgt_reserve_wait( void *graph ) { - itt_region_begin( ITT_DOMAIN_FLOW, graph, FLOW_GRAPH, nullptr, FLOW_NULL, FLOW_NULL ); + itt_region_begin( d1::ITT_DOMAIN_FLOW, graph, FLOW_GRAPH, nullptr, FLOW_NULL, FLOW_NULL ); } static inline void fgt_release_wait( void *graph ) { - itt_region_end( ITT_DOMAIN_FLOW, graph, FLOW_GRAPH ); + itt_region_end( d1::ITT_DOMAIN_FLOW, graph, FLOW_GRAPH ); } #else // TBB_USE_PROFILING_TOOLS @@ -357,7 +357,7 @@ struct fgt_internal_output_alias_helper { #endif // TBB_USE_PROFILING_TOOLS -} // d1 +} // d2 } // namespace detail } // 
namespace tbb diff --git a/include/oneapi/tbb/detail/_flow_graph_types_impl.h b/include/oneapi/tbb/detail/_flow_graph_types_impl.h index 4827551d85..e361b23e7b 100644 --- a/include/oneapi/tbb/detail/_flow_graph_types_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_types_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #error Do not #include this internal file directly; use public TBB headers instead. #endif -// included in namespace tbb::detail::d1 +// included in namespace tbb::detail::d2 // the change to key_matching (adding a K and KHash template parameter, making it a class) // means we have to pass this data to the key_matching_port. All the ports have only one @@ -73,40 +73,55 @@ struct make_sequence < 0, S... > { typedef sequence type; }; -//! type mimicking std::pair but with trailing fill to ensure each element of an array -//* will have the correct alignment -template -struct type_plus_align { - char first[sizeof(T1)]; - T2 second; - char fill1[REM]; +template struct alignment_of { + typedef struct { char t; U padded; } test_alignment; + static const size_t value = sizeof(test_alignment) - sizeof(U); }; -template -struct type_plus_align { - char first[sizeof(T1)]; - T2 second; +template +struct max_alignment_helper; + +template +struct max_alignment_helper { + using type = typename max_alignment_helper::type>::type; }; -template struct alignment_of { - typedef struct { char t; U padded; } test_alignment; - static const size_t value = sizeof(test_alignment) - sizeof(U); +template +struct max_alignment_helper { + using type = typename std::conditional::type; }; +template +using max_alignment_helper_t = typename max_alignment_helper::type; + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma warning(push) +#pragma warning(disable: 4324) // warning C4324: structure was padded due to alignment specifier +#endif + // T1, T2 are actual types stored. The space defined for T1 in the type returned // is a char array of the correct size. Type T2 should be trivially-constructible, // T1 must be explicitly managed. -template -struct aligned_pair { - static const size_t t1_align = alignment_of::value; - static const size_t t2_align = alignment_of::value; - typedef type_plus_align just_pair; - static const size_t max_align = t1_align < t2_align ? t2_align : t1_align; - static const size_t extra_bytes = sizeof(just_pair) % max_align; - static const size_t remainder = extra_bytes ? 
max_align - extra_bytes : 0; -public: - typedef type_plus_align type; -}; // aligned_pair + +template +struct alignas(alignof(max_alignment_helper_t)) aligned_pair { + char first[sizeof(T1)]; + T2 second; +}; + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +template +struct alignas(alignof(max_alignment_helper_t)) aligned_triple { + char first[sizeof(T1)]; + T2 second; + T3 third; +}; +#endif + + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma warning(pop) // warning 4324 is back +#endif // support for variant type // type we use when we're not storing a value diff --git a/include/oneapi/tbb/detail/_pipeline_filters.h b/include/oneapi/tbb/detail/_pipeline_filters.h index 46e7b95d6c..8121946729 100644 --- a/include/oneapi/tbb/detail/_pipeline_filters.h +++ b/include/oneapi/tbb/detail/_pipeline_filters.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -32,6 +32,12 @@ namespace d1 { class base_filter; } +namespace d2 { +template +__TBB_requires(std::copyable) +class input_node; +} + namespace r1 { TBB_EXPORT void __TBB_EXPORTED_FUNC set_end_of_input(d1::base_filter&); class pipeline; @@ -131,7 +137,7 @@ class flow_control { template friend class concrete_filter; template __TBB_requires(std::copyable) - friend class input_node; + friend class d2::input_node; public: void stop() { is_pipeline_stopped = true; } }; diff --git a/include/oneapi/tbb/detail/_task.h b/include/oneapi/tbb/detail/_task.h index 636aea97b4..e1bb70c5be 100644 --- a/include/oneapi/tbb/detail/_task.h +++ b/include/oneapi/tbb/detail/_task.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2023 Intel Corporation + Copyright (c) 2020-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -43,6 +43,13 @@ class task; class wait_context; class task_group_context; struct execution_data; +class wait_tree_vertex_interface; +class task_arena_base; +} + +namespace d2 { +class task_group; +class task_group_base; } namespace r1 { @@ -52,7 +59,9 @@ TBB_EXPORT void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& c TBB_EXPORT void __TBB_EXPORTED_FUNC execute_and_wait(d1::task& t, d1::task_group_context& t_ctx, d1::wait_context&, d1::task_group_context& w_ctx); TBB_EXPORT void __TBB_EXPORTED_FUNC wait(d1::wait_context&, d1::task_group_context& ctx); TBB_EXPORT d1::slot_id __TBB_EXPORTED_FUNC execution_slot(const d1::execution_data*); +TBB_EXPORT d1::slot_id __TBB_EXPORTED_FUNC execution_slot(const d1::task_arena_base&); TBB_EXPORT d1::task_group_context* __TBB_EXPORTED_FUNC current_context(); +TBB_EXPORT d1::wait_tree_vertex_interface* get_thread_reference_vertex(d1::wait_tree_vertex_interface* wc); // Do not place under __TBB_RESUMABLE_TASKS. It is a stub for unsupported platforms. 
struct suspend_point_type; @@ -124,8 +133,7 @@ class wait_context { friend class r1::thread_data; friend class r1::task_dispatcher; friend class r1::external_waiter; - friend class task_group; - friend class task_group_base; + friend class wait_context_vertex; friend struct r1::task_arena_impl; friend struct r1::suspend_point_type; public: @@ -147,6 +155,67 @@ class wait_context { } }; +class wait_tree_vertex_interface { +public: + virtual void reserve(std::uint32_t delta = 1) = 0; + virtual void release(std::uint32_t delta = 1) = 0; + +protected: + virtual ~wait_tree_vertex_interface() = default; +}; + +class wait_context_vertex : public wait_tree_vertex_interface { +public: + wait_context_vertex(std::uint32_t ref = 0) : m_wait(ref) {} + + void reserve(std::uint32_t delta = 1) override { + m_wait.reserve(delta); + } + + void release(std::uint32_t delta = 1) override { + m_wait.release(delta); + } + + wait_context& get_context() { + return m_wait; + } +private: + friend class d2::task_group; + friend class d2::task_group_base; + + bool continue_execution() const { + return m_wait.continue_execution(); + } + + wait_context m_wait; +}; + +class reference_vertex : public wait_tree_vertex_interface { +public: + reference_vertex(wait_tree_vertex_interface* parent, std::uint32_t ref_count) : my_parent{parent}, m_ref_count{ref_count} + {} + + void reserve(std::uint32_t delta = 1) override { + if (m_ref_count.fetch_add(static_cast(delta)) == 0) { + my_parent->reserve(); + } + } + + void release(std::uint32_t delta = 1) override { + std::uint64_t ref = m_ref_count.fetch_sub(static_cast(delta)) - static_cast(delta); + if (ref == 0) { + my_parent->release(); + } + } + + std::uint32_t get_num_child() { + return static_cast(m_ref_count.load(std::memory_order_acquire)); + } +private: + wait_tree_vertex_interface* my_parent; + std::atomic m_ref_count; +}; + struct execution_data { task_group_context* context{}; slot_id original_slot{}; diff --git a/include/oneapi/tbb/detail/_task_handle.h b/include/oneapi/tbb/detail/_task_handle.h index e32154f409..26212b462c 100644 --- a/include/oneapi/tbb/detail/_task_handle.h +++ b/include/oneapi/tbb/detail/_task_handle.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2021 Intel Corporation + Copyright (c) 2020-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -34,7 +34,7 @@ class task_handle; class task_handle_task : public d1::task { std::uint64_t m_version_and_traits{}; - d1::wait_context& m_wait_ctx; + d1::wait_tree_vertex_interface* m_wait_tree_vertex; d1::task_group_context& m_ctx; d1::small_object_allocator m_allocator; public: @@ -46,15 +46,16 @@ class task_handle_task : public d1::task { } } - task_handle_task(d1::wait_context& wo, d1::task_group_context& ctx, d1::small_object_allocator& alloc) - : m_wait_ctx(wo) + task_handle_task(d1::wait_tree_vertex_interface* vertex, d1::task_group_context& ctx, d1::small_object_allocator& alloc) + : m_wait_tree_vertex(vertex) , m_ctx(ctx) , m_allocator(alloc) { suppress_unused_warning(m_version_and_traits); + m_wait_tree_vertex->reserve(); } ~task_handle_task() override { - m_wait_ctx.release(); + m_wait_tree_vertex->release(); } d1::task_group_context& ctx() const { return m_ctx; } diff --git a/include/oneapi/tbb/flow_graph.h b/include/oneapi/tbb/flow_graph.h index 2df4b14050..20916fa7c2 100644 --- a/include/oneapi/tbb/flow_graph.h +++ b/include/oneapi/tbb/flow_graph.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -52,6 +52,7 @@ #include #include +#include #include #if __TBB_CPP20_CONCEPTS_PRESENT #include @@ -70,7 +71,7 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { //! An enumeration the provides the two most common concurrency levels: unlimited and serial enum concurrency { unlimited = 0, serial = 1 }; @@ -81,19 +82,19 @@ struct null_type {}; //! An empty class used for messages that mean "I'm done" class continue_msg {}; -} // namespace d1 +} // namespace d2 #if __TBB_CPP20_CONCEPTS_PRESENT namespace d0 { template -concept node_body_return_type = std::same_as || +concept node_body_return_type = std::same_as || std::convertible_to; // TODO: consider using std::invocable here template concept continue_node_body = std::copy_constructible && - requires( Body& body, const tbb::detail::d1::continue_msg& v ) { + requires( Body& body, const tbb::detail::d2::continue_msg& v ) { { body(v) } -> node_body_return_type; }; @@ -129,7 +130,7 @@ concept async_node_body = std::copy_constructible && } // namespace d0 #endif // __TBB_CPP20_CONCEPTS_PRESENT -namespace d1 { +namespace d2 { //! 
Forward declaration section template< typename T > class sender; @@ -153,7 +154,7 @@ template struct node_set; #endif -} // namespace d1 +} // namespace d2 } // namespace detail } // namespace tbb @@ -162,7 +163,7 @@ template struct node_set; namespace tbb { namespace detail { -namespace d1 { +namespace d2 { static inline std::pair order_tasks(graph_task* first, graph_task* second) { if (second->priority > first->priority) @@ -187,6 +188,37 @@ static inline graph_task* combine_tasks(graph& g, graph_task* left, graph_task* return left; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +class message_metainfo { +public: + using waiters_type = std::forward_list; + + message_metainfo() = default; + + message_metainfo(const waiters_type& waiters) : my_waiters(waiters) {} + message_metainfo(waiters_type&& waiters) : my_waiters(std::move(waiters)) {} + + const waiters_type& waiters() const & { return my_waiters; } + waiters_type&& waiters() && { return std::move(my_waiters); } + + bool empty() const { return my_waiters.empty(); } + + void merge(const message_metainfo& other) { + // TODO: should we avoid duplications on merging + my_waiters.insert_after(my_waiters.before_begin(), + other.waiters().begin(), + other.waiters().end()); + } +private: + waiters_type my_waiters; +}; // class message_metainfo + +#define __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo) , metainfo + +#else +#define __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo) +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! Pure virtual template class that defines a sender of messages of type T template< typename T > class sender { @@ -196,9 +228,17 @@ class sender { //! Request an item from the sender virtual bool try_get( T & ) { return false; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + virtual bool try_get( T &, message_metainfo& ) { return false; } +#endif + //! Reserves an item in the sender virtual bool try_reserve( T & ) { return false; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + virtual bool try_reserve( T &, message_metainfo& ) { return false; } +#endif + //! Releases the reserved item virtual bool try_release( ) { return false; } @@ -238,17 +278,38 @@ bool remove_successor(sender& s, receiver& r) { //! Pure virtual template class that defines a receiver of messages of type T template< typename T > class receiver { +private: + template + bool internal_try_put(const T& t, TryPutTaskArgs&&... args) { + graph_task* res = try_put_task(t, std::forward(args)...); + if (!res) return false; + if (res != SUCCESSFULLY_ENQUEUED) spawn_in_graph_arena(graph_reference(), *res); + return true; + } + public: //! Destructor virtual ~receiver() {} //! Put an item to the receiver bool try_put( const T& t ) { - graph_task *res = try_put_task(t); - if (!res) return false; - if (res != SUCCESSFULLY_ENQUEUED) spawn_in_graph_arena(graph_reference(), *res); - return true; + return internal_try_put(t); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! Put an item to the receiver and wait for completion + bool try_put_and_wait( const T& t ) { + // Since try_put_and_wait is a blocking call, it is safe to create wait_context on stack + d1::wait_context_vertex msg_wait_vertex{}; + + bool res = internal_try_put(t, message_metainfo{message_metainfo::waiters_type{&msg_wait_vertex}}); + if (res) { + __TBB_ASSERT(graph_reference().my_context != nullptr, "No wait_context associated with the Flow Graph"); + wait(msg_wait_vertex.get_context(), *graph_reference().my_context); + } + return res; } +#endif //! 
put item to successor; return task to run the successor if possible. protected: @@ -262,6 +323,9 @@ class receiver { template< typename X, typename Y > friend class broadcast_cache; template< typename X, typename Y > friend class round_robin_cache; virtual graph_task *try_put_task(const T& t) = 0; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + virtual graph_task *try_put_task(const T& t, const message_metainfo&) = 0; +#endif virtual graph& graph_reference() const = 0; template friend class successor_cache; @@ -337,23 +401,61 @@ class continue_receiver : public receiver< continue_msg > { template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; + +private: // execute body is supposed to be too small to create a task for. - graph_task* try_put_task( const input_type & ) override { + graph_task* try_put_task_impl( const input_type& __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo) ) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo predecessor_metainfo; +#endif { spin_mutex::scoped_lock l(my_mutex); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + // Prolong the wait and store the metainfo until receiving signals from all the predecessors + for (auto waiter : metainfo.waiters()) { + waiter->reserve(1); + } + my_current_metainfo.merge(metainfo); +#endif if ( ++my_current_count < my_predecessor_count ) return SUCCESSFULLY_ENQUEUED; - else + else { my_current_count = 0; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + predecessor_metainfo = my_current_metainfo; + my_current_metainfo = message_metainfo{}; +#endif + } + } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* res = execute(predecessor_metainfo); + for (auto waiter : predecessor_metainfo.waiters()) { + waiter->release(1); } +#else graph_task* res = execute(); +#endif return res? res : SUCCESSFULLY_ENQUEUED; } +protected: + graph_task* try_put_task( const input_type& input ) override { + return try_put_task_impl(input __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task( const input_type& input, const message_metainfo& metainfo ) override { + return try_put_task_impl(input, metainfo); + } +#endif + spin_mutex my_mutex; int my_predecessor_count; int my_current_count; int my_initial_predecessor_count; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo my_current_metainfo; +#endif node_priority_t my_priority; // the friend declaration in the base class did not eliminate the "protected class" // error in gcc 4.1.2 @@ -369,7 +471,11 @@ class continue_receiver : public receiver< continue_msg > { //! Does whatever should happen when the threshold is reached /** This should be very fast or else spawn a task. This is called while the sender is blocked in the try_put(). */ +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + virtual graph_task* execute(const message_metainfo& metainfo) = 0; +#else virtual graph_task* execute() = 0; +#endif template friend class successor_cache; bool is_continue_receiver() override { return true; } @@ -392,7 +498,7 @@ class continue_receiver : public receiver< continue_msg > { namespace tbb { namespace detail { -namespace d1 { +namespace d2 { #include "detail/_flow_graph_body_impl.h" #include "detail/_flow_graph_cache_impl.h" @@ -424,7 +530,7 @@ void graph_iterator::internal_forward() { } //! 
Constructs a graph with isolated task_group_context -inline graph::graph() : my_wait_context(0), my_nodes(nullptr), my_nodes_last(nullptr), my_task_arena(nullptr) { +inline graph::graph() : my_wait_context_vertex(0), my_nodes(nullptr), my_nodes_last(nullptr), my_task_arena(nullptr) { prepare_task_arena(); own_context = true; cancelled = false; @@ -435,7 +541,7 @@ inline graph::graph() : my_wait_context(0), my_nodes(nullptr), my_nodes_last(nul } inline graph::graph(task_group_context& use_this_context) : - my_wait_context(0), my_context(&use_this_context), my_nodes(nullptr), my_nodes_last(nullptr), my_task_arena(nullptr) { + my_wait_context_vertex(0), my_context(&use_this_context), my_nodes(nullptr), my_nodes_last(nullptr), my_task_arena(nullptr) { prepare_task_arena(); own_context = false; cancelled = false; @@ -454,13 +560,13 @@ inline graph::~graph() { } inline void graph::reserve_wait() { - my_wait_context.reserve(); + my_wait_context_vertex.reserve(); fgt_reserve_wait(this); } inline void graph::release_wait() { fgt_release_wait(this); - my_wait_context.release(); + my_wait_context_vertex.release(); } inline void graph::register_node(graph_node *n) { @@ -633,6 +739,18 @@ class input_node : public graph_node, public sender< Output > { } } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +private: + bool try_reserve( output_type& v, message_metainfo& ) override { + return try_reserve(v); + } + + bool try_get( output_type& v, message_metainfo& ) override { + return try_get(v); + } +public: +#endif + //! Release a reserved item. /** true = item has been released and so remains in sender, dest must request or reserve future items */ bool try_release( ) override { @@ -703,7 +821,7 @@ class input_node : public graph_node, public sender< Output > { return false; } if ( !my_has_cached_item ) { - flow_control control; + d1::flow_control control; fgt_begin_body( my_body ); @@ -722,10 +840,9 @@ class input_node : public graph_node, public sender< Output > { } graph_task* create_put_task() { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef input_node_task_bypass< input_node > task_type; graph_task* t = allocator.new_object(my_graph, allocator, *this); - my_graph.reserve_wait(); return t; } @@ -962,6 +1079,14 @@ class split_node : public graph_node, public receiver { // Also, we do not have successors here. So we just tell the task returned here is successful. return emit_element::emit_this(this->my_graph, t, output_ports()); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const TupleType& t, const message_metainfo& metainfo) override { + // Sending split messages in parallel is not justified, as overheads would prevail. + // Also, we do not have successors here. So we just tell the task returned here is successful. 
+ return emit_element::emit_this(this->my_graph, t, output_ports(), metainfo); + } +#endif + void reset_node(reset_flags f) override { if (f & rf_clear_edges) clear_element::clear_this(my_output_ports); @@ -1119,17 +1244,28 @@ class broadcast_node : public graph_node, public receiver, public sender { return true; } +private: + graph_task* try_put_task_impl(const T& t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { + graph_task* new_task = my_successors.try_put_task(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); + if (!new_task) new_task = SUCCESSFULLY_ENQUEUED; + return new_task; + } + protected: template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; //! build a task to run the successor if possible. Default is old behavior. - graph_task *try_put_task(const T& t) override { - graph_task *new_task = my_successors.try_put_task(t); - if (!new_task) new_task = SUCCESSFULLY_ENQUEUED; - return new_task; + graph_task* try_put_task(const T& t) override { + return try_put_task_impl(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T& t, const message_metainfo& metainfo) override { + return try_put_task_impl(t, metainfo); + } +#endif + graph& graph_reference() const override { return my_graph; } @@ -1168,24 +1304,37 @@ class buffer_node }; // implements the aggregator_operation concept - class buffer_operation : public aggregated_operation< buffer_operation > { + class buffer_operation : public d1::aggregated_operation< buffer_operation > { public: char type; T* elem; graph_task* ltask; successor_type *r; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo{ nullptr }; +#endif buffer_operation(const T& e, op_type t) : type(char(t)) , elem(const_cast(&e)) , ltask(nullptr) , r(nullptr) {} + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + buffer_operation(const T& e, op_type t, const message_metainfo& info) + : type(char(t)), elem(const_cast(&e)), ltask(nullptr), r(nullptr) + , metainfo(const_cast(&info)) + {} + + buffer_operation(op_type t, message_metainfo& info) + : type(char(t)), elem(nullptr), ltask(nullptr), r(nullptr), metainfo(&info) {} +#endif buffer_operation(op_type t) : type(char(t)), elem(nullptr), ltask(nullptr), r(nullptr) {} }; bool forwarder_busy; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator< handler_type, buffer_operation> my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator< handler_type, buffer_operation> my_aggregator; virtual void handle_operations(buffer_operation *op_list) { handle_operations_impl(op_list, this); @@ -1218,9 +1367,8 @@ class buffer_node if(is_graph_active(this->my_graph)) { forwarder_busy = true; typedef forward_task_bypass task_type; - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; graph_task* new_task = allocator.new_object(graph_reference(), allocator, *this); - my_graph.reserve_wait(); // tmp should point to the last item handled by the aggregator. This is the operation // the handling thread enqueued. So modifying that record will be okay. 
// TODO revamp: check that the issue is still present @@ -1286,7 +1434,8 @@ class buffer_node } void try_put_and_add_task(graph_task*& last_task) { - graph_task *new_task = my_successors.try_put_task(this->back()); + graph_task* new_task = my_successors.try_put_task(this->back() + __TBB_FLOW_GRAPH_METAINFO_ARG(this->back_metainfo())); if (new_task) { // workaround for icc bug graph& g = this->my_graph; @@ -1328,14 +1477,25 @@ class buffer_node virtual bool internal_push(buffer_operation *op) { __TBB_ASSERT(op->elem, nullptr); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(op->metainfo, nullptr); + this->push_back(*(op->elem), (*op->metainfo)); +#else this->push_back(*(op->elem)); +#endif op->status.store(SUCCEEDED, std::memory_order_release); return true; } virtual void internal_pop(buffer_operation *op) { __TBB_ASSERT(op->elem, nullptr); - if(this->pop_back(*(op->elem))) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool pop_result = op->metainfo ? this->pop_back(*(op->elem), *(op->metainfo)) + : this->pop_back(*(op->elem)); +#else + bool pop_result = this->pop_back(*(op->elem)); +#endif + if (pop_result) { op->status.store(SUCCEEDED, std::memory_order_release); } else { @@ -1345,7 +1505,13 @@ class buffer_node virtual void internal_reserve(buffer_operation *op) { __TBB_ASSERT(op->elem, nullptr); - if(this->reserve_front(*(op->elem))) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool reserve_result = op->metainfo ? this->reserve_front(*(op->elem), *(op->metainfo)) + : this->reserve_front(*(op->elem)); +#else + bool reserve_result = this->reserve_front(*(op->elem)); +#endif + if (reserve_result) { op->status.store(SUCCEEDED, std::memory_order_release); } else { @@ -1403,7 +1569,7 @@ class buffer_node It also calls r.remove_predecessor(*this) to remove this node as a predecessor. */ bool remove_successor( successor_type &r ) override { // TODO revamp: investigate why full qualification is necessary here - tbb::detail::d1::remove_predecessor(r, *this); + tbb::detail::d2::remove_predecessor(r, *this); buffer_operation op_data(rem_succ); op_data.r = &r; my_aggregator.execute(&op_data); @@ -1425,6 +1591,16 @@ class buffer_node return (op_data.status==SUCCEEDED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_get( T &v, message_metainfo& metainfo ) override { + buffer_operation op_data(req_item, metainfo); + op_data.elem = &v; + my_aggregator.execute(&op_data); + (void)enqueue_forwarding_task(op_data); + return (op_data.status==SUCCEEDED); + } +#endif + //! Reserves an item. /** false = no item can be reserved
true = an item is reserved */ @@ -1436,6 +1612,16 @@ class buffer_node return (op_data.status==SUCCEEDED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_reserve( output_type& v, message_metainfo& metainfo ) override { + buffer_operation op_data(res_item, metainfo); + op_data.elem = &v; + my_aggregator.execute(&op_data); + (void)enqueue_forwarding_task(op_data); + return op_data.status==SUCCEEDED; + } +#endif + //! Release a reserved item. /** true = item has been released and so remains in sender */ bool try_release() override { @@ -1454,14 +1640,9 @@ class buffer_node return true; } -protected: - - template< typename R, typename B > friend class run_and_put_task; - template friend class broadcast_cache; - template friend class round_robin_cache; - //! receive an item, return a task *if possible - graph_task *try_put_task(const T &t) override { - buffer_operation op_data(t, put_item); +private: + graph_task* try_put_task_impl(const T& t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { + buffer_operation op_data(t, put_item __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); my_aggregator.execute(&op_data); graph_task *ft = grab_forwarding_task(op_data); // sequencer_nodes can return failure (if an item has been previously inserted) @@ -1479,6 +1660,22 @@ class buffer_node return ft; } +protected: + + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + //! receive an item, return a task *if possible + graph_task *try_put_task(const T &t) override { + return try_put_task_impl(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T& t, const message_metainfo& metainfo) override { + return try_put_task_impl(t, metainfo); + } +#endif + graph& graph_reference() const override { return my_graph; } @@ -1511,7 +1708,9 @@ class queue_node : public buffer_node { } void try_put_and_add_task(graph_task*& last_task) { - graph_task *new_task = this->my_successors.try_put_task(this->front()); + graph_task* new_task = this->my_successors.try_put_task(this->front() + __TBB_FLOW_GRAPH_METAINFO_ARG(this->front_metainfo())); + if (new_task) { // workaround for icc bug graph& graph_ref = this->graph_reference(); @@ -1530,7 +1729,14 @@ class queue_node : public buffer_node { op->status.store(FAILED, std::memory_order_release); } else { - this->pop_front(*(op->elem)); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (op->metainfo) { + this->pop_front(*(op->elem), *(op->metainfo)); + } else +#endif + { + this->pop_front(*(op->elem)); + } op->status.store(SUCCEEDED, std::memory_order_release); } } @@ -1539,7 +1745,15 @@ class queue_node : public buffer_node { op->status.store(FAILED, std::memory_order_release); } else { - this->reserve_front(*(op->elem)); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (op->metainfo) { + this->reserve_front(*(op->elem), *(op->metainfo)); + } + else +#endif + { + this->reserve_front(*(op->elem)); + } op->status.store(SUCCEEDED, std::memory_order_release); } } @@ -1647,7 +1861,13 @@ class sequencer_node : public queue_node { } this->my_tail = new_tail; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(op->metainfo, nullptr); + bool place_item_result = this->place_item(tag, *(op->elem), *(op->metainfo)); + const op_stat res = place_item_result ? SUCCEEDED : FAILED; +#else const op_stat res = this->place_item(tag, *(op->elem)) ? 
SUCCEEDED : FAILED; +#endif op->status.store(res, std::memory_order_release); return res ==SUCCEEDED; } @@ -1710,7 +1930,12 @@ class priority_queue_node : public buffer_node { } bool internal_push(prio_operation *op) override { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(op->metainfo, nullptr); + prio_push(*(op->elem), *(op->metainfo)); +#else prio_push(*(op->elem)); +#endif op->status.store(SUCCEEDED, std::memory_order_release); return true; } @@ -1723,6 +1948,11 @@ class priority_queue_node : public buffer_node { } *(op->elem) = prio(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (op->metainfo) { + *(op->metainfo) = std::move(prio_metainfo()); + } +#endif op->status.store(SUCCEEDED, std::memory_order_release); prio_pop(); @@ -1736,6 +1966,12 @@ class priority_queue_node : public buffer_node { } this->my_reserved = true; *(op->elem) = prio(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (op->metainfo) { + *(op->metainfo) = std::move(prio_metainfo()); + reserved_metainfo = *(op->metainfo); + } +#endif reserved_item = *(op->elem); op->status.store(SUCCEEDED, std::memory_order_release); prio_pop(); @@ -1745,13 +1981,27 @@ class priority_queue_node : public buffer_node { op->status.store(SUCCEEDED, std::memory_order_release); this->my_reserved = false; reserved_item = input_type(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (auto waiter : reserved_metainfo.waiters()) { + waiter->release(1); + } + + reserved_metainfo = message_metainfo{}; +#endif } void internal_release(prio_operation *op) override { op->status.store(SUCCEEDED, std::memory_order_release); - prio_push(reserved_item); + prio_push(reserved_item __TBB_FLOW_GRAPH_METAINFO_ARG(reserved_metainfo)); this->my_reserved = false; reserved_item = input_type(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (auto waiter : reserved_metainfo.waiters()) { + waiter->release(1); + } + + reserved_metainfo = message_metainfo{}; +#endif } private: @@ -1767,7 +2017,8 @@ class priority_queue_node : public buffer_node { } void try_put_and_add_task(graph_task*& last_task) { - graph_task * new_task = this->my_successors.try_put_task(this->prio()); + graph_task* new_task = this->my_successors.try_put_task(this->prio() + __TBB_FLOW_GRAPH_METAINFO_ARG(this->prio_metainfo())); if (new_task) { // workaround for icc bug graph& graph_ref = this->graph_reference(); @@ -1781,6 +2032,9 @@ class priority_queue_node : public buffer_node { size_type mark; input_type reserved_item; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo reserved_metainfo; +#endif // in case a reheap has not been done after a push, check if the mark item is higher than the 0'th item bool prio_use_tail() { @@ -1789,10 +2043,10 @@ class priority_queue_node : public buffer_node { } // prio_push: checks that the item will fit, expand array if necessary, put at end - void prio_push(const T &src) { + void prio_push(const T &src __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { if ( this->my_tail >= this->my_array_size ) this->grow_my_array( this->my_tail + 1 ); - (void) this->place_item(this->my_tail, src); + (void) this->place_item(this->my_tail, src __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); ++(this->my_tail); __TBB_ASSERT(mark < this->my_tail, "mark outside bounds after push"); } @@ -1826,6 +2080,12 @@ class priority_queue_node : public buffer_node { return this->get_my_item(prio_use_tail() ? 
this->my_tail-1 : 0); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo& prio_metainfo() { + return this->get_my_metainfo(prio_use_tail() ? this->my_tail-1 : 0); + } +#endif + // turn array into heap void heapify() { if(this->my_tail == 0) { @@ -1836,7 +2096,10 @@ class priority_queue_node : public buffer_node { for (; markmy_tail; ++mark) { // for each unheaped element size_type cur_pos = mark; input_type to_place; - this->fetch_item(mark,to_place); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo metainfo; +#endif + this->fetch_item(mark, to_place __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); do { // push to_place up the heap size_type parent = (cur_pos-1)>>1; if (!compare(this->get_my_item(parent), to_place)) @@ -1844,7 +2107,7 @@ class priority_queue_node : public buffer_node { this->move_item(cur_pos, parent); cur_pos = parent; } while( cur_pos ); - (void) this->place_item(cur_pos, to_place); + this->place_item(cur_pos, to_place __TBB_FLOW_GRAPH_METAINFO_ARG(std::move(metainfo))); } } @@ -1944,9 +2207,12 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > //SUCCESS // if we can reserve and can put, we consume the reservation // we increment the count and decrement the tries - if ( (my_predecessors.try_reserve(v)) == true ) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo metainfo; +#endif + if ( (my_predecessors.try_reserve(v __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo))) == true ) { reserved = true; - if ( (rval = my_successors.try_put_task(v)) != nullptr ) { + if ( (rval = my_successors.try_put_task(v __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo))) != nullptr ) { { spin_mutex::scoped_lock lock(my_mutex); ++my_count; @@ -1965,9 +2231,8 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > if ( check_conditions() ) { if ( is_graph_active(this->my_graph) ) { typedef forward_task_bypass> task_type; - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; graph_task* rtask = allocator.new_object( my_graph, allocator, *this ); - my_graph.reserve_wait(); spawn_in_graph_arena(graph_reference(), *rtask); } } @@ -1984,10 +2249,9 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > if (reserved) my_predecessors.try_release(); if ( check_conditions() ) { if ( is_graph_active(this->my_graph) ) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass> task_type; graph_task* t = allocator.new_object(my_graph, allocator, *this); - my_graph.reserve_wait(); __TBB_ASSERT(!rval, "Have two tasks to handle"); return t; } @@ -2035,10 +2299,9 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > //spawn a forward task if this is the only successor if ( was_empty && !my_predecessors.empty() && my_count + my_tries < my_threshold ) { if ( is_graph_active(this->my_graph) ) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass> task_type; graph_task* t = allocator.new_object(my_graph, allocator, *this); - my_graph.reserve_wait(); spawn_in_graph_arena(graph_reference(), *t); } } @@ -2049,7 +2312,7 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > /** r.remove_predecessor(*this) is also called. 
*/ bool remove_successor( successor_type &r ) override { // TODO revamp: investigate why qualification is needed for remove_predecessor() call - tbb::detail::d1::remove_predecessor(r, *this); + tbb::detail::d2::remove_predecessor(r, *this); my_successors.remove_successor(r); return true; } @@ -2059,10 +2322,9 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > spin_mutex::scoped_lock lock(my_mutex); my_predecessors.add( src ); if ( my_count + my_tries < my_threshold && !my_successors.empty() && is_graph_active(this->my_graph) ) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass> task_type; graph_task* t = allocator.new_object(my_graph, allocator, *this); - my_graph.reserve_wait(); spawn_in_graph_arena(graph_reference(), *t); } return true; @@ -2079,8 +2341,10 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; + +private: //! Puts an item to this receiver - graph_task* try_put_task( const T &t ) override { + graph_task* try_put_task_impl( const T &t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo) ) { { spin_mutex::scoped_lock lock(my_mutex); if ( my_count + my_tries >= my_threshold ) @@ -2089,15 +2353,14 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > ++my_tries; } - graph_task* rtask = my_successors.try_put_task(t); + graph_task* rtask = my_successors.try_put_task(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); if ( !rtask ) { // try_put_task failed. spin_mutex::scoped_lock lock(my_mutex); --my_tries; if (check_conditions() && is_graph_active(this->my_graph)) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass> task_type; rtask = allocator.new_object(my_graph, allocator, *this); - my_graph.reserve_wait(); } } else { @@ -2118,6 +2381,16 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > return rtask; } +protected: + graph_task* try_put_task(const T& t) override { + return try_put_task_impl(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T& t, const message_metainfo& metainfo) override { + return try_put_task_impl(t, metainfo); + } +#endif + graph& graph_reference() const override { return my_graph; } void reset_node( reset_flags f ) override { @@ -3054,10 +3327,9 @@ class overwrite_node : public graph_node, public receiver, public sender { // because failed reserve does not mean that register_successor is not ready to put a message immediately. // We have some sort of infinite loop: reserving node tries to set pull state for the edge, // but overwrite_node tries to return push state back. That is why we have to break this loop with task creation. 
- small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef register_predecessor_task task_type; graph_task* t = allocator.new_object(graph_reference(), allocator, *this, s); - graph_reference().reserve_wait(); spawn_in_graph_arena( my_graph, *t ); } } else { @@ -3082,11 +3354,45 @@ class overwrite_node : public graph_node, public receiver, public sender { return false; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_get( input_type &v, message_metainfo& metainfo ) override { + spin_mutex::scoped_lock l( my_mutex ); + if (my_buffer_is_valid) { + v = my_buffer; + metainfo = my_buffered_metainfo; + + // Since the successor of the node will use move semantics while wrapping the metainfo + // that is designed to transfer the ownership of the value from single-push buffer to the task + // It is required to reserve one more reference here because the value keeps in the buffer + // and the ownership is not transferred + for (auto msg_waiter : metainfo.waiters()) { + msg_waiter->reserve(1); + } + return true; + } + return false; + } +#endif + //! Reserves an item bool try_reserve( T &v ) override { return try_get(v); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +private: + bool try_reserve(T& v, message_metainfo& metainfo) override { + spin_mutex::scoped_lock l( my_mutex ); + if (my_buffer_is_valid) { + v = my_buffer; + metainfo = my_buffered_metainfo; + return true; + } + return false; + } +public: +#endif + //! Releases the reserved item bool try_release() override { return true; } @@ -3101,6 +3407,12 @@ class overwrite_node : public graph_node, public receiver, public sender { void clear() { spin_mutex::scoped_lock l( my_mutex ); my_buffer_is_valid = false; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (auto msg_waiter : my_buffered_metainfo.waiters()) { + msg_waiter->release(1); + } + my_buffered_metainfo = message_metainfo{}; +#endif } protected: @@ -3110,13 +3422,33 @@ class overwrite_node : public graph_node, public receiver, public sender { template friend class round_robin_cache; graph_task* try_put_task( const input_type &v ) override { spin_mutex::scoped_lock l( my_mutex ); - return try_put_task_impl(v); + return try_put_task_impl(v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const input_type& v, const message_metainfo& metainfo) override { + spin_mutex::scoped_lock l( my_mutex ); + return try_put_task_impl(v, metainfo); } +#endif - graph_task * try_put_task_impl(const input_type &v) { + graph_task * try_put_task_impl(const input_type &v __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { my_buffer = v; my_buffer_is_valid = true; - graph_task* rtask = my_successors.try_put_task(v); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + // Since the new item is pushed to the buffer - reserving the waiters + for (auto msg_waiter : metainfo.waiters()) { + msg_waiter->reserve(1); + } + + // Since the item is taken out from the buffer - releasing the stored waiters + for (auto msg_waiter : my_buffered_metainfo.waiters()) { + msg_waiter->release(1); + } + + my_buffered_metainfo = metainfo; +#endif + graph_task* rtask = my_successors.try_put_task(v __TBB_FLOW_GRAPH_METAINFO_ARG(my_buffered_metainfo) ); if (!rtask) rtask = SUCCESSFULLY_ENQUEUED; return rtask; } @@ -3128,13 +3460,13 @@ class overwrite_node : public graph_node, public receiver, public sender { //! 
Breaks an infinite loop between the node reservation and register_successor call struct register_predecessor_task : public graph_task { register_predecessor_task( - graph& g, small_object_allocator& allocator, predecessor_type& owner, successor_type& succ) + graph& g, d1::small_object_allocator& allocator, predecessor_type& owner, successor_type& succ) : graph_task(g, allocator), o(owner), s(succ) {}; - task* execute(execution_data& ed) override { + d1::task* execute(d1::execution_data& ed) override { // TODO revamp: investigate why qualification is needed for register_successor() call - using tbb::detail::d1::register_predecessor; - using tbb::detail::d1::register_successor; + using tbb::detail::d2::register_predecessor; + using tbb::detail::d2::register_successor; if ( !register_predecessor(s, o) ) { register_successor(o, s); } @@ -3142,7 +3474,7 @@ class overwrite_node : public graph_node, public receiver, public sender { return nullptr; } - task* cancel(execution_data& ed) override { + d1::task* cancel(d1::execution_data& ed) override { finalize(ed); return nullptr; } @@ -3154,6 +3486,9 @@ class overwrite_node : public graph_node, public receiver, public sender { spin_mutex my_mutex; broadcast_cache< input_type, null_rw_mutex > my_successors; input_type my_buffer; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo my_buffered_metainfo; +#endif bool my_buffer_is_valid; void reset_node( reset_flags f) override { @@ -3200,8 +3535,15 @@ class write_once_node : public overwrite_node { template friend class round_robin_cache; graph_task *try_put_task( const T &v ) override { spin_mutex::scoped_lock l( this->my_mutex ); - return this->my_buffer_is_valid ? nullptr : this->try_put_task_impl(v); + return this->my_buffer_is_valid ? nullptr : this->try_put_task_impl(v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T& v, const message_metainfo& metainfo) override { + spin_mutex::scoped_lock l( this->my_mutex ); + return this->my_buffer_is_valid ? 
nullptr : this->try_put_task_impl(v, metainfo); + } +#endif }; // write_once_node inline void set_name(const graph& g, const char *name) { @@ -3293,7 +3635,7 @@ inline void set_name(const async_node& node, const char * { fgt_multioutput_node_desc(&node, name); } -} // d1 +} // d2 } // detail } // tbb @@ -3304,56 +3646,56 @@ inline void set_name(const async_node& node, const char * namespace tbb { namespace flow { inline namespace v1 { - using detail::d1::receiver; - using detail::d1::sender; - - using detail::d1::serial; - using detail::d1::unlimited; - - using detail::d1::reset_flags; - using detail::d1::rf_reset_protocol; - using detail::d1::rf_reset_bodies; - using detail::d1::rf_clear_edges; - - using detail::d1::graph; - using detail::d1::graph_node; - using detail::d1::continue_msg; - - using detail::d1::input_node; - using detail::d1::function_node; - using detail::d1::multifunction_node; - using detail::d1::split_node; - using detail::d1::output_port; - using detail::d1::indexer_node; - using detail::d1::tagged_msg; - using detail::d1::cast_to; - using detail::d1::is_a; - using detail::d1::continue_node; - using detail::d1::overwrite_node; - using detail::d1::write_once_node; - using detail::d1::broadcast_node; - using detail::d1::buffer_node; - using detail::d1::queue_node; - using detail::d1::sequencer_node; - using detail::d1::priority_queue_node; - using detail::d1::limiter_node; - using namespace detail::d1::graph_policy_namespace; - using detail::d1::join_node; - using detail::d1::input_port; - using detail::d1::copy_body; - using detail::d1::make_edge; - using detail::d1::remove_edge; - using detail::d1::tag_value; - using detail::d1::composite_node; - using detail::d1::async_node; - using detail::d1::node_priority_t; - using detail::d1::no_priority; + using detail::d2::receiver; + using detail::d2::sender; + + using detail::d2::serial; + using detail::d2::unlimited; + + using detail::d2::reset_flags; + using detail::d2::rf_reset_protocol; + using detail::d2::rf_reset_bodies; + using detail::d2::rf_clear_edges; + + using detail::d2::graph; + using detail::d2::graph_node; + using detail::d2::continue_msg; + + using detail::d2::input_node; + using detail::d2::function_node; + using detail::d2::multifunction_node; + using detail::d2::split_node; + using detail::d2::output_port; + using detail::d2::indexer_node; + using detail::d2::tagged_msg; + using detail::d2::cast_to; + using detail::d2::is_a; + using detail::d2::continue_node; + using detail::d2::overwrite_node; + using detail::d2::write_once_node; + using detail::d2::broadcast_node; + using detail::d2::buffer_node; + using detail::d2::queue_node; + using detail::d2::sequencer_node; + using detail::d2::priority_queue_node; + using detail::d2::limiter_node; + using namespace detail::d2::graph_policy_namespace; + using detail::d2::join_node; + using detail::d2::input_port; + using detail::d2::copy_body; + using detail::d2::make_edge; + using detail::d2::remove_edge; + using detail::d2::tag_value; + using detail::d2::composite_node; + using detail::d2::async_node; + using detail::d2::node_priority_t; + using detail::d2::no_priority; #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET - using detail::d1::follows; - using detail::d1::precedes; - using detail::d1::make_node_set; - using detail::d1::make_edges; + using detail::d2::follows; + using detail::d2::precedes; + using detail::d2::make_node_set; + using detail::d2::make_edges; #endif } // v1 @@ -3362,7 +3704,7 @@ inline namespace v1 { using detail::d1::flow_control; namespace profiling 
{ - using detail::d1::set_name; + using detail::d2::set_name; } // profiling } // tbb diff --git a/include/oneapi/tbb/flow_graph_abstractions.h b/include/oneapi/tbb/flow_graph_abstractions.h index 121f167c4d..329e75c43e 100644 --- a/include/oneapi/tbb/flow_graph_abstractions.h +++ b/include/oneapi/tbb/flow_graph_abstractions.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { //! Pure virtual template classes that define interfaces for async communication class graph_proxy { @@ -43,7 +43,7 @@ class receiver_gateway : public graph_proxy { virtual bool try_put(const input_type&) = 0; }; -} // d1 +} // d2 } // detail diff --git a/include/oneapi/tbb/memory_pool.h b/include/oneapi/tbb/memory_pool.h index b2e6b05191..5ece879002 100644 --- a/include/oneapi/tbb/memory_pool.h +++ b/include/oneapi/tbb/memory_pool.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -97,10 +97,10 @@ class memory_pool_allocator { typedef memory_pool_allocator other; }; - explicit memory_pool_allocator(pool_type &pool) throw() : my_pool(&pool) {} - memory_pool_allocator(const memory_pool_allocator& src) throw() : my_pool(src.my_pool) {} + explicit memory_pool_allocator(pool_type &pool) noexcept : my_pool(&pool) {} + memory_pool_allocator(const memory_pool_allocator& src) noexcept : my_pool(src.my_pool) {} template - memory_pool_allocator(const memory_pool_allocator& src) throw() : my_pool(src.my_pool) {} + memory_pool_allocator(const memory_pool_allocator& src) noexcept : my_pool(src.my_pool) {} pointer address(reference x) const { return &x; } const_pointer address(const_reference x) const { return &x; } @@ -117,7 +117,7 @@ class memory_pool_allocator { my_pool->free(p); } //! Largest value for which method allocate might succeed. - size_type max_size() const throw() { + size_type max_size() const noexcept { size_type max = static_cast(-1) / sizeof (value_type); return (max > 0 ? max : 1); } @@ -149,10 +149,10 @@ class memory_pool_allocator { typedef memory_pool_allocator other; }; - explicit memory_pool_allocator( pool_type &pool) throw() : my_pool(&pool) {} - memory_pool_allocator( const memory_pool_allocator& src) throw() : my_pool(src.my_pool) {} + explicit memory_pool_allocator( pool_type &pool) noexcept : my_pool(&pool) {} + memory_pool_allocator( const memory_pool_allocator& src) noexcept : my_pool(src.my_pool) {} template - memory_pool_allocator(const memory_pool_allocator& src) throw() : my_pool(src.my_pool) {} + memory_pool_allocator(const memory_pool_allocator& src) noexcept : my_pool(src.my_pool) {} protected: pool_type *my_pool; diff --git a/include/oneapi/tbb/parallel_for_each.h b/include/oneapi/tbb/parallel_for_each.h index ab0b345388..85c0269196 100644 --- a/include/oneapi/tbb/parallel_for_each.h +++ b/include/oneapi/tbb/parallel_for_each.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -118,14 +118,17 @@ struct feeder_item_task: public task { using feeder_type = feeder_impl; template - feeder_item_task(ItemType&& input_item, feeder_type& feeder, small_object_allocator& alloc) : + feeder_item_task(ItemType&& input_item, feeder_type& feeder, small_object_allocator& alloc, wait_tree_vertex_interface& wait_vertex) : item(std::forward(input_item)), my_feeder(feeder), - my_allocator(alloc) - {} + my_allocator(alloc), + m_wait_tree_vertex(r1::get_thread_reference_vertex(&wait_vertex)) + { + m_wait_tree_vertex->reserve(); + } void finalize(const execution_data& ed) { - my_feeder.my_wait_context.release(); + m_wait_tree_vertex->release(); my_allocator.delete_object(this, ed); } @@ -160,6 +163,7 @@ struct feeder_item_task: public task { Item item; feeder_type& my_feeder; small_object_allocator my_allocator; + wait_tree_vertex_interface* m_wait_tree_vertex; }; // class feeder_item_task /** Implements new task adding procedure. @@ -170,9 +174,8 @@ class feeder_impl : public feeder { void internal_add_copy_impl(std::true_type, const Item& item) { using feeder_task = feeder_item_task; small_object_allocator alloc; - auto task = alloc.new_object(item, *this, alloc); + auto task = alloc.new_object(item, *this, alloc, my_wait_context); - my_wait_context.reserve(); spawn(*task, my_execution_context); } @@ -187,20 +190,19 @@ class feeder_impl : public feeder { void internal_add_move(Item&& item) override { using feeder_task = feeder_item_task; small_object_allocator alloc{}; - auto task = alloc.new_object(std::move(item), *this, alloc); + auto task = alloc.new_object(std::move(item), *this, alloc, my_wait_context); - my_wait_context.reserve(); spawn(*task, my_execution_context); } public: - feeder_impl(const Body& body, wait_context& w_context, task_group_context &context) + feeder_impl(const Body& body, wait_context_vertex& w_context, task_group_context &context) : my_body(body), my_wait_context(w_context) , my_execution_context(context) {} const Body& my_body; - wait_context& my_wait_context; + wait_context_vertex& my_wait_context; task_group_context& my_execution_context; }; // class feeder_impl @@ -263,7 +265,7 @@ struct input_block_handling_task : public task { using iteration_task_iterator_type = typename input_iteration_task_iterator_helper::type; using iteration_task = for_each_iteration_task; - input_block_handling_task(wait_context& root_wait_context, task_group_context& e_context, + input_block_handling_task(wait_context_vertex& root_wait_context, task_group_context& e_context, const Body& body, feeder_impl* feeder_ptr, small_object_allocator& alloc) :my_size(0), my_wait_context(0), my_root_wait_context(root_wait_context), my_execution_context(e_context), my_allocator(alloc) @@ -312,7 +314,7 @@ struct input_block_handling_task : public task { aligned_space task_pool; std::size_t my_size; wait_context my_wait_context; - wait_context& my_root_wait_context; + wait_context_vertex& my_root_wait_context; task_group_context& my_execution_context; small_object_allocator my_allocator; }; // class input_block_handling_task @@ -326,7 +328,7 @@ struct forward_block_handling_task : public task { using iteration_task = for_each_iteration_task; forward_block_handling_task(Iterator first, std::size_t size, - wait_context& w_context, task_group_context& e_context, + wait_context_vertex& w_context, task_group_context& e_context, const Body& body, feeder_impl* feeder_ptr, small_object_allocator& alloc) : my_size(size), my_wait_context(0), my_root_wait_context(w_context), @@ -373,7 +375,7 @@ 
struct forward_block_handling_task : public task { aligned_space task_pool; std::size_t my_size; wait_context my_wait_context; - wait_context& my_root_wait_context; + wait_context_vertex& my_root_wait_context; task_group_context& my_execution_context; small_object_allocator my_allocator; }; // class forward_block_handling_task @@ -456,7 +458,7 @@ using feeder_is_required = tbb::detail::void_t struct feeder_holder { - feeder_holder( wait_context&, task_group_context&, const Body& ) {} + feeder_holder( wait_context_vertex&, task_group_context&, const Body& ) {} feeder_impl* feeder_ptr() { return nullptr; } }; // class feeder_holder @@ -464,7 +466,7 @@ struct feeder_holder { template class feeder_holder> { public: - feeder_holder( wait_context& w_context, task_group_context& context, const Body& body ) + feeder_holder( wait_context_vertex& w_context, task_group_context& context, const Body& body ) : my_feeder(body, w_context, context) {} feeder_impl* feeder_ptr() { return &my_feeder; } @@ -475,7 +477,7 @@ class feeder_holder class for_each_root_task_base : public task { public: - for_each_root_task_base(Iterator first, Iterator last, const Body& body, wait_context& w_context, task_group_context& e_context) + for_each_root_task_base(Iterator first, Iterator last, const Body& body, wait_context_vertex& w_context, task_group_context& e_context) : my_first(first), my_last(last), my_wait_context(w_context), my_execution_context(e_context), my_body(body), my_feeder_holder(my_wait_context, my_execution_context, my_body) { @@ -489,7 +491,7 @@ class for_each_root_task_base : public task { protected: Iterator my_first; Iterator my_last; - wait_context& my_wait_context; + wait_context_vertex& my_wait_context; task_group_context& my_execution_context; const Body& my_body; feeder_holder my_feeder_holder; @@ -624,11 +626,11 @@ void run_parallel_for_each( Iterator first, Iterator last, const Body& body, tas { if (!(first == last)) { using ItemType = get_item_type::value_type>; - wait_context w_context(0); + wait_context_vertex w_context(0); for_each_root_task root_task(first, last, body, w_context, context); - execute_and_wait(root_task, context, w_context, context); + execute_and_wait(root_task, context, w_context.get_context(), context); } } diff --git a/include/oneapi/tbb/task_group.h b/include/oneapi/tbb/task_group.h index 04e241f607..c0811c8502 100644 --- a/include/oneapi/tbb/task_group.h +++ b/include/oneapi/tbb/task_group.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -45,7 +45,6 @@ namespace d1 { class delegate_base; class task_arena_base; class task_group_context; -class task_group_base; } namespace r1 { @@ -97,8 +96,8 @@ class function_task : public task_handle_task { } public: template - function_task(FF&& f, d1::wait_context& wo, d1::task_group_context& ctx, d1::small_object_allocator& alloc) - : task_handle_task{wo, ctx, alloc}, + function_task(FF&& f, d1::wait_tree_vertex_interface* vertex, d1::task_group_context& ctx, d1::small_object_allocator& alloc) + : task_handle_task{vertex, ctx, alloc}, m_func(std::forward(f)) {} }; @@ -414,11 +413,20 @@ class task_group_context : no_copy { friend class r1::context_guard_helper; friend struct r1::task_arena_impl; friend struct r1::task_group_context_impl; - friend class task_group_base; + friend class d2::task_group_base; }; // class task_group_context static_assert(sizeof(task_group_context) == 128, "Wrong size of task_group_context"); +inline bool is_current_task_group_canceling() { + task_group_context* ctx = current_context(); + return ctx ? ctx->is_group_execution_cancelled() : false; +} + +} // namespace d1 + +namespace d2 { + enum task_group_status { not_complete, complete, @@ -431,77 +439,41 @@ class structured_task_group; class isolated_task_group; #endif -template -class function_task : public task { - const F m_func; - wait_context& m_wait_ctx; - small_object_allocator m_allocator; - - void finalize(const execution_data& ed) { - // Make a local reference not to access this after destruction. - wait_context& wo = m_wait_ctx; - // Copy allocator to the stack - auto allocator = m_allocator; - // Destroy user functor before release wait. - this->~function_task(); - wo.release(); - - allocator.deallocate(this, ed); - } - task* execute(execution_data& ed) override { - task* res = d2::task_ptr_or_nullptr(m_func); - finalize(ed); - return res; - } - task* cancel(execution_data& ed) override { - finalize(ed); - return nullptr; - } -public: - function_task(const F& f, wait_context& wo, small_object_allocator& alloc) - : m_func(f) - , m_wait_ctx(wo) - , m_allocator(alloc) {} - - function_task(F&& f, wait_context& wo, small_object_allocator& alloc) - : m_func(std::move(f)) - , m_wait_ctx(wo) - , m_allocator(alloc) {} -}; - template -class function_stack_task : public task { +class function_stack_task : public d1::task { const F& m_func; - wait_context& m_wait_ctx; + d1::wait_tree_vertex_interface* m_wait_tree_vertex; void finalize() { - m_wait_ctx.release(); + m_wait_tree_vertex->release(); } - task* execute(execution_data&) override { + task* execute(d1::execution_data&) override { task* res = d2::task_ptr_or_nullptr(m_func); finalize(); return res; } - task* cancel(execution_data&) override { + task* cancel(d1::execution_data&) override { finalize(); return nullptr; } public: - function_stack_task(const F& f, wait_context& wo) : m_func(f), m_wait_ctx(wo) {} + function_stack_task(const F& f, d1::wait_tree_vertex_interface* vertex) : m_func(f), m_wait_tree_vertex(vertex) { + m_wait_tree_vertex->reserve(); + } }; class task_group_base : no_copy { protected: - wait_context m_wait_ctx; - task_group_context m_context; + d1::wait_context_vertex m_wait_vertex; + d1::task_group_context m_context; template task_group_status internal_run_and_wait(const F& f) { - function_stack_task t{ f, m_wait_ctx }; - m_wait_ctx.reserve(); + function_stack_task t{ f, r1::get_thread_reference_vertex(&m_wait_vertex) }; + bool cancellation_status = false; try_call([&] { - execute_and_wait(t, context(), m_wait_ctx, context()); + 
execute_and_wait(t, context(), m_wait_vertex.get_context(), context()); }).on_completion([&] { // TODO: the reset method is not thread-safe. Ensure the correct behavior. cancellation_status = context().is_group_execution_cancelled(); @@ -518,7 +490,7 @@ class task_group_base : no_copy { bool cancellation_status = false; try_call([&] { - execute_and_wait(*acs::release(h), context(), m_wait_ctx, context()); + execute_and_wait(*acs::release(h), context(), m_wait_vertex.get_context(), context()); }).on_completion([&] { // TODO: the reset method is not thread-safe. Ensure the correct behavior. cancellation_status = context().is_group_execution_cancelled(); @@ -528,39 +500,39 @@ class task_group_base : no_copy { } template - task* prepare_task(F&& f) { - m_wait_ctx.reserve(); - small_object_allocator alloc{}; - return alloc.new_object::type>>(std::forward(f), m_wait_ctx, alloc); + d1::task* prepare_task(F&& f) { + d1::small_object_allocator alloc{}; + return alloc.new_object::type>>(std::forward(f), + r1::get_thread_reference_vertex(&m_wait_vertex), context(), alloc); } - task_group_context& context() noexcept { + d1::task_group_context& context() noexcept { return m_context.actual_context(); } template d2::task_handle prepare_task_handle(F&& f) { - m_wait_ctx.reserve(); - small_object_allocator alloc{}; + d1::small_object_allocator alloc{}; using function_task_t = d2::function_task::type>; - d2::task_handle_task* function_task_p = alloc.new_object(std::forward(f), m_wait_ctx, context(), alloc); + d2::task_handle_task* function_task_p = alloc.new_object(std::forward(f), + r1::get_thread_reference_vertex(&m_wait_vertex), context(), alloc); return d2::task_handle_accessor::construct(function_task_p); } public: task_group_base(uintptr_t traits = 0) - : m_wait_ctx(0) - , m_context(task_group_context::bound, task_group_context::default_traits | traits) + : m_wait_vertex(0) + , m_context(d1::task_group_context::bound, d1::task_group_context::default_traits | traits) {} - task_group_base(task_group_context& ctx) - : m_wait_ctx(0) + task_group_base(d1::task_group_context& ctx) + : m_wait_vertex(0) , m_context(&ctx) {} ~task_group_base() noexcept(false) { - if (m_wait_ctx.continue_execution()) { + if (m_wait_vertex.continue_execution()) { #if __TBB_CPP17_UNCAUGHT_EXCEPTIONS_PRESENT bool stack_unwinding_in_progress = std::uncaught_exceptions() > 0; #else @@ -570,7 +542,7 @@ class task_group_base : no_copy { // in case of missing wait (for the sake of better testability & debuggability) if (!context().is_group_execution_cancelled()) cancel(); - d1::wait(m_wait_ctx, context()); + d1::wait(m_wait_vertex.get_context(), context()); if (!stack_unwinding_in_progress) throw_exception(exception_id::missing_wait); } @@ -579,7 +551,7 @@ class task_group_base : no_copy { task_group_status wait() { bool cancellation_status = false; try_call([&] { - d1::wait(m_wait_ctx, context()); + d1::wait(m_wait_vertex.get_context(), context()); }).on_completion([&] { // TODO: the reset method is not thread-safe. Ensure the correct behavior. 
cancellation_status = m_context.is_group_execution_cancelled(); @@ -595,12 +567,12 @@ class task_group_base : no_copy { class task_group : public task_group_base { public: - task_group() : task_group_base(task_group_context::concurrent_wait) {} - task_group(task_group_context& ctx) : task_group_base(ctx) {} + task_group() : task_group_base(d1::task_group_context::concurrent_wait) {} + task_group(d1::task_group_context& ctx) : task_group_base(ctx) {} template void run(F&& f) { - spawn(*prepare_task(std::forward(f)), context()); + d1::spawn(*prepare_task(std::forward(f)), context()); } void run(d2::task_handle&& h) { @@ -609,7 +581,7 @@ class task_group : public task_group_base { using acs = d2::task_handle_accessor; __TBB_ASSERT(&acs::ctx_of(h) == &context(), "Attempt to schedule task_handle into different task_group"); - spawn(*acs::release(h), context()); + d1::spawn(*acs::release(h), context()); } template @@ -629,20 +601,20 @@ class task_group : public task_group_base { }; // class task_group #if TBB_PREVIEW_ISOLATED_TASK_GROUP -class spawn_delegate : public delegate_base { - task* task_to_spawn; - task_group_context& context; +class spawn_delegate : public d1::delegate_base { + d1::task* task_to_spawn; + d1::task_group_context& context; bool operator()() const override { spawn(*task_to_spawn, context); return true; } public: - spawn_delegate(task* a_task, task_group_context& ctx) + spawn_delegate(d1::task* a_task, d1::task_group_context& ctx) : task_to_spawn(a_task), context(ctx) {} }; -class wait_delegate : public delegate_base { +class wait_delegate : public d1::delegate_base { bool operator()() const override { status = tg.wait(); return true; @@ -674,7 +646,7 @@ class isolated_task_group : public task_group { public: isolated_task_group() : task_group() {} - isolated_task_group(task_group_context& ctx) : task_group(ctx) {} + isolated_task_group(d1::task_group_context& ctx) : task_group(ctx) {} template void run(F&& f) { @@ -710,26 +682,20 @@ class isolated_task_group : public task_group { } }; // class isolated_task_group #endif // TBB_PREVIEW_ISOLATED_TASK_GROUP - -inline bool is_current_task_group_canceling() { - task_group_context* ctx = current_context(); - return ctx ? 
ctx->is_group_execution_cancelled() : false; -} - -} // namespace d1 +} // namespace d2 } // namespace detail inline namespace v1 { using detail::d1::task_group_context; -using detail::d1::task_group; +using detail::d2::task_group; #if TBB_PREVIEW_ISOLATED_TASK_GROUP -using detail::d1::isolated_task_group; +using detail::d2::isolated_task_group; #endif -using detail::d1::task_group_status; -using detail::d1::not_complete; -using detail::d1::complete; -using detail::d1::canceled; +using detail::d2::task_group_status; +using detail::d2::not_complete; +using detail::d2::complete; +using detail::d2::canceled; using detail::d1::is_current_task_group_canceling; using detail::r1::missing_wait; diff --git a/include/oneapi/tbb/version.h b/include/oneapi/tbb/version.h index fff3e7e2f9..c8f3ad50e3 100644 --- a/include/oneapi/tbb/version.h +++ b/include/oneapi/tbb/version.h @@ -27,9 +27,9 @@ #endif // Product version -#define TBB_VERSION_MAJOR 2021 +#define TBB_VERSION_MAJOR 2022 // Update version -#define TBB_VERSION_MINOR 13 +#define TBB_VERSION_MINOR 0 // "Patch" version for custom releases #define TBB_VERSION_PATCH 0 // Suffix string @@ -44,7 +44,7 @@ // OneAPI oneTBB specification version #define ONETBB_SPEC_VERSION "1.0" // Full interface version -#define TBB_INTERFACE_VERSION 12130 +#define TBB_INTERFACE_VERSION 12140 // Major interface version #define TBB_INTERFACE_VERSION_MAJOR (TBB_INTERFACE_VERSION/1000) // Minor interface version diff --git a/src/tbb/CMakeLists.txt b/src/tbb/CMakeLists.txt index b996c736a7..8c84a0b29b 100644 --- a/src/tbb/CMakeLists.txt +++ b/src/tbb/CMakeLists.txt @@ -126,6 +126,25 @@ target_link_libraries(tbb ${TBB_COMMON_LINK_LIBS} ) +# Strip debug symbols into a separate .dbg file +if(TBB_LINUX_SEPARATE_DBG) + if(NOT CMAKE_BUILD_TYPE STREQUAL "release") + find_program(OBJCOPY_COMMAND objcopy) + if(NOT OBJCOPY_COMMAND) + message(WARNING "objcopy command not found in the system") + else() + add_custom_command(TARGET tbb POST_BUILD + COMMAND objcopy --only-keep-debug $<TARGET_FILE:tbb> $<TARGET_FILE:tbb>.dbg + COMMAND objcopy --strip-debug $<TARGET_FILE:tbb> + COMMAND objcopy --add-gnu-debuglink=$<TARGET_FILE:tbb>.dbg $<TARGET_FILE:tbb> + COMMENT "Creating and associating .dbg file with tbb" + ) + endif() + else() + message(WARNING "TBB_LINUX_SEPARATE_DBG flag has no effect on the release configuration") + endif() +endif() + if(TBB_BUILD_APPLE_FRAMEWORKS) set_target_properties(tbb PROPERTIES FRAMEWORK TRUE @@ -158,7 +177,13 @@ if (TBB_INSTALL) COMPONENT devel ) endif() - + if(TBB_LINUX_SEPARATE_DBG) install(FILES + $<TARGET_FILE:tbb>.dbg + DESTINATION lib + COMPONENT devel + ) + endif() set(_tbb_pc_lib_name tbb) if (WIN32) diff --git a/src/tbb/allocator.cpp b/src/tbb/allocator.cpp index 888f43fd33..689c51255d 100644 --- a/src/tbb/allocator.cpp +++ b/src/tbb/allocator.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -157,6 +157,14 @@ void initialize_cache_aligned_allocator() { } //! Executed on very first call through allocate_handler +/** Only one of initialize_allocate_handler() and initialize_cache_aligned_allocate_handler() + is called, since each one of them also initializes the other. + + In the current implementation of oneTBB library initialization, cache_aligned_allocate() is + used, which in turn calls initialize_cache_aligned_allocate_handler(). As mentioned above, + that also initializes the regular allocate_handler.
+ + Therefore, initialize_allocate_handler() is not called in the current library implementation. */ static void* initialize_allocate_handler(std::size_t size) { initialize_cache_aligned_allocator(); __TBB_ASSERT(allocate_handler != &initialize_allocate_handler, nullptr); diff --git a/src/tbb/arena.cpp b/src/tbb/arena.cpp index 0e7cf43c3b..6ca062d02f 100644 --- a/src/tbb/arena.cpp +++ b/src/tbb/arena.cpp @@ -195,8 +195,6 @@ void arena::process(thread_data& tls) { return; } - my_tc_client.get_pm_client()->register_thread(); - __TBB_ASSERT( index >= my_num_reserved_slots, "Workers cannot occupy reserved slots" ); tls.attach_arena(*this, index); // worker thread enters the dispatch loop to look for a work @@ -236,8 +234,6 @@ void arena::process(thread_data& tls) { __TBB_ASSERT(tls.my_inbox.is_idle_state(true), nullptr); __TBB_ASSERT(is_alive(my_guard), nullptr); - my_tc_client.get_pm_client()->unregister_thread(); - // In contrast to earlier versions of TBB (before 3.0 U5) now it is possible // that arena may be temporarily left unpopulated by threads. See comments in // arena::on_thread_leaving() for more details. @@ -503,6 +499,7 @@ struct task_arena_impl { static void wait(d1::task_arena_base&); static int max_concurrency(const d1::task_arena_base*); static void enqueue(d1::task&, d1::task_group_context*, d1::task_arena_base*); + static d1::slot_id execution_slot(const d1::task_arena_base&); }; void __TBB_EXPORTED_FUNC initialize(d1::task_arena_base& ta) { @@ -533,6 +530,10 @@ void __TBB_EXPORTED_FUNC enqueue(d1::task& t, d1::task_group_context& ctx, d1::t task_arena_impl::enqueue(t, &ctx, ta); } +d1::slot_id __TBB_EXPORTED_FUNC execution_slot(const d1::task_arena_base& arena) { + return task_arena_impl::execution_slot(arena); +} + void task_arena_impl::initialize(d1::task_arena_base& ta) { // Enforce global market initialization to properly initialize soft limit (void)governor::get_thread_data(); @@ -559,7 +560,7 @@ void task_arena_impl::initialize(d1::task_arena_base& ta) { ta.my_numa_id, ta.core_type(), ta.max_threads_per_core()); if (observer) { // TODO: Consider lazy initialization for internal arena so - // the direct calls to observer might be omitted until actual initialization. + // the direct calls to observer might be omitted until actual initialization. 
observer->on_scheduler_entry(true); } #endif /*__TBB_CPUBIND_PRESENT*/ @@ -624,6 +625,14 @@ void task_arena_impl::enqueue(d1::task& t, d1::task_group_context* c, d1::task_a a->enqueue_task(t, *ctx, *td); } +d1::slot_id task_arena_impl::execution_slot(const d1::task_arena_base& ta) { + thread_data* td = governor::get_thread_data_if_initialized(); + if (td && (td->is_attached_to(ta.my_arena.load(std::memory_order_relaxed)))) { + return td->my_arena_index; + } + return d1::slot_id(-1); +} + class nested_arena_context : no_copy { public: nested_arena_context(thread_data& td, arena& nested_arena, std::size_t slot_index) @@ -633,9 +642,11 @@ class nested_arena_context : no_copy { m_orig_arena = td.my_arena; m_orig_slot_index = td.my_arena_index; m_orig_last_observer = td.my_last_observer; + m_orig_is_thread_registered = td.my_is_registered; td.detach_task_dispatcher(); td.attach_arena(nested_arena, slot_index); + td.my_is_registered = false; if (td.my_inbox.is_idle_state(true)) td.my_inbox.set_is_idle(false); task_dispatcher& task_disp = td.my_arena_slot->default_task_dispatcher(); @@ -686,7 +697,7 @@ class nested_arena_context : no_copy { td.leave_task_dispatcher(); td.my_arena_slot->release(); td.my_arena->my_exit_monitors.notify_one(); // do not relax! - + td.my_is_registered = m_orig_is_thread_registered; td.attach_arena(*m_orig_arena, m_orig_slot_index); td.attach_task_dispatcher(*m_orig_execute_data_ext.task_disp); __TBB_ASSERT(td.my_inbox.is_idle_state(false), nullptr); @@ -702,6 +713,7 @@ class nested_arena_context : no_copy { unsigned m_orig_slot_index{}; bool m_orig_fifo_tasks_allowed{}; bool m_orig_critical_task_allowed{}; + bool m_orig_is_thread_registered{}; }; class delegated_task : public d1::task { diff --git a/src/tbb/def/lin32-tbb.def b/src/tbb/def/lin32-tbb.def index ec03c3aa5c..737e8ec2af 100644 --- a/src/tbb/def/lin32-tbb.def +++ b/src/tbb/def/lin32-tbb.def @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -77,6 +77,7 @@ _ZN3tbb6detail2r17suspendEPFvPvPNS1_18suspend_point_typeEES2_; _ZN3tbb6detail2r16resumeEPNS1_18suspend_point_typeE; _ZN3tbb6detail2r121current_suspend_pointEv; _ZN3tbb6detail2r114notify_waitersEj; +_ZN3tbb6detail2r127get_thread_reference_vertexEPNS0_2d126wait_tree_vertex_interfaceE; /* Task dispatcher (task_dispatcher.cpp) */ _ZN3tbb6detail2r114execution_slotEPKNS0_2d114execution_dataE; @@ -105,6 +106,7 @@ _ZN3tbb6detail2r120isolate_within_arenaERNS0_2d113delegate_baseEi; _ZN3tbb6detail2r17enqueueERNS0_2d14taskEPNS2_15task_arena_baseE; _ZN3tbb6detail2r17enqueueERNS0_2d14taskERNS2_18task_group_contextEPNS2_15task_arena_baseE; _ZN3tbb6detail2r14waitERNS0_2d115task_arena_baseE; +_ZN3tbb6detail2r114execution_slotERKNS0_2d115task_arena_baseE; /* System topology parsing and threads pinning (governor.cpp) */ _ZN3tbb6detail2r115numa_node_countEv; @@ -159,4 +161,3 @@ local: /* TODO: fill more precisely */ *; }; - diff --git a/src/tbb/def/lin64-tbb.def b/src/tbb/def/lin64-tbb.def index 119eea1348..41aca2e932 100644 --- a/src/tbb/def/lin64-tbb.def +++ b/src/tbb/def/lin64-tbb.def @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
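The new execution_slot() entry point added above reports the slot index the calling thread occupies in a given arena, or d1::slot_id(-1) when the thread is not attached to it. A hedged sketch (not from this patch) of the long-standing public analogue of that behavior:

#include <oneapi/tbb/task_arena.h>
#include <iostream>

int main() {
    tbb::task_arena arena(4);
    arena.execute([] {
        // Inside the arena the calling thread occupies a concrete slot,
        // so the index lies in [0, arena.max_concurrency()).
        std::cout << "slot: " << tbb::this_task_arena::current_thread_index() << '\n';
    });
    // Outside of any arena the index is tbb::task_arena::not_initialized.
    return 0;
}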
@@ -77,6 +77,7 @@ _ZN3tbb6detail2r17suspendEPFvPvPNS1_18suspend_point_typeEES2_; _ZN3tbb6detail2r16resumeEPNS1_18suspend_point_typeE; _ZN3tbb6detail2r121current_suspend_pointEv; _ZN3tbb6detail2r114notify_waitersEm; +_ZN3tbb6detail2r127get_thread_reference_vertexEPNS0_2d126wait_tree_vertex_interfaceE; /* Task dispatcher (task_dispatcher.cpp) */ _ZN3tbb6detail2r114execution_slotEPKNS0_2d114execution_dataE; @@ -105,6 +106,7 @@ _ZN3tbb6detail2r120isolate_within_arenaERNS0_2d113delegate_baseEl; _ZN3tbb6detail2r17enqueueERNS0_2d14taskEPNS2_15task_arena_baseE; _ZN3tbb6detail2r17enqueueERNS0_2d14taskERNS2_18task_group_contextEPNS2_15task_arena_baseE; _ZN3tbb6detail2r14waitERNS0_2d115task_arena_baseE; +_ZN3tbb6detail2r114execution_slotERKNS0_2d115task_arena_baseE; /* System topology parsing and threads pinning (governor.cpp) */ _ZN3tbb6detail2r115numa_node_countEv; diff --git a/src/tbb/def/mac64-tbb.def b/src/tbb/def/mac64-tbb.def index fcccd7b858..38bc48d30e 100644 --- a/src/tbb/def/mac64-tbb.def +++ b/src/tbb/def/mac64-tbb.def @@ -1,4 +1,4 @@ -# Copyright (c) 2005-2021 Intel Corporation +# Copyright (c) 2005-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -79,6 +79,7 @@ __ZN3tbb6detail2r17suspendEPFvPvPNS1_18suspend_point_typeEES2_ __ZN3tbb6detail2r16resumeEPNS1_18suspend_point_typeE __ZN3tbb6detail2r121current_suspend_pointEv __ZN3tbb6detail2r114notify_waitersEm +__ZN3tbb6detail2r127get_thread_reference_vertexEPNS0_2d126wait_tree_vertex_interfaceE # Task dispatcher (task_dispatcher.cpp) __ZN3tbb6detail2r114execution_slotEPKNS0_2d114execution_dataE @@ -107,6 +108,7 @@ __ZN3tbb6detail2r120isolate_within_arenaERNS0_2d113delegate_baseEl __ZN3tbb6detail2r17enqueueERNS0_2d14taskEPNS2_15task_arena_baseE __ZN3tbb6detail2r17enqueueERNS0_2d14taskERNS2_18task_group_contextEPNS2_15task_arena_baseE __ZN3tbb6detail2r14waitERNS0_2d115task_arena_baseE +__ZN3tbb6detail2r114execution_slotERKNS0_2d115task_arena_baseE # System topology parsing and threads pinning (governor.cpp) __ZN3tbb6detail2r115numa_node_countEv @@ -156,4 +158,3 @@ __ZN3tbb6detail2r121notify_by_address_allEPv # Versioning (version.cpp) _TBB_runtime_interface_version _TBB_runtime_version - diff --git a/src/tbb/def/win32-tbb.def b/src/tbb/def/win32-tbb.def index 6863914028..94b5441701 100644 --- a/src/tbb/def/win32-tbb.def +++ b/src/tbb/def/win32-tbb.def @@ -1,4 +1,4 @@ -; Copyright (c) 2005-2021 Intel Corporation +; Copyright (c) 2005-2024 Intel Corporation ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. 
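For readability, the two symbols added to each of the .def export lists in this patch demangle to the following r1 entry points (declarations reconstructed from their definitions in arena.cpp above and task.cpp below):

namespace tbb { namespace detail { namespace r1 {
    // Per-thread reference vertex used for scalable waiting (task.cpp)
    d1::wait_tree_vertex_interface* get_thread_reference_vertex(d1::wait_tree_vertex_interface* top_wait_context);
    // Slot occupied by the calling thread in the given arena (arena.cpp)
    d1::slot_id execution_slot(const d1::task_arena_base& arena);
}}}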
@@ -71,6 +71,7 @@ EXPORTS ?resume@r1@detail@tbb@@YAXPAUsuspend_point_type@123@@Z ?suspend@r1@detail@tbb@@YAXP6AXPAXPAUsuspend_point_type@123@@Z0@Z ?notify_waiters@r1@detail@tbb@@YAXI@Z +?get_thread_reference_vertex@r1@detail@tbb@@YAPAVwait_tree_vertex_interface@d1@23@PAV4523@@Z ; Task dispatcher (task_dispatcher.cpp) ?spawn@r1@detail@tbb@@YAXAAVtask@d1@23@AAVtask_group_context@523@G@Z @@ -99,6 +100,7 @@ EXPORTS ?terminate@r1@detail@tbb@@YAXAAVtask_arena_base@d1@23@@Z ?wait@r1@detail@tbb@@YAXAAVtask_arena_base@d1@23@@Z ?enqueue@r1@detail@tbb@@YAXAAVtask@d1@23@AAVtask_group_context@523@PAVtask_arena_base@523@@Z +?execution_slot@r1@detail@tbb@@YAGABVtask_arena_base@d1@23@@Z ; System topology parsing and threads pinning (governor.cpp) ?numa_node_count@r1@detail@tbb@@YAIXZ diff --git a/src/tbb/def/win64-tbb.def b/src/tbb/def/win64-tbb.def index 306903c129..96bafc0163 100644 --- a/src/tbb/def/win64-tbb.def +++ b/src/tbb/def/win64-tbb.def @@ -1,4 +1,4 @@ -; Copyright (c) 2005-2021 Intel Corporation +; Copyright (c) 2005-2024 Intel Corporation ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. @@ -71,6 +71,7 @@ EXPORTS ?resume@r1@detail@tbb@@YAXPEAUsuspend_point_type@123@@Z ?current_suspend_point@r1@detail@tbb@@YAPEAUsuspend_point_type@123@XZ ?notify_waiters@r1@detail@tbb@@YAX_K@Z +?get_thread_reference_vertex@r1@detail@tbb@@YAPEAVwait_tree_vertex_interface@d1@23@PEAV4523@@Z ; Task dispatcher (task_dispatcher.cpp) ?spawn@r1@detail@tbb@@YAXAEAVtask@d1@23@AEAVtask_group_context@523@@Z @@ -99,6 +100,7 @@ EXPORTS ?isolate_within_arena@r1@detail@tbb@@YAXAEAVdelegate_base@d1@23@_J@Z ?enqueue@r1@detail@tbb@@YAXAEAVtask@d1@23@PEAVtask_arena_base@523@@Z ?enqueue@r1@detail@tbb@@YAXAEAVtask@d1@23@AEAVtask_group_context@523@PEAVtask_arena_base@523@@Z +?execution_slot@r1@detail@tbb@@YAGAEBVtask_arena_base@d1@23@@Z ; System topology parsing and threads pinning (governor.cpp) ?numa_node_count@r1@detail@tbb@@YAIXZ diff --git a/src/tbb/global_control.cpp b/src/tbb/global_control.cpp index 127fc92db3..f45c66b87f 100644 --- a/src/tbb/global_control.cpp +++ b/src/tbb/global_control.cpp @@ -104,6 +104,8 @@ class alignas(max_nfs_size) stack_size_control : public control_storage { return hi - lo; }(); return ThreadStackSizeDefault; +#elif defined(EMSCRIPTEN) + return __TBB_EMSCRIPTEN_STACK_SIZE; #else return ThreadStackSize; #endif diff --git a/src/tbb/governor.cpp b/src/tbb/governor.cpp index 55175196b2..218a2bc533 100644 --- a/src/tbb/governor.cpp +++ b/src/tbb/governor.cpp @@ -37,10 +37,18 @@ #include #include +#ifdef EMSCRIPTEN +#include +#endif + namespace tbb { namespace detail { namespace r1 { +#if TBB_USE_ASSERT +std::atomic the_observer_proxy_count; +#endif /* TBB_USE_ASSERT */ + void clear_address_waiter_table(); void global_control_acquire(); void global_control_release(); @@ -86,6 +94,12 @@ void governor::release_resources () { runtime_warning("failed to destroy task scheduler TLS: %s", std::strerror(status)); clear_address_waiter_table(); +#if TBB_USE_ASSERT + if (the_observer_proxy_count != 0) { + runtime_warning("Leaked %ld observer_proxy objects\n", long(the_observer_proxy_count)); + } +#endif /* TBB_USE_ASSERT */ + system_topology::destroy(); dynamic_unlink_all(); global_control_release(); @@ -145,6 +159,9 @@ static std::uintptr_t get_stack_base(std::size_t stack_size) { NT_TIB* pteb = (NT_TIB*)NtCurrentTeb(); __TBB_ASSERT(&pteb < pteb->StackBase && &pteb > pteb->StackLimit, "invalid stack info in TEB"); return 
reinterpret_cast<std::uintptr_t>(pteb->StackBase); +#elif defined(EMSCRIPTEN) + suppress_unused_warning(stack_size); + return reinterpret_cast<std::uintptr_t>(emscripten_stack_get_base()); #else // There is no portable way to get stack base address in Posix, so we use // non-portable method (on all modern Linux) or the simplified approach diff --git a/src/tbb/main.cpp b/src/tbb/main.cpp index 85e759e2f6..f43c33f5b7 100644 --- a/src/tbb/main.cpp +++ b/src/tbb/main.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -72,21 +72,6 @@ void ITT_DoUnsafeOneTimeInitialization(); static __TBB_InitOnce __TBB_InitOnceHiddenInstance; #endif -#if TBB_USE_ASSERT -std::atomic the_observer_proxy_count; - -struct check_observer_proxy_count { - ~check_observer_proxy_count() { - if (the_observer_proxy_count != 0) { - runtime_warning("Leaked %ld observer_proxy objects\n", long(the_observer_proxy_count)); - } - } -}; -// The proxy count checker shall be defined after __TBB_InitOnceHiddenInstance to check the count -// after auto termination. -static check_observer_proxy_count the_check_observer_proxy_count; -#endif /* TBB_USE_ASSERT */ - //------------------------------------------------------------------------ // __TBB_InitOnce //------------------------------------------------------------------------ diff --git a/src/tbb/scheduler_common.h b/src/tbb/scheduler_common.h index f9e8a68d37..e4686e1673 100644 --- a/src/tbb/scheduler_common.h +++ b/src/tbb/scheduler_common.h @@ -23,6 +23,7 @@ #include "oneapi/tbb/detail/_machine.h" #include "oneapi/tbb/task_group.h" #include "oneapi/tbb/cache_aligned_allocator.h" +#include "oneapi/tbb/tbb_allocator.h" #include "itt_notify.h" #include "co_context.h" #include "misc.h" @@ -42,6 +43,7 @@ #include #include #include <memory> // unique_ptr +#include <unordered_map> //! Mutex type for global locks in the scheduler using scheduler_mutex_type = __TBB_SCHEDULER_MUTEX_TYPE; @@ -395,7 +397,7 @@ struct suspend_point_type { void finilize_resume() { m_stack_state.store(stack_state::active, std::memory_order_relaxed); - // Set the suspended state for the stack that we left. If the state is already notified, it means that + // Set the suspended state for the stack that we left. If the state is already notified, it means that // someone already tried to resume our previous stack but failed. So, we need to resume it. // m_prev_suspend_point might be nullptr when destroying co_context based on threads if (m_prev_suspend_point && m_prev_suspend_point->m_stack_state.exchange(stack_state::suspended) == stack_state::notified) { @@ -474,6 +476,13 @@ class alignas (max_nfs_size) task_dispatcher { //! Suspend point (null if this task dispatcher has never been suspended) suspend_point_type* m_suspend_point{ nullptr }; + //! Used to improve scalability of d1::wait_context by using a per-thread reference counter + std::unordered_map<d1::wait_tree_vertex_interface*, d1::reference_vertex*, std::hash<d1::wait_tree_vertex_interface*>, std::equal_to<d1::wait_tree_vertex_interface*>, + tbb_allocator<std::pair<d1::wait_tree_vertex_interface* const, d1::reference_vertex*>> + > + m_reference_vertex_map; + //! Attempt to get a task from the mailbox. /** Gets a task only if it has not been executed by its sender or a thief that has stolen it from the sender's task pool. Otherwise returns nullptr.
@@ -502,6 +511,14 @@ class alignas (max_nfs_size) task_dispatcher { m_suspend_point->~suspend_point_type(); cache_aligned_deallocate(m_suspend_point); } + + for (auto& elem : m_reference_vertex_map) { + d1::reference_vertex*& node = elem.second; + node->~reference_vertex(); + cache_aligned_deallocate(node); + poison_pointer(node); + } + poison_pointer(m_thread_data); poison_pointer(m_suspend_point); } diff --git a/src/tbb/task.cpp b/src/tbb/task.cpp index 08463bf398..84b4278f0a 100644 --- a/src/tbb/task.cpp +++ b/src/tbb/task.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -221,7 +221,37 @@ void notify_waiters(std::uintptr_t wait_ctx_addr) { governor::get_thread_data()->my_arena->get_waiting_threads_monitor().notify(is_related_wait_ctx); } +d1::wait_tree_vertex_interface* get_thread_reference_vertex(d1::wait_tree_vertex_interface* top_wait_context) { + __TBB_ASSERT(top_wait_context, nullptr); + auto& dispatcher = *governor::get_thread_data()->my_task_dispatcher; + + d1::reference_vertex* ref_counter{nullptr}; + auto& reference_map = dispatcher.m_reference_vertex_map; + auto pos = reference_map.find(top_wait_context); + if (pos != reference_map.end()) { + ref_counter = pos->second; + } else { + constexpr std::size_t max_reference_vertex_map_size = 1000; + if (reference_map.size() > max_reference_vertex_map_size) { + // TODO: Research the possibility of using a better approach for clean-up + for (auto it = reference_map.begin(); it != reference_map.end();) { + if (it->second->get_num_child() == 0) { + it->second->~reference_vertex(); + cache_aligned_deallocate(it->second); + it = reference_map.erase(it); + } else { + ++it; + } + } + } + + reference_map[top_wait_context] = ref_counter = + new (cache_aligned_allocate(sizeof(d1::reference_vertex))) d1::reference_vertex(top_wait_context, 0); + } + + return ref_counter; +} + } // namespace r1 } // namespace detail } // namespace tbb - diff --git a/src/tbb/task_dispatcher.h b/src/tbb/task_dispatcher.h index 20c7c731a7..c818934e5a 100644 --- a/src/tbb/task_dispatcher.h +++ b/src/tbb/task_dispatcher.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2023 Intel Corporation + Copyright (c) 2020-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
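Distilled from get_thread_reference_vertex() above, a simplified and purely illustrative sketch of the caching policy (the names here are hypothetical, not oneTBB API): each thread keeps a map from a shared wait counter to its local proxy and lazily evicts childless entries once the map exceeds a fixed cap, so repeated waits stay contention-free without unbounded growth.

#include <unordered_map>
#include <cstddef>

struct proxy_vertex { std::size_t num_children = 0; };   // stand-in for d1::reference_vertex

proxy_vertex* local_proxy(std::unordered_map<void*, proxy_vertex*>& map, void* shared_counter) {
    auto pos = map.find(shared_counter);
    if (pos != map.end()) return pos->second;            // fast path: proxy already cached

    constexpr std::size_t cap = 1000;                    // same bound as in the patch
    if (map.size() > cap) {                              // lazy clean-up of unused proxies
        for (auto it = map.begin(); it != map.end();) {
            if (it->second->num_children == 0) {
                delete it->second;
                it = map.erase(it);
            } else {
                ++it;
            }
        }
    }
    return map[shared_counter] = new proxy_vertex{};     // slow path: create and cache
}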
context_guard_helper context_guard; @@ -282,6 +288,11 @@ d1::task* task_dispatcher::local_wait_for_all(d1::task* t, Waiter& waiter ) { m_properties.outermost = false; m_properties.fifo_tasks_allowed = false; + if (!dl_guard.is_initially_registered) { + m_thread_data->my_arena->my_tc_client.get_pm_client()->register_thread(); + m_thread_data->my_is_registered = true; + } + t = get_critical_task(t, ed, isolation, critical_allowed); if (t && m_thread_data->my_inbox.is_idle_state(true)) { // The thread has a work to do. Therefore, marking its inbox as not idle so that diff --git a/src/tbb/tcm.h b/src/tbb/tcm.h index 05fe0434eb..66ee18a2f0 100644 --- a/src/tbb/tcm.h +++ b/src/tbb/tcm.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2023 Intel Corporation + Copyright (c) 2023-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -50,7 +50,8 @@ typedef struct _tcm_permit_flags_t { uint32_t stale : 1; uint32_t rigid_concurrency : 1; uint32_t exclusive : 1; - uint32_t reserved : 29; + uint32_t request_as_inactive : 1; + uint32_t reserved : 28; } tcm_permit_flags_t; typedef struct _tcm_callback_flags_t { diff --git a/src/tbb/tcm_adaptor.cpp b/src/tbb/tcm_adaptor.cpp index e20ebb831d..85ca125b4e 100644 --- a/src/tbb/tcm_adaptor.cpp +++ b/src/tbb/tcm_adaptor.cpp @@ -170,7 +170,7 @@ class tcm_client : public pm_client { __TBB_ASSERT_EX(res == TCM_RESULT_SUCCESS, nullptr); } - void init(d1::constraints& constraints) { + void init(tcm_client_id_t client_id, d1::constraints& constraints) { __TBB_ASSERT(tcm_request_permit, nullptr); __TBB_ASSERT(tcm_deactivate_permit, nullptr); @@ -190,6 +190,12 @@ class tcm_client : public pm_client { my_permit_request.min_sw_threads = 0; my_permit_request.max_sw_threads = 0; + my_permit_request.flags.request_as_inactive = 1; + + tcm_result_t res = tcm_request_permit(client_id, my_permit_request, this, &my_permit_handle, nullptr); + __TBB_ASSERT_EX(res == TCM_RESULT_SUCCESS, nullptr); + + my_permit_request.flags.request_as_inactive = 0; } void register_thread() override { @@ -279,7 +285,7 @@ pm_client* tcm_adaptor::create_client(arena& a) { } void tcm_adaptor::register_client(pm_client* c, d1::constraints& constraints) { - static_cast(c)->init(constraints); + static_cast(c)->init(my_impl->client_id, constraints); } void tcm_adaptor::unregister_and_destroy_client(pm_client& c) { diff --git a/src/tbb/thread_data.h b/src/tbb/thread_data.h index 9dfa492a72..422ec694ec 100644 --- a/src/tbb/thread_data.h +++ b/src/tbb/thread_data.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2023 Intel Corporation + Copyright (c) 2020-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -101,6 +101,7 @@ class thread_data : public ::rml::job thread_data(unsigned short index, bool is_worker) : my_arena_index{ index } , my_is_worker{ is_worker } + , my_is_registered { false } , my_task_dispatcher{ nullptr } , my_arena{ nullptr } , my_last_client{ nullptr } @@ -145,6 +146,8 @@ class thread_data : public ::rml::job //! Indicates if the thread is created by RML const bool my_is_worker; + bool my_is_registered; + //! 
The current task dispatcher task_dispatcher* my_task_dispatcher; diff --git a/src/tbb/thread_request_serializer.cpp b/src/tbb/thread_request_serializer.cpp index 6019f732b4..41cf51b0b0 100644 --- a/src/tbb/thread_request_serializer.cpp +++ b/src/tbb/thread_request_serializer.cpp @@ -100,13 +100,12 @@ void thread_request_serializer_proxy::set_active_num_workers(int soft_limit) { if (soft_limit != 0) { my_is_mandatory_concurrency_enabled = false; - my_serializer.set_active_num_workers(soft_limit); - } else { - if (my_num_mandatory_requests > 0 && !my_is_mandatory_concurrency_enabled) { - my_is_mandatory_concurrency_enabled = true; - my_serializer.set_active_num_workers(1); - } + } else if (my_num_mandatory_requests > 0) { + my_is_mandatory_concurrency_enabled = true; + soft_limit = 1; } + + my_serializer.set_active_num_workers(soft_limit); } int thread_request_serializer_proxy::num_workers_requested() { return my_serializer.num_workers_requested(); } diff --git a/src/tbb/threading_control.cpp b/src/tbb/threading_control.cpp index 1ca1837826..7a62b337f6 100644 --- a/src/tbb/threading_control.cpp +++ b/src/tbb/threading_control.cpp @@ -334,7 +334,12 @@ bool threading_control::try_destroy_client(threading_control::client_snapshot de } void threading_control::set_active_num_workers(unsigned soft_limit) { - threading_control* thr_control = get_threading_control(/*public = */ false); + threading_control* thr_control{nullptr}; + { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + thr_control = get_threading_control(/*public = */ false); + } + if (thr_control != nullptr) { thr_control->my_pimpl->set_active_num_workers(soft_limit); thr_control->release(/*is_public=*/false, /*blocking_terminate=*/false); diff --git a/src/tbbbind/tbb_bind.cpp b/src/tbbbind/tbb_bind.cpp index 50119e4e54..bb52e11517 100644 --- a/src/tbbbind/tbb_bind.cpp +++ b/src/tbbbind/tbb_bind.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2019-2023 Intel Corporation + Copyright (c) 2019-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -88,12 +88,15 @@ class system_topology { if ( hwloc_topology_init( &topology ) == 0 ) { initialization_state = topology_allocated; #if __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT - if ( groups_num == 1 && - hwloc_topology_set_flags(topology, - HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM | - HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING - ) != 0 - ) { + unsigned long flags = 0; + if (groups_num > 1) { + // HWLOC x86 backend might interfere with process affinity mask on + // Windows systems with multiple processor groups. + flags = HWLOC_TOPOLOGY_FLAG_DONT_CHANGE_BINDING; + } else { + flags = HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM | HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING; + } + if (hwloc_topology_set_flags(topology, flags) != 0) { return; } #endif diff --git a/src/tbbmalloc/frontend.cpp b/src/tbbmalloc/frontend.cpp index 77f9d6594e..c9aaf46337 100644 --- a/src/tbbmalloc/frontend.cpp +++ b/src/tbbmalloc/frontend.cpp @@ -817,6 +817,7 @@ unsigned int getSmallObjectIndex(unsigned int size) /* * Depending on indexRequest, for a given size return either the index into the bin * for objects of this size, or the actual size of objects in this bin. + * TODO: Change return type to unsigned short.
*/ template static unsigned int getIndexOrObjectSize (unsigned int size) @@ -1581,6 +1582,7 @@ void Block::initEmptyBlock(TLSData *tls, size_t size) unsigned int objSz = getObjectSize(size); cleanBlockHeader(); + MALLOC_ASSERT(objSz <= USHRT_MAX, "objSz must not exceed 2^16-1"); objectSize = objSz; markOwned(tls); // bump pointer should be prepared for first allocation - thus move it down to objectSize diff --git a/src/tbbmalloc/large_objects.h b/src/tbbmalloc/large_objects.h index 8519784267..58d7c81a7b 100644 --- a/src/tbbmalloc/large_objects.h +++ b/src/tbbmalloc/large_objects.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -81,18 +81,25 @@ struct HugeBinStructureProps { static size_t alignToBin(size_t size) { MALLOC_ASSERT(size >= StepFactor, "Size must not be less than the StepFactor"); - size_t minorStepExp = BitScanRev(size) - StepFactorExp; + + int sizeExp = (int)BitScanRev(size); + MALLOC_ASSERT(sizeExp >= 0, "BitScanRev() cannot return -1, as size >= StepFactor > 0"); + MALLOC_ASSERT(sizeExp >= StepFactorExp, "sizeExp >= StepFactorExp, because size >= StepFactor"); + int minorStepExp = sizeExp - StepFactorExp; + return alignUp(size, 1ULL << minorStepExp); } // Sizes between the power of 2 values are approximated to StepFactor. static int sizeToIdx(size_t size) { MALLOC_ASSERT(MinSize <= size && size <= MaxSize, ASSERT_TEXT); + int sizeExp = (int)BitScanRev(size); // same as __TBB_Log2 - MALLOC_ASSERT(sizeExp >= 0, "A shift amount (sizeExp) must not be negative"); - size_t majorStepSize = 1ULL << sizeExp; + MALLOC_ASSERT(sizeExp >= 0, "BitScanRev() cannot return -1, as size >= StepFactor > 0"); + MALLOC_ASSERT(sizeExp >= StepFactorExp, "sizeExp >= StepFactorExp, because size >= StepFactor"); int minorStepExp = sizeExp - StepFactorExp; - MALLOC_ASSERT(minorStepExp >= 0, "A shift amount (minorStepExp) must not be negative"); + + size_t majorStepSize = 1ULL << sizeExp; int minorIdx = (size - majorStepSize) >> minorStepExp; MALLOC_ASSERT(size == majorStepSize + ((size_t)minorIdx << minorStepExp), "Size is not aligned on the bin"); diff --git a/src/tbbmalloc/tbbmalloc_internal.h b/src/tbbmalloc/tbbmalloc_internal.h index 44fa47aaab..bc0ee2ffb5 100644 --- a/src/tbbmalloc/tbbmalloc_internal.h +++ b/src/tbbmalloc/tbbmalloc_internal.h @@ -232,9 +232,13 @@ template class BitMaskMax : public BitMaskBasic { public: void set(size_t idx, bool val) { + MALLOC_ASSERT(NUM >= idx + 1, ASSERT_TEXT); + BitMaskBasic::set(NUM - 1 - idx, val); } int getMaxTrue(unsigned startIdx) const { + MALLOC_ASSERT(NUM >= startIdx + 1, ASSERT_TEXT); + int p = BitMaskBasic::getMinTrue(NUM-startIdx-1); return -1==p? -1 : (int)NUM - 1 - p; } @@ -496,7 +500,11 @@ class HugePagesStatus { MALLOC_ASSERT(!pageSize, "Huge page size can't be set twice.
Double initialization."); // Initialize object variables - pageSize = hugePageSize * 1024; // was read in KB from meminfo + if (hugePageSize > -1) { + pageSize = hugePageSize * 1024; // was read in KB from meminfo + } else { + pageSize = 0; + } isHPAvailable = hpAvailable; isTHPAvailable = thpAvailable; } diff --git a/src/tbbmalloc_proxy/proxy.cpp b/src/tbbmalloc_proxy/proxy.cpp index 23b9c19c1c..954583ba5f 100644 --- a/src/tbbmalloc_proxy/proxy.cpp +++ b/src/tbbmalloc_proxy/proxy.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -431,14 +431,12 @@ void __TBB_malloc__free_base(void *ptr) const char* known_bytecodes[] = { #if _WIN64 // "========================================================" - 56 symbols + "E9********CCCC", // multiple - jmp(0xE9) with address followed by empty space (0xCC - INT 3) "4883EC284885C974", // release free() "4883EC284885C975", // release _msize() "4885C974375348", // release free() 8.0.50727.42, 10.0 - "E907000000CCCC", // release _aligned_msize(), _aligned_free() ucrtbase.dll "C7442410000000008B", // release free() ucrtbase.dll 10.0.14393.33 - "E90B000000CCCC", // release _msize() ucrtbase.dll 10.0.14393.33 "48895C24085748", // release _aligned_msize() ucrtbase.dll 10.0.14393.33 - "E903000000CCCC", // release _aligned_msize() ucrtbase.dll 10.0.16299.522 "48894C24084883EC28BA", // debug prologue "4C894424184889542410", // debug _aligned_msize() 10.0 "48894C24084883EC2848", // debug _aligned_free 10.0 @@ -602,8 +600,8 @@ _expand (by dummy implementation) ??_V@YAXPEAX@Z void * operator new[](unsigned __int64) (intel64) ??3@YAXPEAX@Z operator delete (intel64) ??_V@YAXPEAX@Z operator delete[] (intel64) -??2@YAPAXIABUnothrow_t@std@@@Z void * operator new (size_t sz, const std::nothrow_t&) throw() (optional) -??_U@YAPAXIABUnothrow_t@std@@@Z void * operator new[] (size_t sz, const std::nothrow_t&) throw() (optional) +??2@YAPAXIABUnothrow_t@std@@@Z void * operator new (size_t sz, const std::nothrow_t&) noexcept (optional) +??_U@YAPAXIABUnothrow_t@std@@@Z void * operator new[] (size_t sz, const std::nothrow_t&) noexcept (optional) and these functions have runtime-specific replacement: realloc diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index cfde681bd6..fb4a78bdbb 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -79,7 +79,8 @@ function(tbb_add_test) $<$:TBB_USE_DEBUG> $<$:__TBB_CPF_BUILD=1> $<$>:__TBB_DYNAMIC_LOAD_ENABLED=0> - $<$>:__TBB_SOURCE_DIRECTLY_INCLUDED=1>) + $<$>:__TBB_SOURCE_DIRECTLY_INCLUDED=1> + $<$:__TBB_TCM_TESTING_ENABLED=1>) target_link_libraries(${_tbb_test_TARGET_NAME} PRIVATE ${_tbb_test_DEPENDENCIES} Threads::Threads ${TBB_COMMON_LINK_LIBS}) @@ -380,7 +381,9 @@ if (TARGET TBB::tbb) # Define the tests tbb_add_test(SUBDIR tbb NAME test_tick_count DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR tbb NAME test_allocators DEPENDENCIES TBB::tbb) - tbb_add_test(SUBDIR tbb NAME test_arena_priorities DEPENDENCIES TBB::tbb) + if (NOT TBB_TCM_TESTING) + tbb_add_test(SUBDIR tbb NAME test_arena_priorities DEPENDENCIES TBB::tbb) + endif() tbb_add_test(SUBDIR tbb NAME test_dynamic_link DEPENDENCIES TBB::tbb) if (LINKER_HAS_NO_AS_NEEDED) # The linker may not detect a dependency on pthread in static variable constructors. 
@@ -389,7 +392,7 @@ if (TARGET TBB::tbb) if (APPLE OR ANDROID_PLATFORM) target_link_libraries(test_dynamic_link PRIVATE -rdynamic) endif() - if (WIN32) + if (WIN32 AND NOT TBB_TCM_TESTING) tbb_add_test(SUBDIR tbb NAME test_numa_dist DEPENDENCIES TBB::tbb) endif() tbb_add_test(SUBDIR tbb NAME test_collaborative_call_once DEPENDENCIES TBB::tbb) @@ -451,8 +454,23 @@ if (TARGET TBB::tbb) tbb_add_test(SUBDIR tbb NAME test_environment_whitebox DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR tbb NAME test_hw_concurrency DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR tbb NAME test_eh_thread DEPENDENCIES TBB::tbb) - tbb_add_test(SUBDIR tbb NAME test_global_control DEPENDENCIES TBB::tbb) + if (NOT TBB_TCM_TESTING) + tbb_add_test(SUBDIR tbb NAME test_global_control DEPENDENCIES TBB::tbb) + endif() tbb_add_test(SUBDIR tbb NAME test_task DEPENDENCIES TBB::tbb) + if (TBB_TCM_TESTING AND NOT WINDOWS_STORE AND NOT TBB_WINDOWS_DRIVER) + add_test(NAME test_tcm_enabled COMMAND test_task --force-colors=1 WORKING_DIRECTORY ${TBB_TEST_WORKING_DIRECTORY}) + set_tests_properties(test_tcm_enabled PROPERTIES + ENVIRONMENT "TBB_VERSION=1;TCM_ENABLE=1" + PASS_REGULAR_EXPRESSION "TCM: VERSION.*" + FAIL_REGULAR_EXPRESSION "TCM: TCM *disabled" + ) + add_test(NAME test_tcm_disabled COMMAND test_task --force-colors=1 WORKING_DIRECTORY ${TBB_TEST_WORKING_DIRECTORY}) + set_tests_properties(test_tcm_disabled PROPERTIES + ENVIRONMENT "TBB_VERSION=1;TCM_ENABLE=0" + PASS_REGULAR_EXPRESSION "TCM: TCM *disabled" + ) + endif() if (TBB_FUZZ_TESTING AND NOT WIN32) if (NOT ((CMAKE_CXX_COMPILER_ID STREQUAL Clang) OR (CMAKE_CXX_COMPILER_ID STREQUAL IntelLLVM))) @@ -521,7 +539,9 @@ if (TARGET TBB::tbb) tbb_add_test(SUBDIR conformance NAME conformance_blocked_range3d DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR conformance NAME conformance_blocked_rangeNd DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR conformance NAME conformance_concurrent_vector DEPENDENCIES TBB::tbb) - tbb_add_test(SUBDIR conformance NAME conformance_global_control DEPENDENCIES TBB::tbb) + if (NOT TBB_TCM_TESTING) + tbb_add_test(SUBDIR conformance NAME conformance_global_control DEPENDENCIES TBB::tbb) + endif() tbb_add_test(SUBDIR conformance NAME conformance_concurrent_hash_map DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR conformance NAME conformance_enumerable_thread_specific DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR conformance NAME conformance_combinable DEPENDENCIES TBB::tbb) diff --git a/test/common/exception_handling.h b/test/common/exception_handling.h index 55dbe0fc20..1d1b62c3ba 100644 --- a/test/common/exception_handling.h +++ b/test/common/exception_handling.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -102,7 +102,7 @@ class test_exception : public std::exception { public: test_exception ( const char* description ) : my_description(description) {} - const char* what() const throw() override { return my_description; } + const char* what() const noexcept override { return my_description; } }; class solitary_test_exception : public test_exception { diff --git a/test/common/graph_utils.h b/test/common/graph_utils.h index 24814d5fd3..2c2099f6df 100644 --- a/test/common/graph_utils.h +++ b/test/common/graph_utils.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -35,7 +35,7 @@ #include "common/spin_barrier.h" -using tbb::detail::d1::SUCCESSFULLY_ENQUEUED; +using tbb::detail::d2::SUCCESSFULLY_ENQUEUED; // Needed conversion to and from continue_msg, but didn't want to add // conversion operators to the class, since we don't want it in general, @@ -277,11 +277,17 @@ struct harness_counting_receiver : public tbb::flow::receiver { return my_graph; } - tbb::detail::d1::graph_task *try_put_task( const T & ) override { + tbb::detail::d2::graph_task *try_put_task( const T & ) override { ++my_count; - return const_cast(SUCCESSFULLY_ENQUEUED); + return const_cast(SUCCESSFULLY_ENQUEUED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + tbb::detail::d2::graph_task *try_put_task( const T &t, const tbb::detail::d2::message_metainfo& ) override { + return try_put_task(t); + } +#endif + void validate() { size_t n = my_count; CHECK( n == num_copies*max_value ); @@ -323,14 +329,20 @@ struct harness_mapped_receiver : public tbb::flow::receiver { my_multiset = new multiset_type; } - tbb::detail::d1::graph_task* try_put_task( const T &t ) override { + tbb::detail::d2::graph_task* try_put_task( const T &t ) override { if ( my_multiset ) { (*my_multiset).emplace( t ); } else { ++my_count; } - return const_cast(SUCCESSFULLY_ENQUEUED); + return const_cast(SUCCESSFULLY_ENQUEUED); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + tbb::detail::d2::graph_task *try_put_task( const T &t, const tbb::detail::d2::message_metainfo& ) override { + return try_put_task(t); } +#endif tbb::flow::graph& graph_reference() const override { return my_graph; @@ -404,6 +416,12 @@ struct harness_counting_sender : public tbb::flow::sender { } } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_get( T & v, tbb::detail::d2::message_metainfo& ) override { + return try_get(v); + } +#endif + bool try_put_once() { successor_type *s = my_receiver; size_t i = my_count++; @@ -842,7 +860,7 @@ struct throwing_body{ if(my_counter == Threshold) throw Threshold; } - + template output_tuple_type operator()(const input_type&) { ++my_counter; diff --git a/test/common/utils_dynamic_libs.h b/test/common/utils_dynamic_libs.h index 5e5365fc8f..99afca3840 100644 --- a/test/common/utils_dynamic_libs.h +++ b/test/common/utils_dynamic_libs.h @@ -58,7 +58,7 @@ namespace utils { #define EXT ".dylib" #endif // Android SDK build system does not support .so file name versioning -#elif __FreeBSD__ || __NetBSD__ || __sun || _AIX || __ANDROID__ +#elif __FreeBSD__ || __NetBSD__ || __OpenBSD__ || __sun || _AIX || __ANDROID__ #define EXT ".so" #elif __unix__ // Order of these elif's matters! 
#define EXT __TBB_STRING(.so.2) diff --git a/test/conformance/conformance_concurrent_hash_map.cpp b/test/conformance/conformance_concurrent_hash_map.cpp index 0c3ec6e93a..889739b9d0 100644 --- a/test/conformance/conformance_concurrent_hash_map.cpp +++ b/test/conformance/conformance_concurrent_hash_map.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -35,8 +35,8 @@ that concurrent_hash_map uses only the required interface. */ class MyException : public std::bad_alloc { public: - virtual const char *what() const throw() override { return "out of items limit"; } - virtual ~MyException() throw() {} + virtual const char *what() const noexcept override { return "out of items limit"; } + virtual ~MyException() noexcept {} }; /** Has tightly controlled interface so that we can verify diff --git a/test/conformance/conformance_concurrent_queue.cpp b/test/conformance/conformance_concurrent_queue.cpp index 32c1652e94..9bda9a4613 100644 --- a/test/conformance/conformance_concurrent_queue.cpp +++ b/test/conformance/conformance_concurrent_queue.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -777,8 +777,8 @@ void TestConcurrentPushPop() { class Foo_exception : public std::bad_alloc { public: - virtual const char *what() const throw() override { return "out of Foo limit"; } - virtual ~Foo_exception() throw() {} + virtual const char *what() const noexcept override { return "out of Foo limit"; } + virtual ~Foo_exception() noexcept {} }; #if TBB_USE_EXCEPTIONS diff --git a/test/conformance/conformance_global_control.cpp b/test/conformance/conformance_global_control.cpp index 578ae78019..250fda906b 100644 --- a/test/conformance/conformance_global_control.cpp +++ b/test/conformance/conformance_global_control.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include "common/spin_barrier.h" #include "common/utils.h" #include "common/utils_concurrency_limit.h" +#include "common/cpu_usertime.h" #include "oneapi/tbb/global_control.h" #include "oneapi/tbb/parallel_for.h" @@ -347,6 +348,30 @@ TEST_CASE("simple prolong lifetime 3") { tbb::parallel_for(0, 10, utils::DummyBody()); } +//! \brief \ref regression \ref interface \ref requirement +TEST_CASE("Test worker threads remain inactive in enforced serial execution mode") { + auto num_threads = utils::get_platform_max_threads(); + utils::SpinBarrier barrier{num_threads}; + + // Warm-up threads + tbb::parallel_for(std::size_t(0), num_threads, [&] (std::size_t) { + barrier.wait(); + }); + + tbb::global_control control(tbb::global_control::max_allowed_parallelism, 1); + + std::thread thr([&] { + tbb::parallel_for(0, 100000, [&] (int) { + utils::doDummyWork(100); + }); + }); + + // Workers should sleep because of global_control enforced serial execution of tasks + TestCPUUserTime(utils::get_platform_max_threads() - 1); + + thr.join(); +} + // The test cannot work correctly with statically linked runtime. 
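A hedged illustration (not from this patch) of the guarantee the new test above asserts: with max_allowed_parallelism limited to 1, worker threads stay idle and every iteration runs on the single permitted thread, which is the calling thread.

#include <oneapi/tbb/global_control.h>
#include <oneapi/tbb/parallel_for.h>
#include <thread>
#include <iostream>

int main() {
    tbb::global_control serial(tbb::global_control::max_allowed_parallelism, 1);

    std::thread::id main_id = std::this_thread::get_id();
    tbb::parallel_for(0, 1000, [&](int) {
        // Every iteration should observe the same thread id here.
        if (std::this_thread::get_id() != main_id)
            std::cout << "unexpected worker participation\n";
    });
    return 0;
}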
// TODO: investigate a failure in debug with MSVC #if (!_MSC_VER || (defined(_DLL) && !defined(_DEBUG))) && !EMSCRIPTEN diff --git a/test/conformance/conformance_graph.cpp b/test/conformance/conformance_graph.cpp index 3492660783..1f73999ff7 100644 --- a/test/conformance/conformance_graph.cpp +++ b/test/conformance/conformance_graph.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2021 Intel Corporation + Copyright (c) 2020-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -158,6 +158,8 @@ void test_join_node_rf_reset_protocol(){ CHECK_MESSAGE((!testing_node.try_get(tmp)), "All buffers must be emptied"); } +// global_control::max_allowed_parallelism functionality is not covered by TCM +#if !__TBB_TCM_TESTING_ENABLED //! Graph reset //! \brief \ref requirement TEST_CASE("graph reset with rf_reset_protocol") { @@ -179,6 +181,7 @@ TEST_CASE("graph reset with rf_reset_protocol") { test_limiter_node_rf_reset_protocol(); test_join_node_rf_reset_protocol(); } +#endif //! Graph reset rf_clear_edges //! \brief \ref requirement diff --git a/test/tbb/test_broadcast_node.cpp b/test/tbb/test_broadcast_node.cpp index b3905e6d60..662a08331d 100644 --- a/test/tbb/test_broadcast_node.cpp +++ b/test/tbb/test_broadcast_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -29,7 +29,7 @@ //! \brief Test for [flow_graph.broadcast_node] specification -#define TBB_INTERNAL_NAMESPACE detail::d1 +#define TBB_INTERNAL_NAMESPACE detail::d2 namespace tbb { using task = TBB_INTERNAL_NAMESPACE::graph_task; } @@ -73,6 +73,12 @@ class counting_array_receiver : public tbb::flow::receiver { return const_cast(SUCCESSFULLY_ENQUEUED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + tbb::task * try_put_task( const T &v, const tbb::detail::d2::message_metainfo& ) override { + return try_put_task(v); + } +#endif + tbb::flow::graph& graph_reference() const override { return my_graph; } @@ -241,6 +247,166 @@ void test_deduction_guides() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +// Basic idea of the following tests is to check that try_put_and_wait(message) call for broadcast_node +// processes all of the previous jobs required to process message, the message itself, but does +// not process the elements submitted later or not required to process the message +// These tests submit start_work_items using the regular try_put and then submit wait_message +// with try_put_and_wait. During the completion of the graph, new_work_items would be submitted +// once the wait_message arrives. 
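Before the tests themselves, a minimal sketch of the preview API they exercise (guarded by __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT): try_put_and_wait(msg) blocks until msg itself has been fully processed through the graph, without waiting for unrelated work submitted via plain try_put:

#include <oneapi/tbb/flow_graph.h>

int main() {
    tbb::flow::graph g;
    tbb::flow::broadcast_node<int> b(g);
    tbb::flow::function_node<int, int> f(g, tbb::flow::serial,
        [](int v) { return v; });
    tbb::flow::make_edge(b, f);

    b.try_put(1);          // regular submission; completion not awaited here
#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
    b.try_put_and_wait(2); // returns once message 2 is processed
#endif
    g.wait_for_all();      // drains everything else
    return 0;
}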
+void test_try_put_and_wait_spawning_and_serial_receiver() { + tbb::task_arena arena(1); + + arena.execute([&]{ + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items_unlimited, processed_items_serial; + std::vector new_work_items; + + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + tbb::flow::broadcast_node broadcast(g); + + // Broadcast to 2 function_nodes, one with unlimited concurrency and the other serial + tbb::flow::function_node unlimited(g, tbb::flow::unlimited, + [&](int input) noexcept { + if (input == wait_message) { + for (auto item : new_work_items) { + broadcast.try_put(item); + } + } + processed_items_unlimited.emplace_back(input); + return 0; + }); + tbb::flow::make_edge(broadcast, unlimited); + + tbb::flow::function_node serial(g, tbb::flow::serial, + [&](int input) noexcept { + processed_items_serial.emplace_back(input); + return 0; + }); + tbb::flow::make_edge(broadcast, serial); + + for (int i = 0; i < wait_message; ++i) { + broadcast.try_put(i); + } + + broadcast.try_put_and_wait(wait_message); + + size_t unlimited_check_index = 0, serial_check_index = 0; + + // For the unlimited function_node, all of the tasks for start_work_items and wait_message would be spawned + // and hence processed by the thread in LIFO order. + // The first processed item is expected to be wait_message since it was spawned last + CHECK_MESSAGE(processed_items_unlimited.size() == new_work_items.size() + start_work_items.size(), + "Unexpected number of processed items"); + CHECK_MESSAGE(processed_items_unlimited[unlimited_check_index++] == wait_message, "Unexpected items processing"); + for (int i = int(new_work_items.size()) - 1; i >= 0; --i) { + CHECK_MESSAGE(processed_items_unlimited[unlimited_check_index++] == new_work_items[i], "Unexpected items processing"); + } + for (int i = int(start_work_items.size()) - 1; i >= 1; --i) { + CHECK_MESSAGE(processed_items_unlimited[unlimited_check_index++] == start_work_items[i], "Unexpected items processing"); + } + + // Serial queueing function_node should add all start_work_items except the first one into the queue + // and then process them in FIFO order. 
+        // wait_message would also be added to the queue, but would be processed later
+        CHECK_MESSAGE(processed_items_serial.size() == start_work_items.size() + 1,
+                      "Unexpected number of processed items");
+        for (auto item : start_work_items) {
+            CHECK_MESSAGE(processed_items_serial[serial_check_index++] == item, "Unexpected items processing");
+        }
+        CHECK_MESSAGE(processed_items_serial[serial_check_index++] == wait_message, "Unexpected items processing");
+
+        g.wait_for_all();
+
+        CHECK_MESSAGE(processed_items_unlimited[unlimited_check_index++] == start_work_items[0], "Unexpected items processing");
+
+        // For serial queueing function_node, the new_work_items are expected to be processed while calling to wait_for_all
+        // They would be queued and processed later in FIFO order
+        for (auto item : new_work_items) {
+            CHECK_MESSAGE(processed_items_serial[serial_check_index++] == item, "Unexpected items processing");
+        }
+        CHECK(serial_check_index == processed_items_serial.size());
+        CHECK(unlimited_check_index == processed_items_unlimited.size());
+    });
+}
+
+void test_try_put_and_wait_spawning_receivers() {
+    tbb::task_arena arena(1);
+
+    arena.execute([&]{
+        tbb::flow::graph g;
+
+        int wait_message = 10;
+        int num_successors = wait_message - 1;
+
+        std::vector<int> start_work_items;
+        std::vector<std::vector<int>> processed_items(num_successors);
+        std::vector<int> new_work_items;
+
+        for (int i = 0; i < wait_message; ++i) {
+            start_work_items.emplace_back(i);
+            new_work_items.emplace_back(i + 1 + wait_message);
+        }
+
+        tbb::flow::broadcast_node<int> broadcast(g);
+
+        std::vector<tbb::flow::function_node<int, int>> successors;
+        successors.reserve(num_successors);
+        for (int i = 0; i < num_successors; ++i) {
+            successors.emplace_back(g, tbb::flow::unlimited,
+                [&, i](int input) noexcept {
+                    if (input == wait_message) {
+                        broadcast.try_put(new_work_items[i]);
+                    }
+                    processed_items[i].emplace_back(input);
+                    return 0;
+                });
+            tbb::flow::make_edge(broadcast, successors.back());
+        }
+
+        for (int i = 0; i < wait_message; ++i) {
+            broadcast.try_put(i);
+        }
+
+        broadcast.try_put_and_wait(wait_message);
+
+        for (int i = num_successors - 1; i >= 0; --i) {
+            size_t check_index = 0;
+            for (int j = num_successors - 1; j != i; --j) {
+                CHECK_MESSAGE(processed_items[i][check_index++] == new_work_items[j], "Unexpected items processing");
+            }
+            CHECK_MESSAGE(processed_items[i][check_index++] == wait_message, "Unexpected items processing");
+            for (int j = i; j >= 1; --j) {
+                CHECK_MESSAGE(processed_items[i][check_index++] == new_work_items[j], "Unexpected items processing");
+            }
+        }
+
+        g.wait_for_all();
+
+        for (auto& processed_item : processed_items) {
+            size_t check_index = num_successors;
+            CHECK_MESSAGE(processed_item[check_index++] == new_work_items[0], "Unexpected items processing");
+            for (int i = int(start_work_items.size()) - 1; i >= 0; --i) {
+                CHECK_MESSAGE(processed_item[check_index++] == start_work_items[i], "Unexpected items processing");
+            }
+        }
+    });
+}
+
+void test_try_put_and_wait() {
+    test_try_put_and_wait_spawning_and_serial_receiver();
+    test_try_put_and_wait_spawning_receivers();
+}
+#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
+
 //! Test serial broadcasts
 //! \brief \ref error_guessing
 TEST_CASE("Serial broadcasts"){
@@ -282,3 +448,9 @@ TEST_CASE("Deduction guides"){
 }
 #endif
+
+#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
+//!
\brief \ref error_guessing +TEST_CASE("test broadcast_node try_put_and_wait") { + test_try_put_and_wait(); +} +#endif diff --git a/test/tbb/test_buffer_node.cpp b/test/tbb/test_buffer_node.cpp index 89f4485b3d..527005aecb 100644 --- a/test/tbb/test_buffer_node.cpp +++ b/test/tbb/test_buffer_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -24,11 +24,11 @@ #include "common/graph_utils.h" #include "common/test_follows_and_precedes_api.h" +#include "test_buffering_try_put_and_wait.h" //! \file test_buffer_node.cpp //! \brief Test for [flow_graph.buffer_node] specification - #define N 1000 #define C 10 @@ -307,7 +307,7 @@ int test_parallel(int num_threads) { // Chained buffers ( 2 & 3 ), single sender, items at last buffer in arbitrary order // -#define TBB_INTERNAL_NAMESPACE detail::d1 +#define TBB_INTERNAL_NAMESPACE detail::d2 using tbb::TBB_INTERNAL_NAMESPACE::register_predecessor; using tbb::TBB_INTERNAL_NAMESPACE::remove_predecessor; @@ -455,6 +455,161 @@ void test_deduction_guides() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_buffer_node_try_put_and_wait() { + using namespace test_try_put_and_wait; + + std::vector start_work_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + // Test push + // test_buffer_push tests the graph + // buffer1 -> function -> buffer2 -> writer + // function is a queueing serial function_node that submits new_work_items once wait_message arrives + // writer is an unlimited function_node that writes an item into the processed_items vector + // Test steps + // 1. push start_work_items into the buffer1 + // 2. buffer1.try_put_and_wait(wait_message); + // 3. g.wait_for_all() + // test_buffer_push returns the index from which the items processed during wait_for_all() starts + { + std::vector processed_items; + + std::size_t after_start = test_buffer_push>(start_work_items, wait_message, + new_work_items, processed_items); + + // Expected effect: + // During buffer1.try_put_and_wait() + // 1. start_work_items would be pushed to buffer1 + // 2. wait_message would be pushed to buffer1 + // 3. forward_task on buffer1 would transfer all of the items to the function_node in LIFO order + // 4. wait_message would occupy concurrency of function, other items would be pushed to the queue + // 5. function would process wait_message and add new_work_items to the buffer1 + // 6. forward_task for new_work_items would be spawned, wait_message would be buffered in the buffer2 + // 7. function task for next FIFO item in the queue would be spawned + // 8. forward_task for wait_message in buffer2 would be executed without spawning + // 9. writer task for wait_message would be executed without spawning and write wait_message to the buffer + // 10. try_put_and_wait exits since wait_message is completed + // During g.wait_for_all() + // 10. forward_task for new_work_items in buffer1 would be spawned and put items in function in LIFO order + // 11. 
function_node would process and push forward items from the queue in FIFO order + // Expected items processing - { wait_message, start_work_items LIFO, new_work_items LIFO } + + std::size_t check_index = 0; + CHECK_MESSAGE(after_start == 1, "try_put_and_wait should process only the wait_message"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "try_put_and_wait should process only the wait_message"); + + for (std::size_t index = start_work_items.size(); index != 0; --index) { + CHECK_MESSAGE(processed_items[check_index++] == start_work_items[index - 1], + "wait_for_all should process start_work_items LIFO"); + } + for (std::size_t index = new_work_items.size(); index != 0; --index) { + CHECK_MESSAGE(processed_items[check_index++] == new_work_items[index - 1], + "wait_for_all should process new_work_items LIFO"); + } + CHECK(check_index == processed_items.size()); + } // Test push + + // Test pull + // test_buffer_pull tests the graph + // buffer -> function + // function is a rejecting serial function_node that submits new_work_items once wait_message arrives + // and writes the processed item into the processed_items + // Test steps + // 1. push the occupier message to the function + // 2. push start_work_items into the buffer + // 3. buffer.try_put_and_wait(wait_message) + // 4. g.wait_for_all() + // test_buffer_pull returns the index from which the items processed during wait_for_all() starts + + { + std::vector processed_items; + int occupier = 42; + + std::size_t after_start = test_buffer_pull>(start_work_items, wait_message, occupier, + new_work_items, processed_items); + + // Expected effect + // 0. task for occupier processing would be spawned by the function + // During buffer.try_put_and_wait() + // 1. start_work_items would be pushed to the buffer + // 2. wait_message would be pushed to the buffer + // 3. forward_task would try to push items to the function, but would fail + // and set the edge to the pull state + // 4. occupier would be processed + // 5. items would be taken from the buffer by function in LIFO order + // 6. 
wait_message would be taken first and push new_work_items to the buffer
+        // Expected items processing { occupier, wait_message, new_work_items LIFO, start_work_items LIFO }
+
+        std::size_t check_index = 0;
+
+        CHECK_MESSAGE(after_start == 2, "Only wait_message and occupier should be processed by try_put_and_wait");
+        CHECK_MESSAGE(processed_items[check_index++] == occupier, "Unexpected items processing by try_put_and_wait");
+        CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing by try_put_and_wait");
+
+        for (std::size_t index = new_work_items.size(); index != 0; --index) {
+            CHECK_MESSAGE(processed_items[check_index++] == new_work_items[index - 1],
+                          "wait_for_all should process new_work_items LIFO");
+        }
+        for (std::size_t index = start_work_items.size(); index != 0; --index) {
+            CHECK_MESSAGE(processed_items[check_index++] == start_work_items[index - 1],
+                          "wait_for_all should process start_work_items LIFO");
+        }
+        CHECK(check_index == processed_items.size());
+    }
+
+    // Test reserve
+    {
+        int thresholds[] = { 1, 2 };
+
+        for (int threshold : thresholds) {
+            std::vector<int> processed_items;
+
+            // test_buffer_reserve tests the following graph
+            // buffer -> limiter -> function
+            // function is a rejecting serial function_node that puts an item to the decrementer port
+            // of the limiter inside of the body
+
+            std::size_t after_start = test_buffer_reserve<tbb::flow::buffer_node<int>>(threshold,
+                start_work_items, wait_message, new_work_items, processed_items);
+
+            // Expected effect:
+            // 1. start_work_items would be pushed to the buffer
+            // 2. wait_message would be pushed to the buffer
+            // 3. forward task of the buffer would push wait_message to the limiter node.
+            //    Since the limiter threshold is not reached, it would be directly passed to the function
+            // 4. function would spawn the task for wait_message processing
+            // 5. wait_message would be processed, adding new_work_items to the buffer
+            // 6. decrementer.try_put() would be called and the limiter node would
+            //    process all of the items from the buffer using the try_reserve/try_consume/try_release semantics
+            //    Since the reservation always accepts the front element of the buffer,
+            //    it is expected that the items would be taken from the buffer in FIFO order,
+            //    instead of the LIFO order try_get on a buffer_node would give
+
+            std::size_t check_index = 0;
+
+            CHECK_MESSAGE(after_start == 1, "try_put_and_wait should process only wait_message");
+            CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing");
+
+            for (auto item : start_work_items) {
+                CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing");
+            }
+
+            for (auto item : new_work_items) {
+                CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected new_work_items processing");
+            }
+
+        }
+    }
+}
+#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
+
 #include

 //! Test buffer_node with parallel and serial neighbours
@@ -489,8 +644,15 @@ TEST_CASE("Follows and precedes API"){

 #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
 //! Test deduction guides
-//! \brief requirement
+//! \brief \ref requirement
 TEST_CASE("Deduction guides"){
     test_deduction_guides();
 }
 #endif
+
+#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
+//!
\brief \ref error_guessing +TEST_CASE("test buffer_node try_put_and_wait") { + test_buffer_node_try_put_and_wait(); +} +#endif diff --git a/test/tbb/test_buffering_try_put_and_wait.h b/test/tbb/test_buffering_try_put_and_wait.h new file mode 100644 index 0000000000..300521233f --- /dev/null +++ b/test/tbb/test_buffering_try_put_and_wait.h @@ -0,0 +1,189 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_test_tbb_buffering_try_put_and_wait_H +#define __TBB_test_tbb_buffering_try_put_and_wait_H + +#include +#include + +#include + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + +namespace test_try_put_and_wait { + +template +std::size_t test_buffer_push(const std::vector& start_work_items, + int wait_message, + const std::vector& new_work_items, + std::vector& processed_items, + Args... args) +{ + std::size_t after_try_put_and_wait_start_index = 0; + tbb::task_arena arena(1); + + arena.execute([&] { + tbb::flow::graph g; + + using function_node_type = tbb::flow::function_node; + + BufferingNode buffer1(g, args...); + + function_node_type function(g, tbb::flow::serial, + [&](int input) noexcept { + if (input == wait_message) { + for (auto item : new_work_items) { + buffer1.try_put(item); + } + } + return input; + }); + + BufferingNode buffer2(g, args...); + + function_node_type writer(g, tbb::flow::unlimited, + [&](int input) noexcept { + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(buffer1, function); + tbb::flow::make_edge(function, buffer2); + tbb::flow::make_edge(buffer2, writer); + + for (auto item : start_work_items) { + buffer1.try_put(item); + } + + buffer1.try_put_and_wait(wait_message); + + after_try_put_and_wait_start_index = processed_items.size(); + + g.wait_for_all(); + }); + + return after_try_put_and_wait_start_index; +} + +template +std::size_t test_buffer_pull(const std::vector& start_work_items, + int wait_message, + int occupier, + const std::vector& new_work_items, + std::vector& processed_items, + Args... 
args) +{ + tbb::task_arena arena(1); + std::size_t after_try_put_and_wait_start_index = 0; + + arena.execute([&] { + tbb::flow::graph g; + + using function_node_type = tbb::flow::function_node; + + BufferingNode buffer(g, args...); + + function_node_type function(g, tbb::flow::serial, + [&](int input) noexcept { + if (input == wait_message) { + for (auto item : new_work_items) { + buffer.try_put(item); + } + } + + processed_items.emplace_back(input); + return 0; + }); + + // Occupy the concurrency of function_node + // This call spawns the task to process the occupier + function.try_put(occupier); + + // Make edge between buffer and function after occupying the concurrency + // To ensure that forward task of the buffer would be spawned after the occupier task + // And the function_node would reject the items from the buffer + // and process them later by calling try_get on the buffer + tbb::flow::make_edge(buffer, function); + + for (auto item : start_work_items) { + buffer.try_put(item); + } + + buffer.try_put_and_wait(wait_message); + + after_try_put_and_wait_start_index = processed_items.size(); + + g.wait_for_all(); + }); + + return after_try_put_and_wait_start_index; +} + +template +std::size_t test_buffer_reserve(std::size_t limiter_threshold, + const std::vector& start_work_items, + int wait_message, + const std::vector& new_work_items, + std::vector& processed_items, + Args... args) +{ + tbb::task_arena arena(1); + std::size_t after_try_put_and_wait_start_index = 0; + + arena.execute([&] { + tbb::flow::graph g; + + BufferingNode buffer(g, args...); + + tbb::flow::limiter_node limiter(g, limiter_threshold); + tbb::flow::function_node function(g, tbb::flow::serial, + [&](int input) { + if (input == wait_message) { + for (auto item : new_work_items) { + buffer.try_put(item); + } + } + // Explicitly put to the decrementer instead of making edge + // to guarantee that the next task would be spawned and not returned + // to the current thread as the next task + // Otherwise, all elements would be processed during the try_put_and_wait + limiter.decrementer().try_put(1); + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(buffer, limiter); + tbb::flow::make_edge(limiter, function); + + for (auto item : start_work_items) { + buffer.try_put(item); + } + + buffer.try_put_and_wait(wait_message); + + after_try_put_and_wait_start_index = processed_items.size(); + + g.wait_for_all(); + }); + + return after_try_put_and_wait_start_index; +} + +} // test_try_put_and_wait + +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +#endif // __TBB_test_tbb_buffering_try_put_and_wait_H diff --git a/test/tbb/test_continue_node.cpp b/test/tbb/test_continue_node.cpp index 8c2c5c5bb9..1cfea3df43 100644 --- a/test/tbb/test_continue_node.cpp +++ b/test/tbb/test_continue_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
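The helpers defined in test_buffering_try_put_and_wait.h above (test_buffer_push, test_buffer_pull, test_buffer_reserve) are parameterized by the buffering node type, so any node constructible from (graph&, args...) can reuse them. A condensed usage sketch, assuming tbb::flow::queue_node<int> as the node under test and the includes these tests already pull in:

    std::vector<int> start = {0, 1, 2};      // processed before the waited message
    std::vector<int> extra = {11, 12, 13};   // injected once wait_message arrives
    std::vector<int> log;                    // filled by the writer node
    std::size_t first_after_wait = test_try_put_and_wait::test_buffer_push<tbb::flow::queue_node<int>>(
        start, /*wait_message*/ 10, extra, log);
    // Entries log[0 .. first_after_wait - 1] completed inside try_put_and_wait();
    // the remaining entries completed during the final wait_for_all().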
@@ -63,7 +63,7 @@ template< typename OutputType > void run_continue_nodes( int p, tbb::flow::graph& g, tbb::flow::continue_node< OutputType >& n ) { fake_continue_sender fake_sender; for (size_t i = 0; i < N; ++i) { - tbb::detail::d1::register_predecessor(n, fake_sender); + tbb::detail::d2::register_predecessor(n, fake_sender); } for (size_t num_receivers = 1; num_receivers <= MAX_NODES; ++num_receivers ) { @@ -138,7 +138,7 @@ void continue_nodes_with_copy( ) { tbb::flow::continue_node< OutputType > exe_node( g, cf ); fake_continue_sender fake_sender; for (size_t i = 0; i < N; ++i) { - tbb::detail::d1::register_predecessor(exe_node, fake_sender); + tbb::detail::d2::register_predecessor(exe_node, fake_sender); } for (size_t num_receivers = 1; num_receivers <= MAX_NODES; ++num_receivers ) { @@ -354,6 +354,176 @@ void test_successor_cache_specialization() { "Wrong number of messages is passed via continue_node"); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_try_put_and_wait_default() { + tbb::task_arena arena(1); + + arena.execute([&]{ + tbb::flow::graph g; + + int processed_items = 0; + + tbb::flow::continue_node* start_node = nullptr; + + tbb::flow::continue_node cont(g, + [&](tbb::flow::continue_msg) noexcept { + static bool put_ten_msgs = true; + if (put_ten_msgs) { + for (std::size_t i = 0; i < 10; ++i) { + start_node->try_put(tbb::flow::continue_msg{}); + } + put_ten_msgs = false; + } + }); + + start_node = &cont; + + tbb::flow::continue_node writer(g, + [&](tbb::flow::continue_msg) noexcept { + ++processed_items; + }); + + tbb::flow::make_edge(cont, writer); + + cont.try_put_and_wait(tbb::flow::continue_msg{}); + + // Only 1 item should be processed, with the additional 10 items having been spawned + CHECK_MESSAGE(processed_items == 1, "Unexpected items processing"); + + g.wait_for_all(); + + // The additional 10 items should be processed + CHECK_MESSAGE(processed_items == 11, "Unexpected items processing"); + }); +} + +void test_try_put_and_wait_lightweight() { + tbb::task_arena arena(1); + + arena.execute([&]{ + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + tbb::flow::continue_node* start_node = nullptr; + + tbb::flow::continue_node cont(g, + [&](tbb::flow::continue_msg) noexcept { + static int counter = 0; + int i = counter++; + if (i == wait_message) { + for (auto item : new_work_items) { + (void)item; + start_node->try_put(tbb::flow::continue_msg{}); + } + } + return i; + }); + + start_node = &cont; + + tbb::flow::function_node writer(g, tbb::flow::unlimited, + [&](int input) noexcept { + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(cont, writer); + + for (auto item : start_work_items) { + (void)item; + cont.try_put(tbb::flow::continue_msg{}); + } + + cont.try_put_and_wait(tbb::flow::continue_msg{}); + + CHECK_MESSAGE(processed_items.size() == start_work_items.size() + new_work_items.size() + 1, + "Unexpected number of elements processed"); + + std::size_t check_index = 0; + + // For lightweight continue_node, start_work_items are expected to be processed first + // while putting items into the first node + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing"); + } + + for (auto item : new_work_items) { + 
CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing");
+        }
+        // wait_message would be processed only after new_work_items
+        CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing");
+
+        g.wait_for_all();
+
+        CHECK(check_index == processed_items.size());
+    });
+}
+
+void test_metainfo_buffering() {
+    tbb::task_arena arena(1);
+
+    arena.execute([&] {
+        tbb::flow::graph g;
+
+        std::vector<char> call_order;
+
+        tbb::flow::continue_node<tbb::flow::continue_msg>* b_ptr = nullptr;
+
+        tbb::flow::continue_node<tbb::flow::continue_msg> a(g,
+            [&](tbb::flow::continue_msg) noexcept {
+                call_order.push_back('A');
+                static std::once_flag flag; // Send a signal to B only in the first call
+                std::call_once(flag, [&]{ b_ptr->try_put(tbb::flow::continue_msg{}); });
+            });
+
+        tbb::flow::continue_node<tbb::flow::continue_msg> b(g,
+            [&](tbb::flow::continue_msg) noexcept {
+                call_order.push_back('B');
+                a.try_put(tbb::flow::continue_msg{});
+            });
+
+        b_ptr = &b;
+
+        tbb::flow::continue_node<tbb::flow::continue_msg, tbb::flow::lightweight> c(g,
+            [&](tbb::flow::continue_msg) noexcept {
+                call_order.push_back('C');
+            });
+
+        tbb::flow::make_edge(a, c);
+        tbb::flow::make_edge(b, c);
+
+        a.try_put_and_wait(tbb::flow::continue_msg{});
+
+        // Inside the first call of A, we send a signal to B.
+        // Both of them send signals to C. Since C is lightweight, it is processed immediately
+        // upon receiving signals from both predecessors. This completes the wait.
+        CHECK(call_order == std::vector<char>{'A', 'B', 'C'});
+
+        g.wait_for_all();
+
+        // B previously sent a signal to A, which has now been processed.
+        // A sends a signal to C, which is not processed because no signal is received from B this time.
+        CHECK(call_order == std::vector<char>{'A', 'B', 'C', 'A'});
+    });
+}
+
+void test_try_put_and_wait() {
+    test_try_put_and_wait_default();
+    test_try_put_and_wait_lightweight();
+    test_metainfo_buffering();
+}
+#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
+
 //! Test concurrent continue_node for correctness
 //! \brief \ref error_guessing
 TEST_CASE("Concurrency testing") {
@@ -418,3 +588,10 @@ TEST_CASE("constraints for continue_node body") {
     static_assert(!can_call_continue_node_ctor>);
 }
 #endif // __TBB_CPP20_CONCEPTS_PRESENT
+
+#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
+//! \brief \ref error_guessing
+TEST_CASE("test continue_node try_put_and_wait") {
+    test_try_put_and_wait();
+}
+#endif
diff --git a/test/tbb/test_eh_flow_graph.cpp b/test/tbb/test_eh_flow_graph.cpp
index 015d196eaf..160efe90df 100644
--- a/test/tbb/test_eh_flow_graph.cpp
+++ b/test/tbb/test_eh_flow_graph.cpp
@@ -482,6 +482,7 @@ void
 run_one_functype_node_test(bool throwException, bool flog, const char * /*name*/) {
     std::stringstream ss;
+    std::string ss_str;
     char *saved_msg = const_cast<char*>(g_Wakeup_Msg);
     tbb::flow::graph g;
@@ -511,7 +512,8 @@ run_one_functype_node_test(bool throwException, bool flog, const char * /*name*/
     for(int iter = 0; iter < 2; ++iter) {  // run, reset, run again
         ss.clear();
         ss << saved_msg << " iter=" << iter << ", threads=" << g_NumThreads << ", throw=" << (throwException ? "T" : "F") << ", flow=" << (flog ?
"T" : "F"); - g_Wakeup_Msg = ss.str().c_str(); + ss_str = ss.str(); + g_Wakeup_Msg = ss_str.c_str(); ResetGlobals(throwException,flog); if(throwException) { TRY(); diff --git a/test/tbb/test_environment_whitebox.cpp b/test/tbb/test_environment_whitebox.cpp index ecc46e3ac5..9092135da9 100644 --- a/test/tbb/test_environment_whitebox.cpp +++ b/test/tbb/test_environment_whitebox.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -130,7 +130,7 @@ std::vector> initialize_cases( bool wrong_result ) cases.push_back(std::make_pair("1 ", true)); cases.push_back(std::make_pair(" 1 ", true)); cases.push_back(std::make_pair(" 1", true)); - cases.push_back(std::make_pair((std::string(large_length, ' ') + '1').c_str(), true)); + cases.push_back(std::make_pair((std::string(large_length, ' ') + '1'), true)); // Invalid cases cases.push_back(std::make_pair("", wrong_result)); @@ -150,7 +150,7 @@ std::vector> initialize_cases( bool wrong_result ) cases.push_back(std::make_pair("2018", wrong_result)); cases.push_back(std::make_pair("ABC_123", wrong_result)); cases.push_back(std::make_pair("true", wrong_result)); - cases.push_back(std::make_pair(std::string(large_length, 'A').c_str(), wrong_result)); + cases.push_back(std::make_pair(std::string(large_length, 'A'), wrong_result)); prepare_random_cases(cases); return cases; @@ -162,27 +162,27 @@ std::vector> initialize_cases( long wrong_result ) // Valid cases for (long i = 0; i < 100; ++i) { ss << i; - cases.push_back(std::make_pair(ss.str().c_str(), i)); + cases.push_back(std::make_pair(ss.str(), i)); ss.str(""); ss << " " << i << " "; - cases.push_back(std::make_pair(ss.str().c_str(), i)); + cases.push_back(std::make_pair(ss.str(), i)); ss.str(""); ss << i << " "; - cases.push_back(std::make_pair(ss.str().c_str(), i)); + cases.push_back(std::make_pair(ss.str(), i)); ss.str(""); ss << " " << i; - cases.push_back(std::make_pair(ss.str().c_str(), i)); + cases.push_back(std::make_pair(ss.str(), i)); ss.str(""); } ss << LONG_MAX; - cases.push_back(std::make_pair(ss.str().c_str(), LONG_MAX)); + cases.push_back(std::make_pair(ss.str(), LONG_MAX)); ss.str(""); - cases.push_back(std::make_pair((std::string(large_length, ' ') + '1').c_str(), 1L)); + cases.push_back(std::make_pair((std::string(large_length, ' ') + '1'), 1L)); // Invalid cases cases.push_back(std::make_pair("", wrong_result)); @@ -202,11 +202,11 @@ std::vector> initialize_cases( long wrong_result ) cases.push_back(std::make_pair("false", wrong_result)); cases.push_back(std::make_pair("1A", wrong_result)); cases.push_back(std::make_pair("_123", wrong_result)); - cases.push_back(std::make_pair(std::string(large_length, 'A').c_str(), wrong_result)); + cases.push_back(std::make_pair(std::string(large_length, 'A'), wrong_result)); // Prepare string with LONG_MAX + 1 value ss << LONG_MAX / 10 << (LONG_MAX % 10 + 1); - cases.push_back(std::make_pair(ss.str().c_str(), -1)); + cases.push_back(std::make_pair(ss.str(), -1)); ss.str(""); prepare_random_cases(cases); diff --git a/test/tbb/test_flow_graph_whitebox.cpp b/test/tbb/test_flow_graph_whitebox.cpp index a3ed03b252..88365d892d 100644 --- a/test/tbb/test_flow_graph_whitebox.cpp +++ b/test/tbb/test_flow_graph_whitebox.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 
2.0 (the "License"); you may not use this file except in compliance with the License. @@ -459,7 +459,7 @@ template <> struct DecrementerHelper { template static void check(Decrementer& decrementer) { - auto& d = static_cast(decrementer); + auto& d = static_cast(decrementer); CHECK_MESSAGE(d.my_predecessor_count == 0, "error in pred count"); CHECK_MESSAGE(d.my_initial_predecessor_count == 0, "error in initial pred count"); CHECK_MESSAGE(d.my_current_count == 0, "error in current count"); diff --git a/test/tbb/test_function_node.cpp b/test/tbb/test_function_node.cpp index aa7e41ca59..999adac189 100644 --- a/test/tbb/test_function_node.cpp +++ b/test/tbb/test_function_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -469,6 +469,261 @@ void test_follows_and_precedes_api() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +// Basic idea of the following tests is to check that try_put_and_wait(message) call for function_node +// with one of the policies (lightweight, queueing and rejecting) with different concurrency limits +// processes all of the previous jobs required to process message, the message itself, but does +// not process the elements submitted later or not required to process the message +// These tests submit start_work_items using the regular try_put and then submit wait_message +// with try_put_and_wait. During the completion of the graph, new_work_items would be submitted +// once the wait_message arrives. +void test_try_put_and_wait_lightweight(std::size_t concurrency_limit) { + tbb::task_arena arena(1); + + arena.execute([&]{ + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + using function_node_type = tbb::flow::function_node; + function_node_type* start_node = nullptr; + + function_node_type function(g, concurrency_limit, + [&](int input) noexcept { + if (input == wait_message) { + for (int item : new_work_items) { + start_node->try_put(item); + } + } + return input; + }); + + start_node = &function; + + function_node_type writer(g, concurrency_limit, + [&](int input) noexcept { + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(function, writer); + + for (int i = 0; i < wait_message; ++i) { + function.try_put(i); + } + + function.try_put_and_wait(wait_message); + + std::size_t check_index = 0; + + // For lightweight function_node, start_work_items are expected to be processed first + // while putting items into the first node. + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing"); + } + + if (concurrency_limit == tbb::flow::serial) { + // If the lightweight function_node is serial, it should process the wait_message but add items from new_work_items + // into the queue since the concurrency limit is occupied. 
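+            // As an illustration (assuming wait_message == 10 as set up above):
+            // the try_put(0..9) loop runs each item inline, so processed_items is
+            // {0, ..., 9} on entry; try_put_and_wait(10) executes 10 inline too,
+            // while the new_work_items {11..20} it spawns stay queued behind the
+            // busy serial node until g.wait_for_all().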
+ CHECK_MESSAGE(processed_items.size() == start_work_items.size() + 1, "Unexpected number of elements processed"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing"); + } else { + // If the node is unlimited, it should process new_work_items immediately while processing the wait_message + // Hence they should be processed before exiting the try_put_and_wait + CHECK_MESSAGE(processed_items.size() == start_work_items.size() + new_work_items.size() + 1, + "Unexpected number of elements processed"); + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing"); + } + // wait_message would be processed only after new_work_items + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing"); + } + + g.wait_for_all(); + + if (concurrency_limit == tbb::flow::serial) { + // For the serial node, processing of new_work_items would be postponed to wait_for_all since they + // would be queued and spawned after working with wait_message + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing"); + } + } + CHECK(check_index == processed_items.size()); + }); +} + +void test_try_put_and_wait_queueing(std::size_t concurrency_limit) { + tbb::task_arena arena(1); + arena.execute([&]{ + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + using function_node_type = tbb::flow::function_node; + function_node_type* start_node = nullptr; + + function_node_type function(g, concurrency_limit, + [&](int input) noexcept { + if (input == wait_message) { + for (int item : new_work_items) { + start_node->try_put(item); + } + } + return input; + }); + + start_node = &function; + + function_node_type writer(g, concurrency_limit, + [&](int input) noexcept { + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(function, writer); + + for (int i = 0; i < wait_message; ++i) { + function.try_put(i); + } + + function.try_put_and_wait(wait_message); + + std::size_t check_index = 0; + + if (concurrency_limit == tbb::flow::serial) { + // Serial queueing function_node should add all start_work_items except the first one into the queue + // and then process them in FIFO order. + // wait_message would also be added to the queue, but would be processed later + CHECK_MESSAGE(processed_items.size() == start_work_items.size() + 1, "Unexpected number of elements processed"); + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing"); + } + } else { + CHECK_MESSAGE(processed_items.size() == 1, "Unexpected number of elements processed"); + } + + // For the unlimited function_node, all of the tasks for start_work_items and wait_message would be spawned + // and hence processed by the thread in LIFO order. 
+ // The first processed item is expected to be wait_message since it was spawned last + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing"); + + g.wait_for_all(); + + if (concurrency_limit == tbb::flow::serial) { + // For serial queueing function_node, the new_work_items are expected to be processed while calling to wait_for_all + // They would be queued and processed later in FIFO order + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing"); + } + } else { + // Unlimited function_node would always spawn tasks immediately without adding them into the queue + // They would be processed in LIFO order. Hence it is expected that new_work_items would be processed first in reverse order + // After them, start_work_items would be processed also in reverse order + for (std::size_t i = new_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items[check_index++] == new_work_items[i - 1], "Unexpected items processing"); + } + for (std::size_t i = start_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items[check_index++] == start_work_items[i - 1], "Unexpected items processing"); + } + } + CHECK(check_index == processed_items.size()); + }); +} + +void test_try_put_and_wait_rejecting(size_t concurrency_limit) { + tbb::task_arena arena(1); + + arena.execute([&]{ + tbb::flow::graph g; + + std::vector processed_items; + std::vector new_work_items; + + int wait_message = 0; + + for (int i = 1; i < wait_message; ++i) { + new_work_items.emplace_back(i); + } + + using function_node_type = tbb::flow::function_node; + function_node_type* start_node = nullptr; + + function_node_type function(g, concurrency_limit, + [&](int input) noexcept { + if (input == wait_message) { + for (int item : new_work_items) { + start_node->try_put(item); + } + } + return input; + }); + + start_node = &function; + + function_node_type writer(g, concurrency_limit, + [&](int input) noexcept { + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(function, writer); + + // If the first action is try_put_and_wait, it will occupy concurrency of the function_node + // All submits of new_work_items inside of the body should be rejected + bool result = function.try_put_and_wait(wait_message); + CHECK_MESSAGE(result, "task should not rejected since the node concurrency is not saturated"); + + CHECK_MESSAGE(processed_items.size() == 1, nullptr); + CHECK_MESSAGE(processed_items[0] == wait_message, "Unexpected items processing"); + + g.wait_for_all(); + + CHECK_MESSAGE(processed_items.size() == 1, nullptr); + + processed_items.clear(); + + // If the first action is try_put, try_put_and_wait is expected to return false since the concurrency of the + // node would be saturated + function.try_put(0); + result = function.try_put_and_wait(wait_message); + CHECK_MESSAGE(!result, "task should be rejected since the node concurrency is saturated"); + CHECK(processed_items.empty()); + + g.wait_for_all(); + + CHECK(processed_items.size() == 1); + CHECK_MESSAGE(processed_items[0] == 0, "Unexpected items processing"); + }); +} + +void test_try_put_and_wait() { + test_try_put_and_wait_lightweight(tbb::flow::serial); + test_try_put_and_wait_lightweight(tbb::flow::unlimited); + + test_try_put_and_wait_queueing(tbb::flow::serial); + test_try_put_and_wait_queueing(tbb::flow::unlimited); + + test_try_put_and_wait_rejecting(tbb::flow::serial); +} +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT //! 
Test various node bodies with concurrency
 //! \brief \ref error_guessing
@@ -544,3 +799,10 @@ TEST_CASE("constraints for function_node body") {
     static_assert(!can_call_function_node_ctor>);
 }
 #endif // __TBB_CPP20_CONCEPTS_PRESENT
+
+#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
+//! \brief \ref error_guessing
+TEST_CASE("test function_node try_put_and_wait") {
+    test_try_put_and_wait();
+}
+#endif
diff --git a/test/tbb/test_indexer_node.cpp b/test/tbb/test_indexer_node.cpp
index 4ce87e195a..c47a8cad01 100644
--- a/test/tbb/test_indexer_node.cpp
+++ b/test/tbb/test_indexer_node.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2022 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation

     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -661,6 +661,81 @@ void test_deduction_guides() {

 #endif

+#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
+void test_try_put_and_wait() {
+    tbb::task_arena arena(1);
+
+    arena.execute([] {
+        tbb::flow::graph g;
+
+        std::vector<int> start_work_items;
+        std::vector<int> processed_items1;
+        std::vector<float> processed_items2;
+        std::vector<int> new_work_items;
+        int wait_message = 10;
+
+        for (int i = 0; i < wait_message; ++i) {
+            start_work_items.emplace_back(i);
+            new_work_items.emplace_back(i + 1 + wait_message);
+        }
+
+        tbb::flow::indexer_node<int, float> indexer(g);
+        using output_type = decltype(indexer)::output_type;
+
+        tbb::flow::function_node<output_type, int> function(g, tbb::flow::serial,
+            [&](output_type tag_msg) noexcept {
+                if (tag_msg.tag() == 0) {
+                    int input = tag_msg.cast_to<int>();
+                    if (input == wait_message) {
+                        for (auto item : new_work_items) {
+                            tbb::flow::input_port<0>(indexer).try_put(item);
+                            tbb::flow::input_port<1>(indexer).try_put(float(item));
+                        }
+                    }
+                    processed_items1.emplace_back(input);
+                } else {
+                    processed_items2.emplace_back(tag_msg.cast_to<float>());
+                }
+                return 0;
+            });
+
+        tbb::flow::make_edge(indexer, function);
+
+        for (auto item : start_work_items) {
+            tbb::flow::input_port<0>(indexer).try_put(item);
+            tbb::flow::input_port<1>(indexer).try_put(float(item));
+        }
+
+        tbb::flow::input_port<0>(indexer).try_put_and_wait(wait_message);
+
+        // Since function is a serial queueing function_node, all start_work_items would be stored in a queue
+        // wait_message would be stored at the end of the queue
+        // During the try_put_and_wait call, start_work_items would be processed from the queue in FIFO order
+        // wait_message would be processed last and adds new_work_items into the same queue
+        // It is expected that new_work_items would be processed during the wait_for_all() call
+
+        std::size_t check_index1 = 0;
+        std::size_t check_index2 = 0;
+
+        for (auto item : start_work_items) {
+            CHECK_MESSAGE(processed_items1[check_index1++] == item, "Unexpected items processing");
+            CHECK_MESSAGE(processed_items2[check_index2++] == float(item), "Unexpected items processing");
+        }
+
+        // wait_message was submitted only to the first port of indexer_node
+        CHECK_MESSAGE(processed_items1[check_index1++] == wait_message, "Unexpected wait_message processing");
+
+        g.wait_for_all();
+
+        for (auto item : new_work_items) {
+            CHECK_MESSAGE(processed_items1[check_index1++] == item, "Unexpected new_work_items processing");
+            CHECK_MESSAGE(processed_items2[check_index2++] == float(item), "Unexpected new_work_items processing");
+        }
+        CHECK((check_index1 == processed_items1.size() && check_index2 == processed_items2.size()));
+    });
+}
+#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
+
+//!
Serial and parallel test on various tuple sizes //! \brief \ref error_guessing TEST_CASE("Serial and parallel test") { @@ -712,3 +787,9 @@ TEST_CASE("Deduction guides") { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! \brief \ref error_guessing +TEST_CASE("test indexer_node try_put_and_wait") { + test_try_put_and_wait(); +} +#endif diff --git a/test/tbb/test_input_node.cpp b/test/tbb/test_input_node.cpp index f27bf71482..9442693980 100644 --- a/test/tbb/test_input_node.cpp +++ b/test/tbb/test_input_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -30,8 +30,8 @@ //! \brief Test for [flow_graph.input_node] specification -using tbb::detail::d1::graph_task; -using tbb::detail::d1::SUCCESSFULLY_ENQUEUED; +using tbb::detail::d2::graph_task; +using tbb::detail::d2::SUCCESSFULLY_ENQUEUED; const int N = 1000; @@ -61,6 +61,12 @@ class test_push_receiver : public tbb::flow::receiver, utils::NoAssign { return const_cast(SUCCESSFULLY_ENQUEUED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task( const T& v, const tbb::detail::d2::message_metainfo& ) override { + return try_put_task(v); + } +#endif + tbb::flow::graph& graph_reference() const override { return my_graph; } diff --git a/test/tbb/test_join_node.cpp b/test/tbb/test_join_node.cpp index 2e3af3c547..7f1721e0ee 100644 --- a/test/tbb/test_join_node.cpp +++ b/test/tbb/test_join_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -154,3 +154,40 @@ TEST_CASE("Test removal of the predecessor while having none") { test(connect_join_via_make_edge); } + +//! \brief \ref error_guessing +TEST_CASE("Test reservation on the port") { + tbb::flow::graph g; + + tbb::flow::buffer_node buffer1(g), buffer2(g); + tbb::flow::join_node, tbb::flow::reserving> join(g); + tbb::flow::buffer_node> buffer3(g); + + auto& port0 = tbb::flow::input_port<0>(join); + auto& port1 = tbb::flow::input_port<1>(join); + + tbb::flow::make_edge(buffer1, port0); + tbb::flow::make_edge(buffer2, port1); + tbb::flow::make_edge(join, buffer3); + + int value = -42; + bool result = port0.reserve(value); + CHECK_MESSAGE(!result, "Incorrect reserve return value"); + + result = port1.reserve(value); + CHECK_MESSAGE(!result, "Incorrect reserve return value"); + + buffer1.try_put(1); + g.wait_for_all(); + + result = port0.reserve(value); + CHECK_MESSAGE(result, "Incorrect reserve return value"); + CHECK_MESSAGE(value == 1, "Incorrect reserved value"); + port0.release(); + + buffer2.try_put(2); + g.wait_for_all(); + + result = port1.reserve(value); + CHECK_MESSAGE(result, "incorrect reserve return value"); +} diff --git a/test/tbb/test_join_node.h b/test/tbb/test_join_node.h index 8969634e8a..2216310c1a 100644 --- a/test/tbb/test_join_node.h +++ b/test/tbb/test_join_node.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
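The "Test reservation on the port" case added below exercises the pull protocol of reserving join_node input ports by hand: reserve(v) succeeds only while a connected predecessor buffer holds an item, and a successful reservation has to be released (or consumed by a completed join) before the buffered item can move on. A condensed sketch of the sequence being checked, with only one port wired up for brevity:

    tbb::flow::graph g;
    tbb::flow::buffer_node<int> buf(g);
    tbb::flow::join_node<std::tuple<int, int>, tbb::flow::reserving> join(g);
    tbb::flow::make_edge(buf, tbb::flow::input_port<0>(join));

    int v = 0;
    bool ok = tbb::flow::input_port<0>(join).reserve(v); // false: nothing is buffered yet
    buf.try_put(1);
    g.wait_for_all();
    ok = tbb::flow::input_port<0>(join).reserve(v);      // true, v == 1
    if (ok) tbb::flow::input_port<0>(join).release();    // hand the item back to the buffer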
@@ -221,7 +221,7 @@ void print_my_value(MyKeySecond const &i) { template<> void print_my_value(std::string const &i) { - INFO("\"" << i.c_str() << "\"" ); + INFO("\"" << i << "\"" ); } // @@ -245,10 +245,10 @@ struct my_struct_key { } }; -using tbb::detail::d1::type_to_key_function_body; -using tbb::detail::d1::hash_buffer; +using tbb::detail::d2::type_to_key_function_body; +using tbb::detail::d2::type_to_key_function_body_leaf; +using tbb::detail::d2::hash_buffer; using tbb::detail::d1::tbb_hash_compare; -using tbb::detail::d1::type_to_key_function_body_leaf; template struct VtoKFB { typedef type_to_key_function_body type; diff --git a/test/tbb/test_join_node_preview.cpp b/test/tbb/test_join_node_preview.cpp index 4bcb1900d6..3ee4075794 100644 --- a/test/tbb/test_join_node_preview.cpp +++ b/test/tbb/test_join_node_preview.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2023 Intel Corporation + Copyright (c) 2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -82,6 +82,249 @@ void test_follows_and_precedes_api() { jn_msg_key_matching_follows_and_precedes(); } +void test_try_put_and_wait_queueing() { + tbb::task_arena arena(1); + + arena.execute([] { + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + using tuple_type = std::tuple; + tbb::flow::join_node join(g); + + tbb::flow::function_node function(g, tbb::flow::serial, + [&](tuple_type tuple) noexcept { + CHECK(std::get<0>(tuple) == std::get<1>(tuple)); + CHECK(std::get<1>(tuple) == std::get<2>(tuple)); + + auto input = std::get<0>(tuple); + + if (input == wait_message) { + for (auto item : new_work_items) { + tbb::flow::input_port<0>(join).try_put(item); + tbb::flow::input_port<1>(join).try_put(item); + tbb::flow::input_port<2>(join).try_put(item); + } + } + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(join, function); + + for (auto item : start_work_items) { + tbb::flow::input_port<0>(join).try_put(item); + tbb::flow::input_port<1>(join).try_put(item); + tbb::flow::input_port<2>(join).try_put(item); + } + + tbb::flow::input_port<0>(join).try_put(wait_message); + tbb::flow::input_port<1>(join).try_put(wait_message); + tbb::flow::input_port<2>(join).try_put_and_wait(wait_message); + + // It is expected that the join_node would push the tuple of three copies of first element in start_work_items + // And occupy the concurrency of function. 
Other tuples would be rejected and taken using push-pull protocol + // in FIFO order + std::size_t check_index = 0; + + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing"); + CHECK_MESSAGE(check_index == processed_items.size(), "Unexpected number of messages"); + + g.wait_for_all(); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + }); +} + +void test_try_put_and_wait_reserving() { + tbb::task_arena arena(1); + + arena.execute([]{ + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + using tuple_type = std::tuple; + tbb::flow::queue_node buffer1(g); + tbb::flow::queue_node buffer2(g); + tbb::flow::queue_node buffer3(g); + + tbb::flow::join_node join(g); + + tbb::flow::function_node function(g, tbb::flow::serial, + [&](tuple_type tuple) noexcept { + CHECK(std::get<0>(tuple) == std::get<1>(tuple)); + CHECK(std::get<1>(tuple) == std::get<2>(tuple)); + + auto input = std::get<0>(tuple); + + if (input == wait_message) { + for (auto item : new_work_items) { + buffer1.try_put(item); + buffer2.try_put(item); + buffer3.try_put(item); + } + } + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(buffer1, tbb::flow::input_port<0>(join)); + tbb::flow::make_edge(buffer2, tbb::flow::input_port<1>(join)); + tbb::flow::make_edge(buffer3, tbb::flow::input_port<2>(join)); + tbb::flow::make_edge(join, function); + + for (auto item : start_work_items) { + buffer1.try_put(item); + buffer2.try_put(item); + buffer3.try_put(item); + } + + buffer1.try_put(wait_message); + buffer2.try_put(wait_message); + buffer3.try_put_and_wait(wait_message); + + // It is expected that the join_node would push the tuple of three copies of first element in start_work_items + // And occupy the concurrency of function. Other tuples would be rejected and taken using push-pull protocol + // between function and join_node and between join_node and each buffer in FIFO order because queue_node is used + std::size_t check_index = 0; + + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing"); + CHECK_MESSAGE(check_index == processed_items.size(), "Unexpected number of messages"); + + g.wait_for_all(); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + }); +} + +struct int_wrapper { + int i = 0; + int_wrapper() : i(0) {} + int_wrapper(int ii) : i(ii) {} + int_wrapper& operator=(int ii) { + i = ii; + return *this; + } + + int key() const { + return i; + } + + friend bool operator==(const int_wrapper& lhs, const int_wrapper& rhs) { + return lhs.i == rhs.i; + } +}; + +template +void test_try_put_and_wait_key_matching(Body... 
body) { + // Body of one argument for testing standard key_matching + // Body of zero arguments for testing message based key_matching + static_assert(sizeof...(Body) == 0 || sizeof...(Body) == 1, "incorrect test setup"); + tbb::task_arena arena(1); + + arena.execute([=] { + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + int_wrapper wait_message = 10; + + for (int i = 0; i < wait_message.i; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message.i); + } + + using tuple_type = std::tuple; + tbb::flow::join_node> join(g, body..., body..., body...); + + tbb::flow::function_node function(g, tbb::flow::serial, + [&](tuple_type tuple) noexcept { + CHECK(std::get<0>(tuple) == std::get<1>(tuple)); + CHECK(std::get<1>(tuple) == std::get<2>(tuple)); + + auto input = std::get<0>(tuple); + + if (input == wait_message) { + for (auto item : new_work_items) { + tbb::flow::input_port<0>(join).try_put(item); + tbb::flow::input_port<1>(join).try_put(item); + tbb::flow::input_port<2>(join).try_put(item); + } + } + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(join, function); + + tbb::flow::input_port<0>(join).try_put(wait_message); + tbb::flow::input_port<1>(join).try_put(wait_message); + + // For the first port - submit items in reversed order + for (std::size_t i = start_work_items.size(); i != 0; --i) { + tbb::flow::input_port<0>(join).try_put(start_work_items[i - 1]); + } + + // For first two ports - submit items in direct order + for (auto item : start_work_items) { + tbb::flow::input_port<1>(join).try_put(item); + tbb::flow::input_port<2>(join).try_put(item); + } + + tbb::flow::input_port<2>(join).try_put_and_wait(wait_message); + + // It is expected that the join_node would push the tuple of three copies of first element in start_work_items + // And occupy the concurrency of function. Other tuples would be rejected and taken using push-pull protocol + // in order of submission + std::size_t check_index = 0; + + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing"); + CHECK_MESSAGE(check_index == processed_items.size(), "Unexpected number of messages"); + + g.wait_for_all(); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + CHECK_MESSAGE(check_index == processed_items.size(), "Unexpected number of messages"); + }); +} + //! Test follows and precedes API //! \brief \ref error_guessing TEST_CASE("Test follows and precedes API"){ @@ -101,3 +344,13 @@ TEST_CASE("Test removal of the predecessor while having none") { test(connect_join_via_follows); test(connect_join_via_precedes); } + +//! 
\brief \ref error_guessing +TEST_CASE("Test join_node try_put_and_wait") { + test_try_put_and_wait_queueing(); + test_try_put_and_wait_reserving(); + // Test standard key_matching policy + test_try_put_and_wait_key_matching([](int_wrapper w) { return w.i; }); + // Test msg based key_matching policy + test_try_put_and_wait_key_matching(); +} diff --git a/test/tbb/test_limiter_node.cpp b/test/tbb/test_limiter_node.cpp index 897f840d36..0bf4912f8a 100644 --- a/test/tbb/test_limiter_node.cpp +++ b/test/tbb/test_limiter_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -38,8 +38,8 @@ const int L = 10; const int N = 1000; -using tbb::detail::d1::SUCCESSFULLY_ENQUEUED; -using tbb::detail::d1::graph_task; +using tbb::detail::d2::SUCCESSFULLY_ENQUEUED; +using tbb::detail::d2::graph_task; template< typename T > struct serial_receiver : public tbb::flow::receiver, utils::NoAssign { @@ -53,6 +53,12 @@ struct serial_receiver : public tbb::flow::receiver, utils::NoAssign { return const_cast(SUCCESSFULLY_ENQUEUED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task * try_put_task( const T &v, const tbb::detail::d2::message_metainfo& ) override { + return try_put_task(v); + } +#endif + tbb::flow::graph& graph_reference() const override { return my_graph; } @@ -71,6 +77,12 @@ struct parallel_receiver : public tbb::flow::receiver, utils::NoAssign { return const_cast(SUCCESSFULLY_ENQUEUED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task * try_put_task( const T &v, const tbb::detail::d2::message_metainfo& ) override { + return try_put_task(v); + } +#endif + tbb::flow::graph& graph_reference() const override { return my_graph; } @@ -534,6 +546,67 @@ void test_decrement_while_try_put_task() { CHECK_MESSAGE(processed.load() == threshold, "decrementer terminate flow graph work"); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_try_put_and_wait() { + tbb::task_arena arena(1); + + arena.execute([] { + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + std::size_t threshold = start_work_items.size() + 1; + CHECK_MESSAGE(new_work_items.size() < threshold, "Incorrect test setup"); + + tbb::flow::limiter_node limiter(g, threshold); + tbb::flow::function_node function(g, tbb::flow::serial, + [&](int input) { + if (input == wait_message) { + for (auto item : new_work_items) { + limiter.try_put(item); + } + } + processed_items.emplace_back(input); + }); + + tbb::flow::make_edge(limiter, function); + tbb::flow::make_edge(function, limiter.decrementer()); + + for (auto item : start_work_items) { + limiter.try_put(item); + } + + limiter.try_put_and_wait(wait_message); + + // Since function is a serial queueing function_node, all start_work_items would be added to the queue + // and processed in FIFO order. wait_message would be added and processed last. 
Each item in start_work_items + // should put an item to a decrementer edge and hence new_work_items should not be missed as well + + std::size_t check_index = 0; + + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing"); + CHECK_MESSAGE(check_index == processed_items.size(), "Unexpected number of messages"); + + g.wait_for_all(); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected new_work_items processing"); + } + CHECK(check_index == processed_items.size()); + }); +} +#endif //! Test puts on limiter_node with decrements and varying parallelism levels //! \brief \ref error_guessing @@ -623,3 +696,10 @@ TEST_CASE("Test correct node deallocation while using small_object_pool") { tbb::task_scheduler_handle handle{ tbb::attach{} }; tbb::finalize( handle, std::nothrow ); } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! \brief \ref error_guessing +TEST_CASE("test limiter_node try_put_and_wait") { + test_try_put_and_wait(); +} +#endif diff --git a/test/tbb/test_overwrite_node.cpp b/test/tbb/test_overwrite_node.cpp index 127cca2d15..3f5ed8fec0 100644 --- a/test/tbb/test_overwrite_node.cpp +++ b/test/tbb/test_overwrite_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -24,6 +24,7 @@ #include "common/graph_utils.h" #include "common/test_follows_and_precedes_api.h" +#include "test_buffering_try_put_and_wait.h" //! \file test_overwrite_node.cpp //! 
\brief Test for [flow_graph.overwrite_node] specification @@ -183,6 +184,165 @@ void test_deduction_guides() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_overwrite_node_try_put_and_wait() { + using namespace test_try_put_and_wait; + + std::vector start_work_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + // Test push + { + std::vector processed_items; + + // Returns the index from which wait_for_all processing started + std::size_t after_start = test_buffer_push>(start_work_items, wait_message, + new_work_items, processed_items); + + // It is expected that try_put_and_wait would process start_work_items (FIFO) and the wait_message + // and new_work_items (FIFO) would be processed in wait_for_all + + CHECK_MESSAGE(after_start - 1 == start_work_items.size() + 1, + "incorrect number of items processed by try_put_and_wait"); + std::size_t check_index = 0; + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "unexpected start_work_items processing"); + } + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "unexpected wait_message processing"); + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "unexpected new_work_items processing"); + } + } + // Test pull + { + tbb::task_arena arena(1); + + arena.execute([&] { + std::vector processed_items; + + tbb::flow::graph g; + tbb::flow::overwrite_node buffer(g); + int start_message = 0; + int new_message = 1; + + using function_node_type = tbb::flow::function_node; + + function_node_type function(g, tbb::flow::serial, + [&](int input) { + if (input == wait_message) { + buffer.try_put(new_message); + } + + // Explicitly clean the buffer to prevent infinite try_get by the function_node + if (input == new_message) { + buffer.clear(); + } + + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(buffer, function); + + buffer.try_put(start_message); // Occupies concurrency of function + + buffer.try_put_and_wait(wait_message); + + CHECK_MESSAGE(processed_items.size() == 2, "only the start_message and wait_message should be processed"); + std::size_t check_index = 0; + CHECK_MESSAGE(processed_items[check_index++] == start_message, "unexpected start_message processing"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "unexpected wait_message processing"); + + g.wait_for_all(); + + CHECK_MESSAGE(processed_items[check_index++] == new_message, "unexpected new_message processing"); + CHECK(check_index == processed_items.size()); + }); + } + // Test reserve + { + tbb::task_arena arena(1); + + arena.execute([&] { + std::vector processed_items; + + tbb::flow::graph g; + tbb::flow::overwrite_node buffer(g); + tbb::flow::limiter_node limiter(g, 1); + int start_message = 0; + int new_message = 1; + + using function_node_type = tbb::flow::function_node; + + function_node_type function(g, tbb::flow::serial, + [&](int input) { + if (input == wait_message) { + buffer.try_put(new_message); + } + + // Explicitly clean the buffer to prevent infinite try_get by the function_node + if (input == new_message) { + buffer.clear(); + } + + processed_items.emplace_back(input); + limiter.decrementer().try_put(1); + return 0; + }); + + tbb::flow::make_edge(buffer, limiter); + tbb::flow::make_edge(limiter, function); + + buffer.try_put(start_message); // Occupies concurrency 
of function + + buffer.try_put_and_wait(wait_message); + + CHECK_MESSAGE(processed_items.size() == 2, "only the start_message and wait_message should be processed"); + std::size_t check_index = 0; + CHECK_MESSAGE(processed_items[check_index++] == start_message, "unexpected start_message processing"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "unexpected wait_message processing"); + + g.wait_for_all(); + + CHECK_MESSAGE(processed_items[check_index++] == new_message, "unexpected new_message processing"); + CHECK(check_index == processed_items.size()); + }); + } + // Test explicit clear + { + tbb::flow::graph g; + tbb::flow::overwrite_node buffer(g); + + std::vector processed_items; + + tbb::flow::function_node f(g, tbb::flow::serial, + [&](int input) { + processed_items.emplace_back(input); + buffer.clear(); + return 0; + }); + + tbb::flow::make_edge(buffer, f); + + buffer.try_put_and_wait(wait_message); + + CHECK_MESSAGE(processed_items.size() == 1, "Incorrect number of processed items"); + CHECK_MESSAGE(processed_items.back() == wait_message, "unexpected processing"); + + g.wait_for_all(); + + CHECK(processed_items.size() == 1); + CHECK(processed_items.back() == wait_message); + } +} +#endif + //! Test read-write properties //! \brief \ref requirement \ref error_guessing TEST_CASE("Read-write"){ @@ -256,3 +416,10 @@ TEST_CASE("Cancel register_predecessor_task") { // Wait for cancellation of spawned tasks g.wait_for_all(); } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! \brief \ref error_guessing +TEST_CASE("test overwrite_node try_put_and_wait") { + test_overwrite_node_try_put_and_wait(); +} +#endif diff --git a/test/tbb/test_partitioner.cpp b/test/tbb/test_partitioner.cpp index 9af5009dad..e0fb98fc28 100644 --- a/test/tbb/test_partitioner.cpp +++ b/test/tbb/test_partitioner.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2021-2023 Intel Corporation + Copyright (c) 2021-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include "tbb/parallel_for.h" #include "tbb/task_arena.h" +#include "tbb/task_scheduler_observer.h" #include "tbb/global_control.h" #include "oneapi/tbb/mutex.h" @@ -36,10 +37,33 @@ namespace task_affinity_retention { +class leaving_observer : public tbb::task_scheduler_observer { + std::atomic my_thread_count{}; +public: + leaving_observer(tbb::task_arena& a) : tbb::task_scheduler_observer(a) { + observe(true); + } + + void on_scheduler_entry(bool) override { + ++my_thread_count; + } + + void on_scheduler_exit(bool) override { + --my_thread_count; + } + + void wait_leave() { + while (my_thread_count.load() != 0) { + std::this_thread::yield(); + } + } +}; + template float test(PerBodyFunc&& body) { const std::size_t num_threads = 2 * utils::get_platform_max_threads(); tbb::global_control concurrency(tbb::global_control::max_allowed_parallelism, num_threads); tbb::task_arena big_arena(static_cast(num_threads)); + leaving_observer observer(big_arena); #if __TBB_USE_THREAD_SANITIZER // Reduce execution time under Thread Sanitizer @@ -77,8 +101,10 @@ template float test(PerBodyFunc&& body) { tbb::static_partitioner() ); }); - // TODO: - // - Consider introducing an observer to guarantee the threads left the arena. 
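[Note] The removed TODO above is resolved by the leaving_observer defined earlier in this hunk. For readers unfamiliar with the API, here is a minimal standalone sketch of the same pattern, assuming only public oneTBB headers; the name arena_exit_waiter and the workload are illustrative, not part of the patch:

```cpp
#include <atomic>
#include <thread>
#include "tbb/task_arena.h"
#include "tbb/task_scheduler_observer.h"
#include "tbb/parallel_for.h"

// Counts threads currently inside the arena and spins until all have left,
// so that subsequent iterations find their mailboxes already populated.
class arena_exit_waiter : public tbb::task_scheduler_observer {
    std::atomic<int> active_{0};
public:
    explicit arena_exit_waiter(tbb::task_arena& a) : tbb::task_scheduler_observer(a) {
        observe(true); // start receiving entry/exit callbacks for this arena
    }
    void on_scheduler_entry(bool) override { ++active_; }
    void on_scheduler_exit(bool) override { --active_; }
    void wait_until_empty() {
        while (active_.load(std::memory_order_relaxed) != 0)
            std::this_thread::yield();
    }
};

int main() {
    tbb::task_arena arena(4);
    arena_exit_waiter waiter(arena);
    arena.execute([] {
        tbb::parallel_for(0, 1000, [](int) { /* some work */ });
    });
    waiter.wait_until_empty(); // all workers have left the arena
}
```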
+ // To avoid tasks stealing in the beginning of the parallel algorithm, the test waits for + // the threads to leave the arena, so that on the next iteration they have tasks assigned + // in their mailboxes and, thus, don't need to search for work to do in other task pools. + observer.wait_leave(); } std::size_t range_shifts = 0; @@ -142,12 +168,15 @@ void strict_test() { } // namespace task_affinity_retention +// global_control::max_allowed_parallelism functionality is not covered by TCM +#if !__TBB_TCM_TESTING_ENABLED //! Testing affinitized tasks are not stolen //! \brief \ref error_guessing TEST_CASE("Threads respect task affinity") { task_affinity_retention::relaxed_test(); task_affinity_retention::strict_test(); } +#endif template void test_custom_range(int diff_mult) { diff --git a/test/tbb/test_priority_queue_node.cpp b/test/tbb/test_priority_queue_node.cpp index d14aa4bbb3..18a60eb935 100644 --- a/test/tbb/test_priority_queue_node.cpp +++ b/test/tbb/test_priority_queue_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -30,6 +30,7 @@ #include +#include "test_buffering_try_put_and_wait.h" //! \file test_priority_queue_node.cpp //! \brief Test for [flow_graph.priority_queue_node] specification @@ -378,6 +379,166 @@ void test_deduction_guides() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_pqueue_node_try_put_and_wait() { + using namespace test_try_put_and_wait; + + std::vector start_work_items; + std::vector new_work_items; + int wait_message = -10; + + for (int i = 0; i < 10; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + // Test push + // test_buffer_push tests the graph + // buffer1 -> function -> buffer2 -> writer + // function is a queueing serial function_node that submits new_work_items once wait_message arrives + // writer is an unlimited function_node that writes an item into the processed_items vector + // Test steps + // 1. push start_work_items into the buffer1 + // 2. buffer1.try_put_and_wait(wait_message); + // 3. g.wait_for_all() + // test_buffer_push returns the index from which the items processed during wait_for_all() starts + { + std::vector processed_items; + + std::size_t after_start = test_buffer_push>(start_work_items, wait_message, + new_work_items, processed_items); + + // Expected effect: + // During buffer1.try_put_and_wait() + // 1. start_work_items would be pushed to buffer1 + // 2. wait_message would be pushed to buffer1 + // 3. forward_task on buffer1 would transfer start_work_items into the function_node in LIFO order + // 4. wait_message would be transferred last because of lowest priority + // 5. the first item would occupy concurrency of function, other items would be pushed to the queue + // 6. function would process start_work_items and push them to the buffer2 + // 7. wait_message would be processed last and add new_work_items to buffer1 + // 8. forward_task on buffer2 would transfer start_work_items in FIFO order and the wait_message to the writer + // 9. try_put_and_wait exits since wait_message is completed + // During g.wait_for_all() + // 10. 
forward_task for new_work_items in buffer1 would be spawned and put items in function in LIFO order + // Expected items processing - { start_work_items LIFO, wait_message, new_work_items LIFO } + + std::size_t check_index = 0; + CHECK_MESSAGE(after_start == start_work_items.size() + 1, + "try_put_and_wait should process start_work_items and the wait_message"); + for (std::size_t i = start_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items[check_index++] == start_work_items[i - 1], + "try_put_and_wait should process start_work_items in LIFO order"); + } + CHECK_MESSAGE(processed_items[check_index++] == wait_message, + "try_put_and_wait should process wait_message after start_work_items"); + + for (std::size_t i = new_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items[check_index++] == new_work_items[i - 1], + "wait_for_all should process new_work_items in LIFO order"); + } + CHECK(check_index == processed_items.size()); + } // Test push + + // Test pull + // test_buffer_pull tests the graph + // buffer -> function + // function is a rejecting serial function_node that submits new_work_items once wait_message arrives + // and writes the processed item into the processed_items + // Test steps + // 1. push the occupier message to the function + // 2. push start_work_items into the buffer + // 3. buffer.try_put_and_wait(wait_message) + // 4. g.wait_for_all() + // test_buffer_pull returns the index from which the items processed during wait_for_all() starts + + { + std::vector processed_items; + int occupier = 42; + + std::size_t after_start = test_buffer_pull>(start_work_items, wait_message, occupier, + new_work_items, processed_items); + + // Expected effect + // 0. task for occupier processing would be spawned by the function + // During buffer.try_put_and_wait() + // 1. start_work_items would be pushed to the buffer + // 2. wait_message would be pushed to the buffer + // 3. forward_task would try to push items to the function, but would fail + // and set the edge to the pull state + // 4. occupier would be processed + // 5. items would be taken from the buffer by function in the priority (LIFO) order + // 6. wait_message would be taken last due to lowest priority + // 7. new_work_items would be pushed to the buffer while processing wait_message + // During wait_for_all() + // 8. 
new_work_items would be taken from the buffer in the priority (LIFO) order
+        //  Expected items processing { occupier, start_work_items LIFO, wait_message, new_work_items LIFO }
+
+        std::size_t check_index = 0;
+        CHECK_MESSAGE(after_start == start_work_items.size() + 2,
+                      "try_put_and_wait should process start_work_items, occupier and the wait_message");
+        CHECK_MESSAGE(processed_items[check_index++] == occupier, "try_put_and_wait should process the occupier");
+        for (std::size_t i = start_work_items.size(); i != 0; --i) {
+            CHECK_MESSAGE(processed_items[check_index++] == start_work_items[i - 1],
+                          "try_put_and_wait should process start_work_items in LIFO order");
+        }
+        CHECK_MESSAGE(processed_items[check_index++] == wait_message,
+                      "try_put_and_wait should process wait_message after start_work_items");
+
+        for (std::size_t i = new_work_items.size(); i != 0; --i) {
+            CHECK_MESSAGE(processed_items[check_index++] == new_work_items[i - 1],
+                          "wait_for_all should process new_work_items in LIFO order");
+        }
+        CHECK(check_index == processed_items.size());
+    }
+
+    // Test reserve
+    {
+        int thresholds[] = { 1, 2 };
+
+        for (int threshold : thresholds) {
+            std::vector<int> processed_items;
+
+            // test_buffer_reserve tests the following graph
+            // buffer -> limiter -> function
+            // function is a rejecting serial function_node that puts an item to the decrementer port
+            // of the limiter inside of the body
+
+            std::size_t after_start = test_buffer_reserve<tbb::flow::priority_queue_node<int>>(threshold,
+                start_work_items, wait_message, new_work_items, processed_items);
+
+            // Expected effect:
+            // 1. start_work_items would be pushed to the buffer
+            // 2. wait_message would be pushed to the buffer
+            // 3. forward task of the buffer would push the first message to the limiter node.
+            //    Since the limiter threshold is not reached, it would be directly passed to the function
+            // 4. function would spawn the task for the first message processing
+            // 5. the first message would be processed
+            // 6. decrementer.try_put() would be called and the limiter node would
+            //    process all of the items from the buffer using the try_reserve/try_consume/try_release semantics
+            //    in the priority (greatest first) order
+            // 7. When the wait_message is taken from the queue, try_put_and_wait exits
+
+            std::size_t check_index = 0;
+
+            CHECK_MESSAGE(after_start == start_work_items.size() + 1,
+                          "try_put_and_wait should process start_work_items and wait_message");
+            for (std::size_t index = start_work_items.size(); index != 0; --index) {
+                CHECK_MESSAGE(processed_items[check_index++] == start_work_items[index - 1],
+                              "Unexpected start_work_items processing");
+            }
+
+            CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing");
+
+            for (std::size_t index = new_work_items.size(); index != 0; --index) {
+                CHECK_MESSAGE(processed_items[check_index++] == new_work_items[index - 1],
+                              "Unexpected new_work_items processing");
+            }
+        }
+    }
+}
+#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
+
 //! Test serial, parallel behavior and reservation under parallelism
 //! \brief \ref requirement \ref error_guessing
 TEST_CASE("Serial, parallel and reservation tests"){
@@ -419,3 +580,9 @@ TEST_CASE("Test deduction guides"){
 }
 #endif
+
+#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
+//!
\brief \ref error_guessing +TEST_CASE("test priority_queue_node try_put_and_wait") { + test_pqueue_node_try_put_and_wait(); +} +#endif diff --git a/test/tbb/test_queue_node.cpp b/test/tbb/test_queue_node.cpp index e034ef6645..546b47edae 100644 --- a/test/tbb/test_queue_node.cpp +++ b/test/tbb/test_queue_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -30,6 +30,7 @@ #include +#include "test_buffering_try_put_and_wait.h" //! \file test_queue_node.cpp //! \brief Test for [flow_graph.queue_node] specification @@ -494,6 +495,162 @@ void test_deduction_guides() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_queue_node_try_put_and_wait() { + using namespace test_try_put_and_wait; + + std::vector start_work_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + // Test push + // test_buffer_push tests the graph + // buffer1 -> function -> buffer2 -> writer + // function is a queueing serial function_node that submits new_work_items once wait_message arrives + // writer is an unlimited function_node that writes an item into the processed_items vector + // Test steps + // 1. push start_work_items into the buffer1 + // 2. buffer1.try_put_and_wait(wait_message); + // 3. g.wait_for_all() + // test_buffer_push returns the index from which the items processed during wait_for_all() starts + { + std::vector processed_items; + + std::size_t after_start = test_buffer_push>(start_work_items, wait_message, + new_work_items, processed_items); + + // Expected effect: + // During buffer1.try_put_and_wait() + // 1. start_work_items would be pushed to buffer1 + // 2. wait_message would be pushed to buffer1 + // 3. forward_task on buffer1 would transfer all of the items to the function_node in FIFO order + // 4. the first item would occupy concurrency of function, other items would be pushed to the queue + // 5. function would process start_work_items and push them to the buffer2 + // 6. wait_message would be processed last and add new_work_items to buffer1 + // 7. forward_task on buffer2 would transfer start_work_items in FIFO order and the wait_message to the writer + // 8. try_put_and_wait exits since wait_message is completed + // During g.wait_for_all() + // 10. forward_task for new_work_items in buffer1 would be spawned and put items in function in FIFO order + // 11. 
function_node would process and push forward items from the queue in FIFO order + // Expected items processing - { start_work_items FIFO, wait_message, new_work_items FIFO } + + std::size_t check_index = 0; + CHECK_MESSAGE(after_start == start_work_items.size() + 1, + "try_put_and_wait should process start_work_items and the wait_message"); + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "try_put_and_wait should process start_work_items FIFO"); + } + + CHECK_MESSAGE(processed_items[check_index++] == wait_message, + "try_put_and_wait should process wait_message after start_work_items"); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "wait_for_all should process new_work_items FIFO"); + } + CHECK(check_index == processed_items.size()); + } // Test push + + // Test pull + // test_buffer_pull tests the graph + // buffer -> function + // function is a rejecting serial function_node that submits new_work_items once wait_message arrives + // and writes the processed item into the processed_items + // Test steps + // 1. push the occupier message to the function + // 2. push start_work_items into the buffer + // 3. buffer.try_put_and_wait(wait_message) + // 4. g.wait_for_all() + // test_buffer_pull returns the index from which the items processed during wait_for_all() starts + + { + std::vector processed_items; + int occupier = 42; + + std::size_t after_start = test_buffer_pull>(start_work_items, wait_message, occupier, + new_work_items, processed_items); + + // Expected effect + // 0. task for occupier processing would be spawned by the function + // During buffer.try_put_and_wait() + // 1. start_work_items would be pushed to the buffer + // 2. wait_message would be pushed to the buffer + // 3. forward_task would try to push items to the function, but would fail + // and set the edge to the pull state + // 4. occupier would be processed + // 5. items would be taken from the buffer by function in FIFO order + // 6. wait_message would be taken last and push new_work_items to the buffer + // During wait_for_all() + // 7. 
new_work_items would be taken from the buffer in FIFO order
+    //  Expected items processing { occupier, start_work_items FIFO, wait_message, new_work_items FIFO }
+
+        std::size_t check_index = 0;
+
+        CHECK_MESSAGE(after_start == start_work_items.size() + 2,
+                      "start_work_items, occupier and wait_message should be processed by try_put_and_wait");
+        CHECK_MESSAGE(processed_items[check_index++] == occupier, "Unexpected items processing by try_put_and_wait");
+        for (auto item : start_work_items) {
+            CHECK_MESSAGE(processed_items[check_index++] == item,
+                          "try_put_and_wait should process start_work_items FIFO");
+        }
+        CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing by try_put_and_wait");
+
+        for (auto item : new_work_items) {
+            CHECK_MESSAGE(processed_items[check_index++] == item,
+                          "wait_for_all should process new_work_items FIFO");
+        }
+        CHECK(check_index == processed_items.size());
+    }
+
+    // Test reserve
+    {
+        int thresholds[] = { 1, 2 };
+
+        for (int threshold : thresholds) {
+            std::vector<int> processed_items;
+
+            // test_buffer_reserve tests the following graph
+            // buffer -> limiter -> function
+            // function is a rejecting serial function_node that puts an item to the decrementer port
+            // of the limiter inside of the body
+
+            std::size_t after_start = test_buffer_reserve<tbb::flow::queue_node<int>>(threshold,
+                start_work_items, wait_message, new_work_items, processed_items);
+
+            // Expected effect:
+            // 1. start_work_items would be pushed to the buffer
+            // 2. wait_message would be pushed to the buffer
+            // 3. forward task of the buffer would push the first message to the limiter node.
+            //    Since the limiter threshold is not reached, it would be directly passed to the function
+            // 4. function would spawn the task for the first message processing
+            // 5. the first message would be processed
+            // 6. decrementer.try_put() would be called and the limiter node would
+            //    process all of the items from the buffer using the try_reserve/try_consume/try_release semantics
+            // 7. When the wait_message is taken from the queue, try_put_and_wait exits
+
+            std::size_t check_index = 0;
+
+            CHECK_MESSAGE(after_start == start_work_items.size() + 1,
+                          "try_put_and_wait should process start_work_items and wait_message");
+            for (auto item : start_work_items) {
+                CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing");
+            }
+            CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing");
+
+            for (auto item : new_work_items) {
+                CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected new_work_items processing");
+            }
+        }
+    }
+}
+#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
+
 //! Test serial, parallel behavior and reservation under parallelism
 //! \brief \ref requirement \ref error_guessing
 TEST_CASE("Parallel, serial test"){
@@ -559,3 +716,10 @@ TEST_CASE("queue_node with reservation"){
     CHECK_MESSAGE((out_arg == -1), "Getting from reserved node should not update its argument.");
     g.wait_for_all();
 }
+
+#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
+//!
\brief \ref error_guessing +TEST_CASE("test queue_node try_put_and_wait") { + test_queue_node_try_put_and_wait(); +} +#endif diff --git a/test/tbb/test_sequencer_node.cpp b/test/tbb/test_sequencer_node.cpp index 564721f682..1e6494d69b 100644 --- a/test/tbb/test_sequencer_node.cpp +++ b/test/tbb/test_sequencer_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -28,6 +28,7 @@ #include #include +#include "test_buffering_try_put_and_wait.h" //! \file test_sequencer_node.cpp //! \brief Test for [flow_graph.sequencer_node] specification @@ -437,6 +438,169 @@ void test_deduction_guides() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_seq_node_try_put_and_wait() { + using namespace test_try_put_and_wait; + + std::vector start_work_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + auto simple_sequencer = [](int item) { return item; }; + + // Test push + // test_buffer_push tests the graph + // buffer1 -> function -> buffer2 -> writer + // function is a queueing serial function_node that submits new_work_items once wait_message arrives + // writer is an unlimited function_node that writes an item into the processed_items vector + // Test steps + // 1. push start_work_items into the buffer1 + // 2. buffer1.try_put_and_wait(wait_message); + // 3. g.wait_for_all() + // test_buffer_push returns the index from which the items processed during wait_for_all() starts + { + std::vector processed_items; + + std::size_t after_start = test_buffer_push>(start_work_items, wait_message, + new_work_items, processed_items, + simple_sequencer); + + // Expected effect: + // During buffer1.try_put_and_wait() + // 1. start_work_items would be pushed to buffer1 + // 2. wait_message would be pushed to buffer1 + // 3. forward_task on buffer1 would transfer all of the items to the function_node in sequencer order (FIFO) + // 4. the first item would occupy concurrency of function, other items would be pushed to the queue + // 5. function would process start_work_items and push them to the buffer2 + // 6. wait_message would be processed last and add new_work_items to buffer1 + // 7. forward_task on buffer2 would transfer start_work_items in sequencer (FIFO) order and the wait_message to the writer + // 8. try_put_and_wait exits since wait_message is completed + // During g.wait_for_all() + // 10. forward_task for new_work_items in buffer1 would be spawned and put items in function in FIFO order + // 11. 
function_node would process and push forward items from the queue in FIFO order + // Expected items processing - { start_work_items FIFO, wait_message, new_work_items FIFO } + + std::size_t check_index = 0; + CHECK_MESSAGE(after_start == start_work_items.size() + 1, + "try_put_and_wait should process start_work_items and the wait_message"); + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "try_put_and_wait should process start_work_items FIFO"); + } + + CHECK_MESSAGE(processed_items[check_index++] == wait_message, + "try_put_and_wait should process wait_message after start_work_items"); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "wait_for_all should process new_work_items FIFO"); + } + CHECK(check_index == processed_items.size()); + } // Test push + + // Test pull + // test_buffer_pull tests the graph + // buffer -> function + // function is a rejecting serial function_node that submits new_work_items once wait_message arrives + // and writes the processed item into the processed_items + // Test steps + // 1. push the occupier message to the function + // 2. push start_work_items into the buffer + // 3. buffer.try_put_and_wait(wait_message) + // 4. g.wait_for_all() + // test_buffer_pull returns the index from which the items processed during wait_for_all() starts + + { + std::vector processed_items; + int occupier = 42; + + std::size_t after_start = test_buffer_pull>(start_work_items, wait_message, occupier, + new_work_items, processed_items, + simple_sequencer); + + // Expected effect + // 0. task for occupier processing would be spawned by the function + // During buffer.try_put_and_wait() + // 1. start_work_items would be pushed to the buffer + // 2. wait_message would be pushed to the buffer + // 3. forward_task would try to push items to the function, but would fail + // and set the edge to the pull state + // 4. occupier would be processed + // 5. items would be taken from the buffer by function in FIFO order + // 6. wait_message would be taken last and push new_work_items to the buffer + // During wait_for_all() + // 7. 
new_work_items would be taken from the buffer in FIFO order
+    //  Expected items processing { occupier, start_work_items FIFO, wait_message, new_work_items FIFO }
+
+        std::size_t check_index = 0;
+
+        CHECK_MESSAGE(after_start == start_work_items.size() + 2,
+                      "start_work_items, occupier and wait_message should be processed by try_put_and_wait");
+        CHECK_MESSAGE(processed_items[check_index++] == occupier, "Unexpected items processing by try_put_and_wait");
+        for (auto item : start_work_items) {
+            CHECK_MESSAGE(processed_items[check_index++] == item,
+                          "try_put_and_wait should process start_work_items FIFO");
+        }
+        CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing by try_put_and_wait");
+
+        for (auto item : new_work_items) {
+            CHECK_MESSAGE(processed_items[check_index++] == item,
+                          "wait_for_all should process new_work_items FIFO");
+        }
+        CHECK(check_index == processed_items.size());
+    }
+
+    // Test reserve
+    {
+        int thresholds[] = { 1, 2 };
+
+        for (int threshold : thresholds) {
+            std::vector<int> processed_items;
+
+            // test_buffer_reserve tests the following graph
+            // buffer -> limiter -> function
+            // function is a rejecting serial function_node that puts an item to the decrementer port
+            // of the limiter inside of the body
+
+            std::size_t after_start = test_buffer_reserve<tbb::flow::sequencer_node<int>>(threshold,
+                start_work_items, wait_message, new_work_items, processed_items, simple_sequencer);
+
+            // Expected effect:
+            // 1. start_work_items would be pushed to the buffer
+            // 2. wait_message would be pushed to the buffer
+            // 3. forward task of the buffer would push the first message to the limiter node.
+            //    Since the limiter threshold is not reached, it would be directly passed to the function
+            // 4. function would spawn the task for the first message processing
+            // 5. the first message would be processed
+            // 6. decrementer.try_put() would be called and the limiter node would
+            //    process all of the items from the buffer using the try_reserve/try_consume/try_release semantics
+            // 7. When the wait_message is taken from the buffer, try_put_and_wait exits
+
+            std::size_t check_index = 0;
+
+            CHECK_MESSAGE(after_start == start_work_items.size() + 1,
+                          "try_put_and_wait should process start_work_items and wait_message");
+            for (auto item : start_work_items) {
+                CHECK_MESSAGE(processed_items[check_index++] == item,
+                              "try_put_and_wait should process start_work_items FIFO");
+            }
+            CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing by try_put_and_wait");
+
+            for (auto item : new_work_items) {
+                CHECK_MESSAGE(processed_items[check_index++] == item,
+                              "wait_for_all should process new_work_items FIFO");
+            }
+            CHECK(check_index == processed_items.size());
+        }
+    }
+}
+#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
+
 //! Test sequencer with various request orders and parallelism levels
 //! \brief \ref requirement \ref error_guessing
 TEST_CASE("Serial and parallel test"){
@@ -501,3 +665,10 @@ TEST_CASE("constraints for sequencer_node sequencer") {
     static_assert(!can_call_sequencer_node_ctor>);
 }
 #endif // __TBB_CPP20_CONCEPTS_PRESENT
+
+#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
+//!
\brief \ref error_guessing +TEST_CASE("test sequencer_node try_put_and_wait") { + test_seq_node_try_put_and_wait(); +} +#endif diff --git a/test/tbb/test_split_node.cpp b/test/tbb/test_split_node.cpp index e791b546b5..1e03be0dab 100644 --- a/test/tbb/test_split_node.cpp +++ b/test/tbb/test_split_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -397,6 +397,83 @@ void test_deduction_guides() { #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_try_put_and_wait() { + tbb::task_arena arena(1); + + arena.execute([] { + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items1; + std::vector processed_items2; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + using tuple_type = std::tuple; + tbb::flow::split_node split(g); + + tbb::flow::function_node function1(g, tbb::flow::unlimited, + [&](int input) noexcept { + if (input == wait_message) { + for (int item : new_work_items) { + split.try_put(tuple_type{item, item}); + } + } + processed_items1.emplace_back(input); + return 0; + }); + + tbb::flow::function_node function2(g, tbb::flow::unlimited, + [&](int input) noexcept { + processed_items2.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(tbb::flow::output_port<0>(split), function1); + tbb::flow::make_edge(tbb::flow::output_port<1>(split), function2); + + for (int i = 0; i < wait_message; ++i) { + split.try_put(tuple_type{i, i}); + } + + split.try_put_and_wait(tuple_type{wait_message, wait_message}); + + std::size_t check_index1 = 0; + std::size_t check_index2 = 0; + + // Since split node broadcasts items to successors from last to first, start_work_items tasks and wait_message would be spawned + // in the following order {f2 - 1} - {f1 - 1} {f2 - 2} {f1 - 2} ... {f2 - 10}{f1 - 10} + // and processed in reversed order + // Hence {f1 - wait_message} task would be processed first and it would spawn tasks for new_work_items in the same order + // Since new_work_items tasks would processed first and {f2 - 10} would be still in queue + // it is expected that during the try_put_and_wait {f1 - 10} would be processed first, then new_work_items would be processed + // and only when {f2 - 10} would be taken and executed, try_put_and_wait would be exitted + // All of the other tasks for start_work_items would be processed during wait_for_all() + CHECK_MESSAGE(processed_items1[check_index1++] == wait_message, "Unexpected items processing"); + + for (std::size_t i = new_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items1[check_index1++] == new_work_items[i - 1], "Unexpected items processing"); + CHECK_MESSAGE(processed_items2[check_index2++] == new_work_items[i - 1], "Unexpected items processing"); + } + + CHECK_MESSAGE(processed_items2[check_index2++] == wait_message, "Unexpected items processing"); + + g.wait_for_all(); + + for (std::size_t i = start_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items1[check_index1++] == start_work_items[i - 1], "Unexpected items processing"); + CHECK_MESSAGE(processed_items2[check_index2++] == start_work_items[i - 1], "Unexpected items processing"); + } + }); +} +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! 
Test output ports and message passing with different input tuples //! \brief \ref requirement \ref error_guessing TEST_CASE("Tuple tests"){ @@ -446,3 +523,9 @@ TEST_CASE("Deduction guides"){ } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! \brief \ref error_guessing +TEST_CASE("test split_node try_put_and_wait") { + test_try_put_and_wait(); +} +#endif diff --git a/test/tbb/test_tagged_msg.cpp b/test/tbb/test_tagged_msg.cpp index 656f0d3e89..520ecda9c2 100644 --- a/test/tbb/test_tagged_msg.cpp +++ b/test/tbb/test_tagged_msg.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ typedef tbb::flow::tagged_msg wi(42); Wrapper wic(23); diff --git a/test/tbb/test_task.cpp b/test/tbb/test_task.cpp index 876e351006..6c2060a69a 100644 --- a/test/tbb/test_task.cpp +++ b/test/tbb/test_task.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -30,7 +30,7 @@ #include #include -#include +#include //! \file test_task.cpp //! \brief Test for [internal] functionality @@ -840,3 +840,65 @@ TEST_CASE("Check correct arena destruction with enqueue") { tbb::finalize(handle, std::nothrow_t{}); } } + +//! \brief \ref regression +TEST_CASE("Try to force Leaked proxy observers warning") { + int num_threads = std::thread::hardware_concurrency() * 2; + tbb::global_control gc(tbb::global_control::max_allowed_parallelism, num_threads); + tbb::task_arena arena(num_threads, 0); + std::deque observers; + for (int i = 0; i < 1000; ++i) { + observers.emplace_back(arena); + } + + for (auto& observer : observers) { + observer.observe(true); + } + + arena.enqueue([] { + tbb::parallel_for(0, 100000, [] (int) { + utils::doDummyWork(1000); + }); + }); +} + +//! 
\brief \ref error_guessing
+TEST_CASE("Force thread limit on per-thread reference_vertex") {
+    int num_threads = std::thread::hardware_concurrency();
+    int num_groups = 1000;
+
+    // Force thread limit on per-thread reference_vertex
+    std::vector<tbb::task_group> groups(num_groups);
+    tbb::parallel_for(0, num_threads, [&] (int) {
+        std::vector<tbb::task_group> local_groups(num_groups);
+        for (int i = 0; i < num_groups; ++i) {
+            groups[i].run([] {});
+            local_groups[i].run([] {});
+            local_groups[i].wait();
+        }
+    }, tbb::static_partitioner{});
+
+    // Enforce extra reference on each task_group
+    std::deque<tbb::task_handle> handles{};
+    for (int i = 0; i < num_groups; ++i) {
+        handles.emplace_back(groups[i].defer([] {}));
+    }
+
+    // Check correctness of the execution
+    tbb::task_group group;
+
+    std::atomic<int> final_sum{};
+    for (int i = 0; i < num_groups; ++i) {
+        group.run([&] { ++final_sum; });
+    }
+    group.wait();
+    REQUIRE_MESSAGE(final_sum == num_groups, "Some tasks were not executed");
+
+    for (int i = 0; i < num_groups; ++i) {
+        groups[i].run(std::move(handles[i]));
+    }
+
+    for (int i = 0; i < num_groups; ++i) {
+        groups[i].wait();
+    }
+}
diff --git a/test/tbb/test_task_arena.cpp b/test/tbb/test_task_arena.cpp
index fd930f1995..6bd93d4c0e 100644
--- a/test/tbb/test_task_arena.cpp
+++ b/test/tbb/test_task_arena.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -1941,6 +1941,8 @@ TEST_CASE("Stress test with mixing functionality") {
     StressTestMixFunctionality();
 }
 
+// global_control::max_allowed_parallelism functionality is not covered by TCM
+#if !__TBB_TCM_TESTING_ENABLED
 //! \brief \ref stress
 TEST_CASE("Workers oversubscription") {
     std::size_t num_threads = utils::get_platform_max_threads();
@@ -1977,6 +1979,7 @@
     );
     });
 }
+#endif
 
 #if TBB_USE_EXCEPTIONS
 //! The test for error in scheduling empty task_handle
diff --git a/test/tbb/test_task_group.cpp b/test/tbb/test_task_group.cpp
index d39b4fc703..5ad8355a15 100644
--- a/test/tbb/test_task_group.cpp
+++ b/test/tbb/test_task_group.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
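[Note] The "Force thread limit on per-thread reference_vertex" test above leans on task_group::defer(), which creates a task without submitting it; the returned task_handle keeps the task (and a reference on its group) alive until it is run or destroyed. A minimal usage sketch of that API, standalone and assuming only public oneTBB headers:

```cpp
#include <cassert>
#include <utility>
#include "tbb/task_group.h"

int main() {
    tbb::task_group tg;
    int executed = 0;

    // Created but not yet submitted: nothing runs until the handle is passed to run().
    tbb::task_handle h = tg.defer([&] { ++executed; });
    assert(executed == 0);

    tg.run(std::move(h)); // submit the deferred task to the same group
    tg.wait();            // blocks until the task finishes

    assert(executed == 1);
    return 0;
}
```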
@@ -397,7 +397,7 @@ class test_exception : public std::exception public: test_exception ( const char* descr ) : m_strDescription(descr) {} - const char* what() const throw() override { return m_strDescription; } + const char* what() const noexcept override { return m_strDescription; } }; using TestException = test_exception; @@ -780,8 +780,11 @@ TEST_CASE("Thread safety test for the task group") { TEST_CASE("Fibonacci test for the task group") { for (unsigned p=MinThread; p <= MaxThread; ++p) { tbb::global_control limit(tbb::global_control::max_allowed_parallelism, p); + tbb::task_arena a(p); g_MaxConcurrency = p; - RunFibonacciTests(); + a.execute([] { + RunFibonacciTests(); + }); } } @@ -838,7 +841,10 @@ TEST_CASE("Thread safety test for the isolated task group") { } tbb::global_control limit(tbb::global_control::max_allowed_parallelism, p); g_MaxConcurrency = p; - TestThreadSafety(); + tbb::task_arena a(p); + a.execute([] { + TestThreadSafety(); + }); } } #endif @@ -849,7 +855,10 @@ TEST_CASE("Fibonacci test for the isolated task group") { for (unsigned p=MinThread; p <= MaxThread; ++p) { tbb::global_control limit(tbb::global_control::max_allowed_parallelism, p); g_MaxConcurrency = p; - RunFibonacciTests(); + tbb::task_arena a(p); + a.execute([] { + RunFibonacciTests(); + }); } } @@ -859,7 +868,10 @@ TEST_CASE("Cancellation and exception test for the isolated task group") { for (unsigned p=MinThread; p <= MaxThread; ++p) { tbb::global_control limit(tbb::global_control::max_allowed_parallelism, p); g_MaxConcurrency = p; - RunCancellationAndExceptionHandlingTests(); + tbb::task_arena a(p); + a.execute([] { + RunCancellationAndExceptionHandlingTests(); + }); } } @@ -869,7 +881,10 @@ TEST_CASE("Constant functor test for the isolated task group") { for (unsigned p=MinThread; p <= MaxThread; ++p) { tbb::global_control limit(tbb::global_control::max_allowed_parallelism, p); g_MaxConcurrency = p; - TestConstantFunctorRequirement(); + tbb::task_arena a(p); + a.execute([] { + TestConstantFunctorRequirement(); + }); } } @@ -879,7 +894,10 @@ TEST_CASE("Move semantics test for the isolated task group") { for (unsigned p=MinThread; p <= MaxThread; ++p) { tbb::global_control limit(tbb::global_control::max_allowed_parallelism, p); g_MaxConcurrency = p; - TestMoveSemantics(); + tbb::task_arena a(p); + a.execute([] { + TestMoveSemantics(); + }); } } @@ -1204,4 +1222,3 @@ TEST_CASE("task_handle cannot be scheduled into other task_group of the same con } #endif // TBB_USE_EXCEPTIONS - diff --git a/test/tbb/test_write_once_node.cpp b/test/tbb/test_write_once_node.cpp index 2bb16383f8..6fb716bab0 100644 --- a/test/tbb/test_write_once_node.cpp +++ b/test/tbb/test_write_once_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
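[Note] The test_task_group.cpp hunks above all apply one pattern: pair the global_control concurrency cap with a task_arena of the same width and run the test body via arena.execute, so the body really observes concurrency p rather than whatever arena the calling thread happens to be in. A condensed sketch of that pattern, illustrative rather than part of the patch:

```cpp
#include "tbb/global_control.h"
#include "tbb/task_arena.h"
#include "tbb/parallel_for.h"

// Run a workload with an effective concurrency of exactly p threads.
void run_with_concurrency(int p) {
    tbb::global_control limit(tbb::global_control::max_allowed_parallelism, p);
    tbb::task_arena arena(p);
    arena.execute([] {
        tbb::parallel_for(0, 1000, [](int) { /* test body */ });
    });
}

int main() {
    for (int p = 1; p <= 4; ++p)
        run_with_concurrency(p);
}
```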
@@ -207,6 +207,135 @@ void test_deduction_guides() {
 }
 #endif
 
+#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
+void test_try_put_and_wait() {
+    int wait_message = 0;
+    int occupy_concurrency_message = 1;
+    int new_message = 2;
+
+    // Test push
+    {
+        tbb::task_arena arena(1);
+
+        std::vector<int> processed_items;
+
+        arena.execute([&] {
+            tbb::flow::graph g;
+
+            tbb::flow::write_once_node<int> wo_buffer(g);
+            tbb::flow::function_node<int, int> function(g, tbb::flow::serial,
+                [&](int input) {
+                    if (input == wait_message) {
+                        wo_buffer.clear();
+                        wo_buffer.try_put(new_message);
+                    }
+                    processed_items.emplace_back(input);
+                    return 0;
+                });
+
+            tbb::flow::make_edge(wo_buffer, function);
+
+            wo_buffer.try_put_and_wait(wait_message);
+
+            std::size_t check_index = 0;
+            CHECK_MESSAGE(processed_items.size() == 1, "Only the wait_message should be processed");
+            CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Only the wait_message should be processed");
+
+            g.wait_for_all();
+
+            CHECK_MESSAGE(processed_items[check_index++] == new_message,
+                          "only the new_message should be processed in wait_for_all");
+            CHECK(check_index == processed_items.size());
+        });
+    }
+    // Test pull
+    {
+        std::vector<int> processed_items;
+        tbb::task_arena arena(1);
+
+        arena.execute([&] {
+            tbb::flow::graph g;
+
+            tbb::flow::write_once_node<int> wo_buffer(g);
+            tbb::flow::function_node<int, int> function(g, tbb::flow::serial,
+                [&](int input) {
+                    if (input == new_message || input == wait_message) {
+                        wo_buffer.clear();
+                    }
+
+                    if (input == wait_message) {
+                        wo_buffer.try_put(new_message);
+                    }
+                    processed_items.emplace_back(input);
+                    return 0;
+                });
+
+            tbb::flow::make_edge(wo_buffer, function);
+
+            function.try_put(occupy_concurrency_message);
+            wo_buffer.try_put_and_wait(wait_message);
+
+            std::size_t check_index = 0;
+            CHECK_MESSAGE(processed_items.size() == 2, "unexpected message processing for try_put_and_wait");
+            CHECK_MESSAGE(processed_items[check_index++] == occupy_concurrency_message,
+                          "occupy_concurrency_message should be processed first");
+            CHECK_MESSAGE(processed_items[check_index++] == wait_message,
+                          "wait_message was not processed");
+
+            g.wait_for_all();
+
+            CHECK_MESSAGE(processed_items[check_index++] == new_message,
+                          "only the new_message should be processed in wait_for_all");
+            CHECK(check_index == processed_items.size());
+        });
+    }
+    // Test reserve
+    {
+        std::vector<int> processed_items;
+        tbb::task_arena arena(1);
+
+        arena.execute([&] {
+            tbb::flow::graph g;
+
+            tbb::flow::write_once_node<int> wo_buffer(g);
+            tbb::flow::limiter_node<int, int> limiter(g, 1);
+            tbb::flow::function_node<int, int> function(g, tbb::flow::serial,
+                [&](int input) {
+                    if (input == new_message || input == wait_message) {
+                        wo_buffer.clear();
+                    }
+
+                    if (input == wait_message) {
+                        wo_buffer.try_put(new_message);
+                    }
+                    processed_items.emplace_back(input);
+                    limiter.decrementer().try_put(1);
+                    return 0;
+                });
+
+            tbb::flow::make_edge(wo_buffer, limiter);
+            tbb::flow::make_edge(limiter, function);
+
+            limiter.try_put(occupy_concurrency_message);
+            wo_buffer.try_put_and_wait(wait_message);
+
+            std::size_t check_index = 0;
+            CHECK_MESSAGE(processed_items.size() == 2, "unexpected message processing for try_put_and_wait");
+            CHECK_MESSAGE(processed_items[check_index++] == occupy_concurrency_message,
+                          "occupy_concurrency_message should be processed first");
+            CHECK_MESSAGE(processed_items[check_index++] == wait_message,
+                          "wait_message was not processed");
+
+            g.wait_for_all();
+
+            CHECK_MESSAGE(processed_items[check_index++] == new_message,
+                          "only the new_message should be processed in wait_for_all");
+            CHECK(check_index == processed_items.size());
+        });
+    }
+}
+#endif
+
 //! Test read-write properties
 //! \brief \ref requirement \ref error_guessing
 TEST_CASE("Read-write tests"){
@@ -244,3 +373,10 @@ TEST_CASE("Deduction guides"){
     test_deduction_guides();
 }
 #endif
+
+#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
+//! \brief \ref error_guessing
+TEST_CASE("test write_once_node try_put_and_wait") {
+    test_try_put_and_wait();
+}
+#endif