diff --git a/.gitignore b/.gitignore
index dd373576f..4dbac8c29 100644
--- a/.gitignore
+++ b/.gitignore
@@ -183,3 +183,4 @@ wandb
 .vscode
 **/api_keys.json
 weights.csv
+past_websites.csv
diff --git a/neurons/miners/epistula_miner/miner.py b/neurons/miners/epistula_miner/miner.py
index e8f32fa7a..573dc30ea 100644
--- a/neurons/miners/epistula_miner/miner.py
+++ b/neurons/miners/epistula_miner/miner.py
@@ -45,7 +45,11 @@ def __init__(self):
             },
         )
         if SHOULD_SERVE_LLM:
-            self.llm = ReproducibleHF(model_id=LOCAL_MODEL_ID)
+            self.llm = ReproducibleHF(
+                model_id=LOCAL_MODEL_ID,
+                device=shared_settings.NEURON_DEVICE,
+                sampling_params=shared_settings.SAMPLING_PARAMS,
+            )
         else:
             self.llm = None
 
diff --git a/neurons/validator.py b/neurons/validator.py
index 174a17dbe..771ad8c24 100644
--- a/neurons/validator.py
+++ b/neurons/validator.py
@@ -72,7 +72,17 @@ async def spawn_loops(task_queue, scoring_queue, reward_events):
             logger.debug(f"Number of tasks in Scoring Queue: {len(scoring_queue)}")
             logger.debug(f"Number of tasks in Reward Events: {len(reward_events)}")
 
-    asyncio.run(spawn_loops(task_queue, scoring_queue, reward_events))
+    try:
+        asyncio.run(spawn_loops(task_queue, scoring_queue, reward_events))
+    except Exception as e:
+        logger.info(f"Terminating loop process: {e}")
+    finally:
+        logger.info("Cleaning up resources...")
+
+        # Ensure wandb is closed properly
+        if settings.shared_settings.WANDB_ON:
+            wandb.finish()
+            logger.info("WandB run finished.")
 
 
 def start_api(scoring_queue, reward_events):
@@ -150,19 +160,21 @@ async def main():
                         f"Metagraph hasn't been updated for {current_block - last_update_block} blocks. "
                         f"Staled block: {current_block}, Last update: {last_update_block}"
                     )
-                    sys.exit(1)
+                    break  # Exit the loop
                 step += 1
 
+        except KeyboardInterrupt:
+            logger.info("KeyboardInterrupt detected. Shutting down gracefully...")
         except Exception as e:
             logger.error(f"Main loop error: {e}")
             raise
         finally:
-            wandb.teardown()
             # Clean up processes
             for process in processes:
                 if process.is_alive():
                     process.terminate()
                     process.join()
+            sys.exit(1)
 
 
 # The main function parses the configuration and runs the validator.
diff --git a/poetry.lock b/poetry.lock
index 1f1adec01..119a60059 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -348,58 +348,37 @@ tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"]
 
 [[package]]
 name = "autoawq"
-version = "0.2.0"
+version = "0.2.8"
 description = "AutoAWQ implements the AWQ algorithm for 4-bit quantization with a 2x speedup during inference."
 optional = true
 python-versions = ">=3.8.0"
 groups = ["main"]
 markers = "extra == \"validator\""
-files = [
-    {file = "autoawq-0.2.0-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:4c9c4db6fbf23cd625a9cb5b5495777555659dc12aa7e0aba733f20c51f10005"},
-    {file = "autoawq-0.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:9cfefc8e8c4d92b9b78f2f1bff61d6bb413138d2ab221029587251344d65007c"},
-    {file = "autoawq-0.2.0-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:ee68699fec949c4440374b402558400efe83c359e7f85a5a7979608c5eec0da3"},
-    {file = "autoawq-0.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:4d6080539bb386a5754cc76b5081b112a93df1ee38f4c2f82e2773e9f098470b"},
-    {file = "autoawq-0.2.0-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:74d2c49780aaa7c7ba0fa4e1f196ac2dc4bdceba27e780115e7dfb32f1ba3c0a"},
-    {file = "autoawq-0.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:43651382592e348c8f44bdc6796b9fa6fc5bd398f58908410376f0b7aaa2b3b3"},
-    {file = "autoawq-0.2.0-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:a40c12fc4ddeabec6f04a2179e720e79563bfe29646ddf9c130bce0bcb51a760"},
-    {file = "autoawq-0.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:3c5dd45bcf23d8a0de2d79a04baf65fb2208249babeb729274c97df6218d48ae"},
-]
+files = []
+develop = false
 
 [package.dependencies]
 accelerate = "*"
-autoawq-kernels = "*"
-datasets = "*"
+datasets = ">=2.20"
+huggingface_hub = ">=0.26.5"
 tokenizers = ">=0.12.1"
-torch = ">=2.0.1"
-transformers = ">=4.35.0"
-typing-extensions = ">=4.8.0"
+torch = "*"
+transformers = ">=4.45.0"
+triton = "*"
+typing_extensions = ">=4.8.0"
 zstandard = "*"
 
 [package.extras]
+cpu = ["intel-extension-for-pytorch (>=2.4.0)"]
 dev = ["black", "griffe-typingdoc", "mkdocs-material", "mkdocstrings-python"]
-eval = ["evaluate", "lm-eval (>=0.4.0)", "protobuf", "scipy", "tabulate"]
-
-[[package]]
-name = "autoawq-kernels"
-version = "0.0.9"
-description = "AutoAWQ Kernels implements the AWQ kernels."
-optional = true
-python-versions = ">=3.8.0"
-groups = ["main"]
-markers = "extra == \"validator\""
-files = [
-    {file = "autoawq_kernels-0.0.9-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:ed8f4d744df21beae445efb1de54061bffc5fccbfefc8ae65c1dc10d08f90052"},
-    {file = "autoawq_kernels-0.0.9-cp310-cp310-win_amd64.whl", hash = "sha256:cd7d3db501068b3a12094a07921d985a57e640725cdda1252d4b135ed6aeaa65"},
-    {file = "autoawq_kernels-0.0.9-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:fe800a6538691afaa77abe7c8b2b0a121351843f048d54e11d617d604dcba48f"},
-    {file = "autoawq_kernels-0.0.9-cp311-cp311-win_amd64.whl", hash = "sha256:8c7f2404b3aa448ff77872dd6ba2963ce8b685d8aa73ef65fd1b8bc85d92b17d"},
-    {file = "autoawq_kernels-0.0.9-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:4c41a71af1d5a75e52c9833b9c48237b04d3b0eee26d712fc1b074af9135afc8"},
-    {file = "autoawq_kernels-0.0.9-cp312-cp312-win_amd64.whl", hash = "sha256:f259e7c60b11fa0689bb337dd4456319787256cbd2a8e4a491f01b51bb6c43d1"},
-    {file = "autoawq_kernels-0.0.9-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:b6baf039c22deb02f2ae194fdd77551b3c85c8f8a77b749f7caa17dacf986adb"},
-    {file = "autoawq_kernels-0.0.9-cp39-cp39-win_amd64.whl", hash = "sha256:6ad12dd68b0932182678f2f9fbee87452707b81f0e8dad242d23af018358f030"},
-]
+eval = ["evaluate", "lm_eval (==0.4.1)", "protobuf", "scipy", "tabulate"]
+kernels = ["autoawq-kernels", "flash-attn (>=2.2.0)"]
 
-[package.dependencies]
-torch = ">=2.5.1"
+[package.source]
+type = "git"
+url = "https://github.com/jiqing-feng/AutoAWQ.git"
+reference = "ae782a99df2f72a2c28764452844cb2d65bd8ffc"
+resolved_reference = "ae782a99df2f72a2c28764452844cb2d65bd8ffc"
 
 [[package]]
 name = "babel"
@@ -6085,15 +6064,15 @@ test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,
 
 [[package]]
 name = "transformers"
-version = "4.48.1"
+version = "4.47.1"
 description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
 optional = true
 python-versions = ">=3.9.0"
 groups = ["main"]
 markers = "extra == \"validator\""
 files = [
-    {file = "transformers-4.48.1-py3-none-any.whl", hash = "sha256:24be0564b0a36d9e433d9a65de248f1545b6f6edce1737669605eb6a8141bbbb"},
-    {file = "transformers-4.48.1.tar.gz", hash = "sha256:7c1931facc3ee8adcbf86fc7a87461d54c1e40eca3bb57fef1ee9f3ecd32187e"},
+    {file = "transformers-4.47.1-py3-none-any.whl", hash = "sha256:d2f5d19bb6283cd66c893ec7e6d931d6370bbf1cc93633326ff1f41a40046c9c"},
+    {file = "transformers-4.47.1.tar.gz", hash = "sha256:6c29c05a5f595e278481166539202bf8641281536df1c42357ee58a45d0a564a"},
 ]
 
 [package.dependencies]
@@ -6110,16 +6089,16 @@ tqdm = ">=4.27"
 
 [package.extras]
 accelerate = ["accelerate (>=0.26.0)"]
-agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=2.0)"]
-all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision"]
+agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch"]
+all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch", "torchaudio", "torchvision"]
 audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
 benchmark = ["optimum-benchmark (>=0.3.0)"]
-codecarbon = ["codecarbon (>=2.8.1)"]
+codecarbon = ["codecarbon (==1.2.0)"]
 deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"]
-deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
-dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
-dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.21,<0.22)", "urllib3 (<2.0.0)"]
-dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "libcst", "librosa", "nltk (<=3.8.1)", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
+deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
+dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
+dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.21,<0.22)", "urllib3 (<2.0.0)"]
+dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "libcst", "librosa", "nltk (<=3.8.1)", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
 flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"]
 flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
 ftfy = ["ftfy"]
@@ -6140,17 +6119,17 @@ serving = ["fastapi", "pydantic", "starlette", "uvicorn"]
 sigopt = ["sigopt"]
 sklearn = ["scikit-learn"]
 speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
-testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "parameterized", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
+testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "parameterized", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
 tf = ["keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"]
 tf-cpu = ["keras (>2.9,<2.16)", "keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow-cpu (>2.9,<2.16)", "tensorflow-probability (<0.24)", "tensorflow-text (<2.16)", "tf2onnx"]
 tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
 tiktoken = ["blobfile", "tiktoken"]
 timm = ["timm (<=1.0.11)"]
 tokenizers = ["tokenizers (>=0.21,<0.22)"]
-torch = ["accelerate (>=0.26.0)", "torch (>=2.0)"]
+torch = ["accelerate (>=0.26.0)", "torch"]
 torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
 torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"]
-torchhub = ["filelock", "huggingface-hub (>=0.24.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "tqdm (>=4.27)"]
+torchhub = ["filelock", "huggingface-hub (>=0.24.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch", "tqdm (>=4.27)"]
 video = ["av (==9.2.0)"]
 vision = ["Pillow (>=10.0.1,<=15.0)"]
 
@@ -6161,7 +6140,7 @@ description = "A language and compiler for custom Deep Learning operations"
 optional = true
 python-versions = "*"
 groups = ["main"]
-markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"validator\""
+markers = "extra == \"validator\""
 files = [
     {file = "triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8"},
     {file = "triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f34f6e7885d1bf0eaaf7ba875a5f0ce6f3c13ba98f9503651c1e6dc6757ed5c"},
@@ -6944,4 +6923,4 @@ validator = ["accelerate", "angle-emb", "autoawq", "bs4", "datasets", "duckduckg
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.10 <3.11"
-content-hash = "57b3250544d8503dd5e3ad7b3842b18508f6c9357b3f1988f593ba5b64bcccca"
+content-hash = "5010601e1a0caa162dfd6fb96f143f5e49de524fcbaf1ca15c4073190bad14fa"
diff --git a/past_websites.csv b/prompting/Past Websites.csv
similarity index 100%
rename from past_websites.csv
rename to prompting/Past Websites.csv
diff --git a/prompting/llms/hf_llm.py b/prompting/llms/hf_llm.py
index 3cac61e41..a24b9fb8a 100644
--- a/prompting/llms/hf_llm.py
+++ b/prompting/llms/hf_llm.py
@@ -4,38 +4,38 @@
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedModel, pipeline
 
-from shared.settings import shared_settings
-
 
 class ReproducibleHF:
-    def __init__(self, model_id="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4", **kwargs):
-        """
-        Initialize Hugging Face model with reproducible settings and optimizations
-        """
-        # Create a random seed for reproducibility
-        # self.seed = random.randint(0, 1_000_000)
-        # self.set_random_seeds(self.seed)
+    def __init__(
+        self,
+        model_id: str = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4",
+        device: str = "cuda:0",
+        sampling_params: dict[str, str | float | int | bool] | None = None,
+    ):
+        """Deterministic HuggingFace model."""
+        self._device = device
+        self.sampling_params = {} if sampling_params is None else sampling_params
         self.model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(
             model_id,
             torch_dtype=torch.float16,
             low_cpu_mem_usage=True,
-            device_map="cuda:0",
+            device_map=self._device,
         )
 
         self.tokenizer = AutoTokenizer.from_pretrained(model_id)
         self.valid_generation_params = set(
             AutoModelForCausalLM.from_pretrained(model_id).generation_config.to_dict().keys()
         )
-
         self.llm = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer)
 
-        self.sampling_params = shared_settings.SAMPLING_PARAMS
-
     @torch.inference_mode()
-    def generate(self, messages: list[str] | list[dict], sampling_params=None, seed=None):
-        """
-        Generate text with optimized performance
-        """
+    def generate(
+        self,
+        messages: list[str] | list[dict[str, str]],
+        sampling_params: dict[str, str | float | int | bool] | None = None,
+        seed: int | None = None,
+    ) -> str:
+        """Generate text with optimized performance."""
         self.set_random_seeds(seed)
 
         inputs = self.tokenizer.apply_chat_template(
@@ -44,14 +44,13 @@ def generate(self, messages: list[str] | list[dict], sampling_params=None, seed=
             add_generation_prompt=True,
             return_tensors="pt",
             return_dict=True,
-        ).to(shared_settings.NEURON_DEVICE)
+        ).to(self._device)
 
         params = sampling_params if sampling_params else self.sampling_params
         filtered_params = {k: v for k, v in params.items() if k in self.valid_generation_params}
 
-        # Generate with optimized settings
         outputs = self.model.generate(
-            **inputs.to(shared_settings.NEURON_DEVICE),
+            **inputs,
             **filtered_params,
             eos_token_id=self.tokenizer.eos_token_id,
         )
@@ -61,21 +60,10 @@ def generate(self, messages: list[str] | list[dict], sampling_params=None, seed=
             skip_special_tokens=True,
         )[0]
 
-        # logger.debug(
-        #     f"""{self.__class__.__name__} queried:
-        #     prompt: {messages}\n
-        #     responses: {results}\n
-        #     sampling params: {params}\n
-        #     seed: {seed}
-        #     """
-        # )
-
         return results if len(results) > 1 else results[0]
 
-    def set_random_seeds(self, seed=42):
-        """
-        Set random seeds for reproducibility across all relevant libraries
-        """
+    def set_random_seeds(self, seed: int | None = 42):
+        """Set random seeds for reproducibility across all relevant libraries."""
         if seed is not None:
             random.seed(seed)
             np.random.seed(seed)
diff --git a/prompting/llms/model_manager.py b/prompting/llms/model_manager.py
index 35171d453..0e2e10b00 100644
--- a/prompting/llms/model_manager.py
+++ b/prompting/llms/model_manager.py
@@ -65,9 +65,9 @@ def load_model(self, model_config: ModelConfig, force: bool = True):
             GPUInfo.log_gpu_info()
 
             model = ReproducibleHF(
-                model=model_config.llm_model_id,
-                gpu_memory_utilization=model_config.min_ram / GPUInfo.free_memory,
-                max_model_len=settings.shared_settings.LLM_MAX_MODEL_LEN,
+                model_id=model_config.llm_model_id,
+                device=settings.shared_settings.NEURON_DEVICE,
+                sampling_params=settings.shared_settings.SAMPLING_PARAMS,
             )
 
             self.active_models[model_config] = model
diff --git a/prompting/rewards/date.py b/prompting/rewards/date.py
index 4cf51d5a5..b82f78e4f 100644
--- a/prompting/rewards/date.py
+++ b/prompting/rewards/date.py
@@ -89,7 +89,7 @@ def date_score(self, reference: str, completion: str) -> float:
             score = 0
         return score
 
-    def reward(self, reference: str, response_event: DendriteResponseEvent, **kwargs) -> BatchRewardOutput:
+    async def reward(self, reference: str, response_event: DendriteResponseEvent, **kwargs) -> BatchRewardOutput:
         """Compute difference scores given a completion and reference pair.
 
         Args:
diff --git a/prompting/rewards/exact_match.py b/prompting/rewards/exact_match.py
index 3b192ea92..d9570e506 100644
--- a/prompting/rewards/exact_match.py
+++ b/prompting/rewards/exact_match.py
@@ -28,7 +28,7 @@ def normalize_timing(timing: float, timings: float) -> float:
 
 
 class ExactMatchRewardModel(BaseRewardModel):
-    def reward(self, reference: str, response_event: DendriteResponseEvent, **kwargs) -> BatchRewardOutput:
+    async def reward(self, reference: str, response_event: DendriteResponseEvent, **kwargs) -> BatchRewardOutput:
         """
         Calculates rewards based on an exact match of the response with the reference string.
 
diff --git a/prompting/rewards/float_diff.py b/prompting/rewards/float_diff.py
index 3cdbe5930..2952aa750 100644
--- a/prompting/rewards/float_diff.py
+++ b/prompting/rewards/float_diff.py
@@ -55,7 +55,7 @@ def math_score(reference: str, completion: str) -> float:
         except Exception:
             return 0.0
 
-    def reward(self, reference: str, response_event: DendriteResponseEvent, **kwargs) -> BatchRewardOutput:
+    async def reward(self, reference: str, response_event: DendriteResponseEvent, **kwargs) -> BatchRewardOutput:
         """Compute difference scores given a completion and reference pair."""
         rewards = []
         timings = []
diff --git a/prompting/rewards/inference_reward_model.py b/prompting/rewards/inference_reward_model.py
index d6ccbfb99..fa4df4804 100644
--- a/prompting/rewards/inference_reward_model.py
+++ b/prompting/rewards/inference_reward_model.py
@@ -5,7 +5,7 @@
 
 
 class InferenceRewardModel(BaseRewardModel):
-    def reward(
+    async def reward(
         self,
         reference: str,
         response_event: DendriteResponseEvent,
@@ -14,5 +14,5 @@ def reward(
     ) -> BatchRewardOutput:
         """Gives an exact reward of 1 if the response matches the reference, 0 otherwise"""
         if model_id:
-            return ExactMatchRewardModel().reward(reference, response_event)
-        return RelevanceRewardModel().reward(reference, response_event)
+            return await ExactMatchRewardModel().reward(reference, response_event)
+        return await RelevanceRewardModel().reward(reference, response_event)
diff --git a/prompting/rewards/multi_choice.py b/prompting/rewards/multi_choice.py
index d4ee2ee39..e2cdc1c2e 100644
--- a/prompting/rewards/multi_choice.py
+++ b/prompting/rewards/multi_choice.py
@@ -29,8 +29,8 @@ def safe_load_json(json_string: str) -> dict[str, float]:
         cleaned_json_string = re.sub(r'"\s*\n\s*"', r'""', cleaned_json_string)
         try:
             return {k.upper(): v for k, v in json.loads(cleaned_json_string).items()}
-        except json.JSONDecodeError as e:
-            raise ValueError(f"Invalid JSON data: {e}")
+        except Exception:
+            return None
 
     def process_predictions(self, predictions: dict[str, float]) -> dict[str, float]:
         if not all(isinstance(value, (int, float)) for value in predictions.values()):
@@ -56,12 +56,14 @@ def letter_reward(self, reference: str, completion: str) -> float:
     def logit_reward(self, reference: str, completion: str) -> float:
         try:
             loaded_json = self.safe_load_json(completion)
+            if not loaded_json:
+                return None
             valid_choices = self.process_predictions(loaded_json)
             return valid_choices.get(reference.upper(), 0.0)
         except ValueError:
             return None
 
-    def reward(self, reference: str, response_event: DendriteResponseEvent, **kwargs) -> BatchRewardOutput:
+    async def reward(self, reference: str, response_event: DendriteResponseEvent, **kwargs) -> BatchRewardOutput:
         rewards = []
         timings = []
 
diff --git a/prompting/rewards/penalty.py b/prompting/rewards/penalty.py
index 51c21234f..3c58e7d71 100644
--- a/prompting/rewards/penalty.py
+++ b/prompting/rewards/penalty.py
@@ -13,7 +13,7 @@ class PenaltyModel(BaseRewardModel):
     def name(self) -> str:
         return "penalty"
 
-    def reward(self, reference: str, response_event: DendriteResponseEvent, **kwargs) -> BatchRewardOutput:
+    async def reward(self, reference: str, response_event: DendriteResponseEvent, **kwargs) -> BatchRewardOutput:
         """Penalises miner if they do not respond."""
         rewards = []
         timings = []
diff --git a/prompting/rewards/relevance.py b/prompting/rewards/relevance.py
index 9288a007f..92db3b486 100644
--- a/prompting/rewards/relevance.py
+++ b/prompting/rewards/relevance.py
@@ -28,7 +28,7 @@ def init_model(self) -> "RelevanceRewardModel":
         self.embedding_model = MODEL
         return self
 
-    def reward(self, reference: str, response_event: DendriteResponseEvent, **kwargs) -> BatchRewardOutput:
+    async def reward(self, reference: str, response_event: DendriteResponseEvent, **kwargs) -> BatchRewardOutput:
         """Calculate the cosine similarity between sentence embeddings of the reference and completions.
 
         We subtract a baseline score which is what an empty string would get (a failed completion).
diff --git a/prompting/rewards/reward.py b/prompting/rewards/reward.py
index cd886db5f..4e4102e68 100644
--- a/prompting/rewards/reward.py
+++ b/prompting/rewards/reward.py
@@ -69,10 +69,10 @@ class BaseRewardModel(ABC, BaseModel):
     weight: float = 1.0
 
     @abstractmethod
-    def reward(self, reference: str, response_event: DendriteResponseEvent, **kwargs) -> BatchRewardOutput:
+    async def reward(self, reference: str, response_event: DendriteResponseEvent, **kwargs) -> BatchRewardOutput:
         raise NotImplementedError("You must implement the reward method")
 
-    def apply(
+    async def apply(
         self,
         response_event: DendriteResponseEvent,
         reference: str | None = None,
@@ -83,7 +83,7 @@ def apply(
     ) -> WeightedRewardEvent:
         t0 = time.time()
         comparator = reference if reward_type == "reward" else challenge
-        batch_rewards_output: BatchRewardOutput = self.reward(comparator, response_event, task=task, **kwargs)
+        batch_rewards_output: BatchRewardOutput = await self.reward(comparator, response_event, task=task, **kwargs)
         batch_rewards_time = time.time() - t0
 
         return WeightedRewardEvent(
@@ -136,7 +136,7 @@ def final_rewards(cls, reward_events: list[WeightedRewardEvent]) -> list[float]:
         return cls.sum_rewards(reward_events)
 
     @classmethod
-    def apply(
+    async def apply(
         cls,
         response_event: DendriteResponseEvent,
         reference: str,
@@ -147,7 +147,7 @@ def apply(
         reward_events = []
         for weighted_reward in cls.reward_definitions:
             reward_events.append(
-                weighted_reward.apply(
+                await weighted_reward.apply(
                     reference=reference,
                     response_event=response_event,
                     challenge=challenge,
diff --git a/prompting/rewards/rouge.py b/prompting/rewards/rouge.py
index 07ab2315b..7430d1109 100644
--- a/prompting/rewards/rouge.py
+++ b/prompting/rewards/rouge.py
@@ -22,7 +22,7 @@ def rouge_score(self, reference, completion):
             return 0.0
         return self.rouge.get_scores(reference, completion, avg=self.avg)[0][self.ngram][self.metric]
 
-    def reward(self, reference: str, response_event: DendriteResponseEvent, **kwargs) -> BatchRewardOutput:
+    async def reward(self, reference: str, response_event: DendriteResponseEvent, **kwargs) -> BatchRewardOutput:
         """Compute ROUGE scores given a completion and reference pair."""
         rewards = []
         timings = []
diff --git a/prompting/rewards/scoring.py b/prompting/rewards/scoring.py
index d8d590944..806aa4110 100644
--- a/prompting/rewards/scoring.py
+++ b/prompting/rewards/scoring.py
@@ -21,7 +21,7 @@ class TaskScorer(AsyncLoopRunner):
 
     is_running: bool = False
     thread: threading.Thread = None
-    interval: int = 10
+    interval: int = 0
     scoring_queue: list | None = None
     reward_events: list | None = None
 
@@ -76,7 +76,7 @@ async def run_step(self) -> RewardLoggingEvent:
 
         # and there we then calculate the reward
         reward_pipeline = TaskRegistry.get_task_reward(scoring_config.task)
-        reward_events = reward_pipeline.apply(
+        reward_events = await reward_pipeline.apply(
             response_event=scoring_config.response,
             challenge=scoring_config.task.query,
             reference=scoring_config.task.reference,
diff --git a/prompting/rewards/streaming.py b/prompting/rewards/streaming.py
index 0bf1696cb..defd2fc6e 100644
--- a/prompting/rewards/streaming.py
+++ b/prompting/rewards/streaming.py
@@ -17,8 +17,7 @@ def __init__(self, max_tokens_per_chunk: int, **kwargs):
         super().__init__()
         self.max_tokens_per_chunk = max_tokens_per_chunk
 
-    def reward(self, _: str, response_event: DendriteResponseEvent) -> BatchRewardOutput:
-        """Compute difference scores given a completion and reference pair."""
+    async def reward(self, reference: str, response_event: DendriteResponseEvent) -> BatchRewardOutput:
         """Compute difference scores given a completion and reference pair."""
         rewards = []
         timings = []
diff --git a/prompting/rewards/web_retrieval.py b/prompting/rewards/web_retrieval.py
index 21dcec8ff..2ed149ff8 100644
--- a/prompting/rewards/web_retrieval.py
+++ b/prompting/rewards/web_retrieval.py
@@ -1,8 +1,10 @@
 """Expected miner's response is a JSON object with the following keys: url, content, relevant for each website."""
 
+import asyncio
 import json
 import os
 from collections import defaultdict
+from shared.misc import async_lru_cache
 from urllib.parse import urlparse
 
 import numpy as np
@@ -54,17 +56,17 @@
     TOP_DOMAINS = set(top_domains_df["Domain"].str.lower().values)
 
     # Load past websites
-    if os.path.exists(PAST_WEBSITES_FILE):
+    if os.path.exists(PAST_WEBSITES_FILE) and os.path.getsize(PAST_WEBSITES_FILE) > 0:
         past_websites_df = pd.read_csv(PAST_WEBSITES_FILE)
         past_websites = defaultdict(list)
         # Group by uid and take only the last N_PAST_URLS entries
         for uid, group in past_websites_df.groupby("uid"):
             past_websites[uid] = group["domain"].tolist()[-N_PAST_URLS:]
     else:
-        logger.warning(f"Past websites file {PAST_WEBSITES_FILE} does not exist, creating new dictionary")
+        logger.warning(f"Past websites file {PAST_WEBSITES_FILE} does not exist or is empty, creating a new dictionary")
         past_websites = defaultdict(list)
 except Exception as e:
-    logger.exception(f"Failed to load domains data: {e}")
+    logger.error(f"Failed to load domains data: {e}")
     TOP_DOMAINS = set()
     past_websites = defaultdict(list)
 
@@ -83,13 +85,18 @@ class WebsiteResult(BaseModel):
 
 
 class WebRetrievalRewardModel(RelevanceRewardModel):
-    def _cosine_similarity(self, content1: str, content2: str) -> float:
+    def __hash__(self):
+        # lru_cache includes `self` in its cache key, so give the reward model a stable identity-based hash.
+        return id(self)
+
+    @async_lru_cache(maxsize=1000)
+    async def _cosine_similarity(self, content1: str, content2: str) -> float:
         """Calculate the cosine similarity between sentence embeddings of the reference and completions."""
         reference_emb_flatten = self.embedding_model.encode(content1, to_numpy=True).flatten()
         response_emb_flatten = self.embedding_model.encode(content2, to_numpy=True).flatten()
         return 1.0 - float(spatial.distance.cosine(reference_emb_flatten, response_emb_flatten))
 
-    def score_website_result(
+    async def score_website_result(
         self, dataset_entry: DDGDatasetEntry, response_url: str, response_content: str, response_relevant: str, uid: str
     ) -> float:
         if not response_url or not response_content or not response_relevant:
@@ -128,14 +135,10 @@ def score_website_result(
             if domain in TOP_DOMAINS:
                 # if the domain is in the top 100k, we allow 10 occurrences in the last 200 URLs before penalising
                 discount_factor *= 1.0 / (max(1, domain_count - 10))
-                logger.debug(f"Domain {domain} is in top 100k domains, not applying penalty")
             else:
                 # Count how many times this domain has been used by this miner
                 discount_factor *= 1.0 / max(1, domain_count)
-                if domain in past_websites[uid]:
-                    logger.debug(
-                        f"Already used domain {domain} for this UID, applying ( discount ) factor {discount_factor}"
-                    )
+
             _append_to_past_websites(uid, domain)
 
             # Content scraped from the URL provided in the completion.
@@ -158,9 +161,12 @@ def score_website_result(
             if response_relevant not in response_content:
                 return 0
 
-            return self._cosine_similarity(content1=dataset_entry.query, content2=response_relevant) * discount_factor
+            return (
+                await self._cosine_similarity(content1=dataset_entry.query, content2=response_relevant)
+                * discount_factor
+            )
 
-    def score_miner_response(
+    async def score_miner_response(
         self, dataset_entry: DDGDatasetEntry, completion: str, task: BaseTextTask | None = None, uid: str | None = None
     ) -> float:
         scores = []
@@ -170,8 +176,11 @@ def score_miner_response(
             # logger.warning("Miner returned multiple websites with the same URL")
             return 0
 
-        for website in miner_websites:
-            scores.append(self.score_website_result(dataset_entry, website.url, website.content, website.relevant, uid))
+        tasks = [
+            self.score_website_result(dataset_entry, website.url, website.content, website.relevant, uid)
+            for website in miner_websites
+        ]
+        scores = await asyncio.gather(*tasks)
 
         if scores:
             weights = np.arange(len(scores), 0, -1)
@@ -179,7 +188,7 @@ def score_miner_response(
         return 0
 
     # TODO: Change base class reference type to Reference pydantic model, in order to store additional data.
-    def reward(
+    async def reward(
         self, reference: str, response_event: DendriteResponseEvent, task: BaseTextTask | None = None, **kwargs
     ) -> BatchRewardOutput:
         """Score response website content and URL based on the similarity to the search term and reference content."""
@@ -194,7 +203,7 @@ def reward(
             )
 
         for completion, uid in zip(response_event.completions, response_event.uids):
-            rewards.append(self.score_miner_response(dataset_entry, completion, task=task, uid=uid))
+            rewards.append(await self.score_miner_response(dataset_entry, completion, task=task, uid=uid))
             timings.append(0)
 
         logger.debug(f"REWARDWEBRETRIEVAL: {rewards}")
@@ -217,6 +226,8 @@ def _parse_response(completion: str) -> tuple[str | None, ...]:
             if not isinstance(data, list) and isinstance(data, dict):
                 data = [data]
             for website in data:
+                if not isinstance(website, dict):
+                    continue
                 response_url = website.get("url")
                 response_content = website.get("content")
                 response_relevant = website.get("relevant")
diff --git a/prompting/tasks/inference.py b/prompting/tasks/inference.py
index 49a984541..1170ef2a3 100644
--- a/prompting/tasks/inference.py
+++ b/prompting/tasks/inference.py
@@ -47,7 +47,7 @@ class InferenceTask(BaseTextTask):
     reference: str | None = None
     system_prompt: str | None = None
     llm_model: ModelConfig | None = None
-    llm_model_id: ModelConfig | None = random.choice(ModelZoo.models_configs).llm_model_id
+    llm_model_id: str | None = random.choice(ModelZoo.models_configs).llm_model_id
     seed: int = Field(default_factory=lambda: random.randint(0, 1_000_000), allow_mutation=False)
     sampling_params: dict[str, float] = shared_settings.SAMPLING_PARAMS.copy()
     messages: list[dict] | None = None
diff --git a/prompting/tasks/qa.py b/prompting/tasks/qa.py
index 668da6187..d9f4285e4 100644
--- a/prompting/tasks/qa.py
+++ b/prompting/tasks/qa.py
@@ -1,5 +1,6 @@
 from typing import ClassVar
 
+from prompting.datasets.random_website import DDGDatasetEntry
 from prompting.rewards.relevance import RelevanceRewardModel
 from prompting.rewards.reward import BaseRewardConfig, BaseRewardModel
 from prompting.rewards.rouge import RougeRewardModel
@@ -80,12 +81,12 @@ class WebQuestionAnsweringTask(BaseTextTask):
     query: str | None = None
     reference: str | None = None
 
-    def make_query(self, dataset_entry: Context):
+    def make_query(self, dataset_entry: DDGDatasetEntry):
         query_prompt = QUERY_PROMPT_TEMPLATE.format(context=dataset_entry.website_content)
         self.query = self.generate_query(messages=[query_prompt])
         return self.query
 
-    async def make_reference(self, dataset_entry: Context):
+    async def make_reference(self, dataset_entry: DDGDatasetEntry):
         reference_prompt = REFERENCE_PROMPT_TEMPLATE.format(context=dataset_entry.website_content, question=self.query)
         self.reference = self.generate_reference(messages=[{"role": "user", "content": reference_prompt}])
         return self.reference
diff --git a/prompting/tasks/task_creation.py b/prompting/tasks/task_creation.py
index 3de8dcbf9..15cc03256 100644
--- a/prompting/tasks/task_creation.py
+++ b/prompting/tasks/task_creation.py
@@ -19,7 +19,7 @@
 class TaskLoop(AsyncLoopRunner):
     is_running: bool = False
     thread: threading.Thread = None
-    interval: int = 10
+    interval: int = 0
     task_queue: list | None = []
     scoring_queue: list | None = []
     model_config = ConfigDict(arbitrary_types_allowed=True)
@@ -31,10 +31,11 @@ async def start(self, task_queue, scoring_queue):
 
     async def run_step(self):
         if len(self.task_queue) > shared_settings.TASK_QUEUE_LENGTH_THRESHOLD:
+            await asyncio.sleep(10)
             return None
         if len(self.scoring_queue) > shared_settings.SCORING_QUEUE_LENGTH_THRESHOLD:
+            await asyncio.sleep(10)
             return None
-        await asyncio.sleep(0.1)
         try:
             task = None
             # Getting task and dataset
@@ -47,7 +48,6 @@ async def run_step(self):
                     logger.exception(ex)
                 await asyncio.sleep(0.1)
 
-            await asyncio.sleep(0.1)
             if len(miner_availabilities.get_available_miners(task=task, model=task.llm_model_id)) == 0:
                 logger.debug(
                     f"No available miners for Task: {task.__class__.__name__} and Model ID: {task.llm_model_id}. Skipping step."
@@ -69,7 +69,6 @@ async def run_step(self):
         except Exception as ex:
             logger.exception(ex)
             return None
-        await asyncio.sleep(0.01)
 
 
 task_loop = TaskLoop()
diff --git a/prompting/tasks/task_sending.py b/prompting/tasks/task_sending.py
index c7e13e918..c21bd204a 100644
--- a/prompting/tasks/task_sending.py
+++ b/prompting/tasks/task_sending.py
@@ -57,7 +57,7 @@ async def collect_responses(task: BaseTextTask) -> DendriteResponseEvent | None:
     if isinstance(task, WebRetrievalTask):
         body["target_results"] = task.target_results
     body["timeout"] = task.timeout
-    stream_results = await query_miners(uids, body)
+    stream_results = await query_miners(uids, body, timeout_seconds=task.timeout)
     # log_stream_results(stream_results)
 
     response_event = DendriteResponseEvent(
diff --git a/prompting/tasks/web_retrieval.py b/prompting/tasks/web_retrieval.py
index b23813cf6..f0cb10040 100644
--- a/prompting/tasks/web_retrieval.py
+++ b/prompting/tasks/web_retrieval.py
@@ -39,7 +39,7 @@ class WebRetrievalTask(BaseTextTask):
     augmentation_system_prompt: ClassVar[str] = ""
     query_system_prompt: ClassVar[Optional[str]] = QUERY_SYSTEM_PROMPT
     target_results: int = Field(default_factory=lambda: random.randint(1, 10))
-    timeout: int = Field(default_factory=lambda: random.randint(3, 20))
+    timeout: int = Field(default_factory=lambda: random.randint(5, 20))
 
     def make_query(self, dataset_entry: DDGDatasetEntry) -> str:
         self.query = self.generate_query(
diff --git a/pyproject.toml b/pyproject.toml
index 6e2ec2827..4f92dca97 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -145,10 +145,11 @@ isort = "^5.13.2"
 tiktoken = "^0.8.0"
 pillow = "^11.0.0"
 torch = { version = "2.5.1", optional = true }
-transformers = { version = ">=4.46.3", optional = true }
+# TODO: Switch to original repo when this PR to fix setup gets merged: https://github.com/casper-hansen/AutoAWQ/pull/715
+autoawq = { git = "https://github.com/jiqing-feng/AutoAWQ.git", rev = "ae782a99df2f72a2c28764452844cb2d65bd8ffc", optional = true }
+transformers = { version = "<=4.47.1", optional = true }
 torchvision = { version = ">=0.20.1", optional = true }
 accelerate = { version = ">=1.1.1", optional = true }
-autoawq = { version = "0.2.0", optional = true }
 angle-emb = { version = "0.4.3", optional = true }
 numpy = { version = ">=2.0.1", optional = true }
 rouge = { version = ">=1.0.1", optional = true }
diff --git a/scripts/install.sh b/scripts/install.sh
index 2904f5d3f..e5fe2c78f 100644
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -8,12 +8,6 @@ poetry config virtualenvs.in-project true
 
 # Install the project dependencies
 poetry install --extras "validator"
-
-# Build AutoAWQ==0.2.8 from source
-if [ -d AutoAWQ ]; then rm -rf AutoAWQ; fi
-git clone https://github.com/casper-hansen/AutoAWQ.git
-cd AutoAWQ && git checkout 16335d087dd4f9cdc8933dd7a5681e4bf88311b6 && poetry run pip install -e . && cd ..
-
 poetry run pip install flash-attn --no-build-isolation
 
 # Check if jq is installed and install it if not
diff --git a/scripts/systemd/api_start.sh b/scripts/systemd/api_start.sh
new file mode 100644
index 000000000..ea6df04f3
--- /dev/null
+++ b/scripts/systemd/api_start.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+# Create and launch API systemd service.
+set -e
+
+# Check if systemd is running.
+if [ ! -d /run/systemd/system ]; then
+    echo "Error: systemd does not appear to be running. Exiting."
+    exit 1
+fi
+
+# Adjust project dir as needed.
+WORKDIR=/root/prompting
+
+SERVICE_FILE="/etc/systemd/system/sn1api.service"
+
+# Create (or update) the systemd service file.
+sudo tee "${SERVICE_FILE}" > /dev/null <<EOF
+[Unit]
+Description=SN1 API Service
+After=network.target
+
+[Service]
+Type=simple
+# Set the working directory so that poetry finds the correct pyproject.toml
+WorkingDirectory=${WORKDIR}
+# Use Poetry to run the API script
+ExecStart=python3.10 -m poetry run uvicorn validator_api.api:app --host 0.0.0.0 --port 8005 --workers 8
+Restart=always
+User=root
+Environment=PYTHONUNBUFFERED=1
+
+[Install]
+WantedBy=multi-user.target
+EOF
+
+# Reload systemd configuration.
+sudo systemctl daemon-reload
+
+# Enable the service to start at boot.
+sudo systemctl enable sn1api.service
+
+# Restart the service if already running, or start it if not.
+sudo systemctl restart sn1api.service
+
+echo "SN1 API service (sn1api) has been started/restarted."
+echo "Attaching to API logs. Press Ctrl+C to exit."
+sudo journalctl -f -u sn1api.service
diff --git a/scripts/systemd/api_stop.sh b/scripts/systemd/api_stop.sh
new file mode 100644
index 000000000..f2a2cb2dc
--- /dev/null
+++ b/scripts/systemd/api_stop.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Stop and delete SN1 API systemd service so it does not start at boot.
+set -e
+
+SERVICE_FILE="/etc/systemd/system/sn1api.service"
+
+# Check if the service file exists.
+if [ ! -f "${SERVICE_FILE}" ]; then
+    echo "SN1 API service file not found. Nothing to do."
+    exit 0
+fi
+
+# Stop the service if it's running.
+sudo systemctl stop sn1api.service || true
+
+# Disable the service from starting at boot.
+sudo systemctl disable sn1api.service || true
+
+# Remove the service file.
+sudo rm -f "${SERVICE_FILE}"
+
+# Reload systemd configuration.
+sudo systemctl daemon-reload
+
+echo "SN1 API service has been stopped and removed."
diff --git a/shared/base.py b/shared/base.py
index 91ae84525..5f08d1272 100644
--- a/shared/base.py
+++ b/shared/base.py
@@ -18,10 +18,10 @@ def __hash__(self) -> int:
 
 
 class ChatEntry(DatasetEntry):
-    messages: list[dict]
+    messages: list[dict[str, str]]
     organic: bool
     source: str | None = None
-    query: str | None = None
+    query: dict[str, str] | None = None
 
     @model_validator(mode="after")
     def check_query(self) -> "ChatEntry":
diff --git a/shared/epistula.py b/shared/epistula.py
index fba14ece4..b2d43417b 100644
--- a/shared/epistula.py
+++ b/shared/epistula.py
@@ -121,20 +121,20 @@ async def query_miners(
     try:
         tasks = []
         for uid in uids:
-            tasks.append(
-                asyncio.create_task(
-                    make_openai_query(shared_settings.METAGRAPH, shared_settings.WALLET, timeout_seconds, body, uid)
+            # Bound each query with an explicit timeout so a single hung miner cannot stall the whole batch.
+            tasks.append(
+                asyncio.wait_for(
+                    asyncio.create_task(
+                        make_openai_query(shared_settings.METAGRAPH, shared_settings.WALLET, timeout_seconds, body, uid)
+                    ),
+                    timeout=timeout_seconds,
                 )
             )
-        responses = await asyncio.gather(*tasks, return_exceptions=True)
 
-        # Show exceptions from responses
-        exceptions = [resp for resp in responses if isinstance(resp, Exception)]
-        if exceptions:
-            for exc in exceptions:
-                logger.debug(f"Error in make_openai_query: {exc}")
+        responses = await asyncio.gather(*tasks, return_exceptions=True)
 
-        # 'responses' is a list of SynapseStreamResult objects
         results = []
         for response, uid in zip(responses, uids):
             if isinstance(response, Exception):
diff --git a/shared/logging.py b/shared/logging.py
index 8787d8526..b03aa0808 100644
--- a/shared/logging.py
+++ b/shared/logging.py
@@ -17,7 +17,7 @@
 from shared.dendrite import DendriteResponseEvent
 
 # TODO: Get rid of global variables.
-WANDB: Run
+WANDB: Run | None = None
 
 
 @dataclass
@@ -79,7 +79,7 @@ def should_reinit_wandb():
 
 def init_wandb(reinit=False, neuron: Literal["validator", "miner", "api"] = "validator", custom_tags: list = []):
     """Starts a new wandb run."""
-    global WANDB
     tags = [
         f"Wallet: {settings.shared_settings.WALLET.hotkey.ss58_address}",
         f"Version: {prompting.__version__}",
@@ -111,27 +111,25 @@ def init_wandb(reinit=False, neuron: Literal["validator", "miner", "api"] = "val
     wandb_run_name = f"{neuron}{settings.shared_settings.UID}-{datetime.now().strftime('%Y%m%d_%H%M%S')}"
 
     # Initialize the wandb run with the custom name.
-    WANDB = wandb.init(
+    wandb_obj = wandb.init(
         reinit=reinit,
         name=wandb_run_name,
         project=settings.shared_settings.WANDB_PROJECT_NAME,
         entity=settings.shared_settings.WANDB_ENTITY,
         mode="offline" if settings.shared_settings.WANDB_OFFLINE else "online",
-        dir=settings.shared_settings.SAVE_PATH,
         tags=tags,
         notes=settings.shared_settings.WANDB_NOTES,
         config=wandb_config,
     )
-    signature = settings.shared_settings.WALLET.hotkey.sign(WANDB.id.encode()).hex()
+    signature = settings.shared_settings.WALLET.hotkey.sign(wandb_obj.id.encode()).hex()
     wandb_config["SIGNATURE"] = signature
-    WANDB.config.update(wandb_config)
-    logger.success(f"Started a new wandb run <blue> {WANDB.name} </blue>")
+    wandb_obj.config.update(wandb_config)
+    logger.success(f"Started a new wandb run <blue> {wandb_obj.name} </blue>")
 
 
 def reinit_wandb():
     """Reinitializes wandb, rolling over the run."""
-    global WANDB
-    WANDB.finish()
+    wandb.finish()
     init_wandb(reinit=True)
 
 
diff --git a/shared/loop_runner.py b/shared/loop_runner.py
index f6f0bebc2..617b39c3a 100644
--- a/shared/loop_runner.py
+++ b/shared/loop_runner.py
@@ -57,25 +57,23 @@ def next_sync_point(self, current_time):
     async def wait_for_next_execution(self, last_run_time):
         """Wait until the next execution time, either synced or based on last run."""
         current_time = await self.get_time()
+        if last_run_time.tzinfo is None:
+            last_run_time = last_run_time.replace(tzinfo=current_time.tzinfo)
         if self.sync:
             next_run = self.next_sync_point(current_time)
         else:
             next_run = last_run_time + timedelta(seconds=self.interval)
 
         wait_time = (next_run - current_time).total_seconds()
-        if wait_time > 0:
-            # logger.debug(
-            #     f"{self.name}: Waiting for {wait_time:.2f} seconds until next {'sync point' if self.sync else 'execution'}"
-            # )
-            await asyncio.sleep(wait_time)
+        await asyncio.sleep(max(0.01, wait_time))
         return next_run
 
     async def run_loop(self):
         """Run the loop periodically, optionally synchronizing across all instances."""
 
         last_run_time = await self.get_time()
-        try:
-            while self.running:
+        while self.running:
+            try:
                 with profiler.measure(self.name):
                     next_run = await self.wait_for_next_execution(last_run_time)
                     try:
@@ -84,12 +82,12 @@ async def run_loop(self):
                     except Exception as ex:
                         logger.exception(f"Error in loop iteration: {ex}")
                     last_run_time = next_run
-        except asyncio.CancelledError:
-            logger.info("Loop was stopped.")
-        except Exception as e:
-            logger.error(f"Fatal error in loop: {e}")
-        finally:
-            self.running = False
+            except asyncio.CancelledError:
+                logger.info("Loop was stopped.")
+                self.running = False
+            except Exception as e:
+                logger.error(f"Fatal error in loop: {e}")
+        self.running = False
 
     async def start(self, name: str | None = None):
         """Start the loop."""
diff --git a/shared/misc.py b/shared/misc.py
index 621d470f7..013949a33 100644
--- a/shared/misc.py
+++ b/shared/misc.py
@@ -1,3 +1,5 @@
+import asyncio
+import functools
 import subprocess
 import time
 import traceback
@@ -10,6 +12,19 @@
 from shared.exceptions import BittensorError
 
 
+# Decorator factory: wraps a coroutine function, schedules each call as a future, and memoizes those futures with lru_cache.
+def async_lru_cache(*lru_cache_args, **lru_cache_kwargs):
+    def async_lru_cache_decorator(async_function):
+        @functools.lru_cache(*lru_cache_args, **lru_cache_kwargs)
+        def cached_async_function(*args, **kwargs):
+            coroutine = async_function(*args, **kwargs)
+            return asyncio.ensure_future(coroutine)
+
+        return cached_async_function
+
+    return async_lru_cache_decorator
+
+
 class classproperty:
     def __init__(self, func: Callable):
         self.fget = func
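
A usage sketch for the decorator (the `fetch` function is hypothetical): it memoizes the scheduled future per argument tuple, so concurrent callers await the same underlying call. Note that a failed future stays cached like any other result.

```python
import asyncio

from shared.misc import async_lru_cache


@async_lru_cache(maxsize=32)
async def fetch(block: int) -> int:
    print(f"computing for block {block}")  # printed once per distinct argument
    await asyncio.sleep(0.1)
    return block * 2


async def main():
    a, b = await asyncio.gather(fetch(5), fetch(5))  # both awaits share one cached future
    print(a, b)  # 20 20


asyncio.run(main())
```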
diff --git a/test.md b/shared/prompts/__init__.py
similarity index 100%
rename from test.md
rename to shared/prompts/__init__.py
diff --git a/shared/prompts/test_time_inference.py b/shared/prompts/test_time_inference.py
new file mode 100644
index 000000000..cca275cdf
--- /dev/null
+++ b/shared/prompts/test_time_inference.py
@@ -0,0 +1,125 @@
+import textwrap
+
+
+def intro_prompt() -> str:
+    """
+    Returns the intro prompt.
+    """
+
+    intro = textwrap.dedent(
+        """\
+    You are a world-class expert in analytical reasoning and problem-solving. Your task is to break down complex problems through rigorous step-by-step analysis, carefully examining each aspect before moving forward. For each reasoning step:
+
+    OUTPUT FORMAT:
+    Return a JSON object with these required fields:
+    {
+        "title": "Brief, descriptive title of current reasoning phase",
+        "content": "Detailed explanation of your analysis",
+        "next_action": "continue" or "final_answer"
+    }
+
+    REASONING PROCESS:
+    1. Initial Analysis
+    - Break down the problem into core components
+    - Identify key constraints and requirements
+    - List relevant domain knowledge and principles
+
+    2. Multiple Perspectives
+    - Examine the problem from at least 3 different angles
+    - Consider both conventional and unconventional approaches
+    - Identify potential biases in initial assumptions
+
+    3. Exploration & Validation
+    - Test preliminary conclusions against edge cases
+    - Apply domain-specific best practices
+    - Quantify confidence levels when possible (e.g., 90% certain)
+    - Document key uncertainties or limitations
+
+    4. Critical Review
+    - Actively seek counterarguments to your reasoning
+    - Identify potential failure modes
+    - Consider alternative interpretations of the data/requirements
+    - Validate assumptions against provided context
+
+    5. Synthesis & Refinement
+    - Combine insights from multiple approaches
+    - Strengthen weak points in the reasoning chain
+    - Address identified edge cases and limitations
+    - Build towards a comprehensive solution
+
+    REQUIREMENTS:
+    - Each step must focus on ONE specific aspect of reasoning
+    - Explicitly state confidence levels and uncertainty
+    - When evaluating options, use concrete criteria
+    - Include specific examples or scenarios when relevant
+    - Acknowledge limitations in your knowledge or capabilities
+    - Maintain logical consistency across steps
+    - Build on previous steps while avoiding redundancy
+
+    CRITICAL THINKING CHECKLIST:
+    ✓ Have I considered non-obvious interpretations?
+    ✓ Are my assumptions clearly stated and justified?
+    ✓ Have I identified potential failure modes?
+    ✓ Is my confidence level appropriate given the evidence?
+    ✓ Have I adequately addressed counterarguments?
+
+    Remember: Quality of reasoning is more important than speed. Take the necessary steps to build a solid analytical foundation before moving to conclusions.
+
+    Example:
+
+    User Query: How many piano tuners are in New York City?
+
+    {Expected Answer:
+    {
+        "title": "Estimating the Number of Piano Tuners in New York City",
+        "content": "To estimate the number of piano tuners in NYC, we need to break down the problem into core components. Key factors include the total population of NYC, the number of households with pianos, the average number of pianos per household, and the frequency of piano tuning. We should also consider the number of professional piano tuners and their workload.",
+        "next_action": "continue"
+    }}
+    """
+    ).strip()
+
+    return intro
+
+
+def system_acceptance_prompt() -> str:
+    """
+    Returns the system acceptance prompt.
+    """
+
+    system_acceptance = textwrap.dedent(
+        """\
+    I understand. I will now analyze the problem systematically, following the structured reasoning process while maintaining high standards of analytical rigor and self-criticism.
+    """
+    ).strip()
+
+    return system_acceptance
+
+
+def final_answer_prompt() -> str:
+    """
+    Returns the final answer prompt.
+    """
+
+    final_answer = textwrap.dedent(
+        """\
+        Review your previous reasoning steps and synthesize them into a final answer.
+        Your response should:
+
+        1. Clearly state your final conclusion.
+        2. Summarize the key reasoning and evidence from previous steps.
+        3. Address any remaining uncertainties or alternative perspectives.
+        4. Note any relevant caveats or limitations to your conclusion.
+
+        Ensure the response is concise, well-structured, and avoids unnecessary repetition.
+        Do not include explicit confidence levels or probabilities.
+
+        Format your response as valid JSON:
+        {
+            "title": "Final Answer",
+            "content": "Your synthesized conclusion and explanation here.",
+            "next_action": "final_answer"
+        }
+        """
+    ).strip()
+
+    return final_answer
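
For reference, these helpers are composed into the reasoning conversation the same way `generate_response` does later in this diff; a minimal sketch:

```python
from shared.prompts.test_time_inference import intro_prompt, system_acceptance_prompt


def build_initial_messages(original_messages: list[dict[str, str]]) -> list[dict[str, str]]:
    messages = [{"role": "system", "content": intro_prompt()}]
    messages += original_messages
    messages += [{"role": "assistant", "content": system_acceptance_prompt()}]
    return messages


msgs = build_initial_messages([{"role": "user", "content": "How many piano tuners are in New York City?"}])
print([m["role"] for m in msgs])  # ['system', 'user', 'assistant']
```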
diff --git a/shared/settings.py b/shared/settings.py
index e2cb11d3b..81aeb5253 100644
--- a/shared/settings.py
+++ b/shared/settings.py
@@ -125,7 +125,7 @@ class SharedSettings(BaseSettings):
     )
     TEST_MINER_IDS: list[int] = Field([], env="TEST_MINER_IDS")
     SUBTENSOR_NETWORK: Optional[str] = Field(None, env="SUBTENSOR_NETWORK")
-    MAX_ALLOWED_VRAM_GB: int = Field(62, env="MAX_ALLOWED_VRAM_GB")
+    MAX_ALLOWED_VRAM_GB: float = Field(62, env="MAX_ALLOWED_VRAM_GB")
     LLM_MAX_MODEL_LEN: int = Field(4096, env="LLM_MAX_MODEL_LEN")
     PROXY_URL: Optional[str] = Field(None, env="PROXY_URL")
     LLM_MODEL: str = Field("hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4", env="LLM_MODEL")
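
Widening `MAX_ALLOWED_VRAM_GB` to `float` matters because environment values arrive as strings and an `int` field rejects fractional limits. An illustrative pydantic-only sketch (not the real settings class; assumes pydantic v2 error types):

```python
from pydantic import BaseModel, Field, ValidationError


class VramAsInt(BaseModel):
    MAX_ALLOWED_VRAM_GB: int = Field(62)


class VramAsFloat(BaseModel):
    MAX_ALLOWED_VRAM_GB: float = Field(62)


try:
    VramAsInt(MAX_ALLOWED_VRAM_GB="61.5")  # env-style string value
except ValidationError as e:
    print("int field:", e.errors()[0]["type"])

print("float field:", VramAsFloat(MAX_ALLOWED_VRAM_GB="61.5").MAX_ALLOWED_VRAM_GB)  # 61.5
```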
diff --git a/tests/prompting/rewards/test_multi_choice.py b/tests/prompting/rewards/test_multi_choice.py
index 6c0b7ce33..14b173131 100644
--- a/tests/prompting/rewards/test_multi_choice.py
+++ b/tests/prompting/rewards/test_multi_choice.py
@@ -33,7 +33,7 @@ class DendriteResponseEvent:
 
 
 @pytest.mark.parametrize("response, reference, expected", test_cases)
-def test_logit_scoring(response, reference, expected):
+async def test_logit_scoring(response, reference, expected):
     model = MultiChoiceRewardModel(json_penalty=JSON_PENALTY)
-    result = model.reward(reference, DendriteResponseEvent(completions=[response])).rewards[0]
+    result = (await model.reward(reference, DendriteResponseEvent(completions=[response]))).rewards[0]
     assert result == pytest.approx(expected), f"Failed for input: {response}, reference: {reference}"
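
Since these tests are now coroutines they need pytest-asyncio (either the marker shown below or `asyncio_mode = "auto"` in the pytest config, assuming that plugin is in use); note also that the awaited call must be parenthesized before attribute access:

```python
import pytest


@pytest.mark.asyncio  # not needed if pytest-asyncio runs with asyncio_mode = "auto"
async def test_awaiting_before_attribute_access():
    async def reward():
        class Output:
            rewards = [1.0]
        return Output()

    result = (await reward()).rewards[0]  # await first, then index into .rewards
    assert result == pytest.approx(1.0)
```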
diff --git a/tests/prompting/rewards/test_web_retrieval.py b/tests/prompting/rewards/test_web_retrieval.py
index 6270cb321..cfc1f669d 100644
--- a/tests/prompting/rewards/test_web_retrieval.py
+++ b/tests/prompting/rewards/test_web_retrieval.py
@@ -38,7 +38,7 @@ def test_parse_response(completion, expected_url, expected_content, expected_rel
         assert response[0].relevant == expected_relevant
 
 
-def test_cosine_similarity_identical_embeddings():
+async def test_cosine_similarity_identical_embeddings():
     # Mock identical embeddings.
     mock_embedding_model = MagicMock()
     mock_embedding_model.encode.return_value = np.array([1, 2, 3])
@@ -46,11 +46,11 @@ def test_cosine_similarity_identical_embeddings():
     model = WebRetrievalRewardModel()
     model.embedding_model = mock_embedding_model
 
-    similarity = model._cosine_similarity("content1", "content1")
+    similarity = await model._cosine_similarity("content1", "content1")
     assert similarity == pytest.approx(1.0)
 
 
-def test_cosine_similarity_orthogonal_embeddings():
+async def test_cosine_similarity_orthogonal_embeddings():
     # Mock orthogonal embeddings.
     def encode_mock(text, to_numpy):
         return np.array([1, 0]) if text == "content1" else np.array([0, 1])
@@ -61,7 +61,7 @@ def encode_mock(text, to_numpy):
     model = WebRetrievalRewardModel()
     model.embedding_model = mock_embedding_model
 
-    similarity = model._cosine_similarity("content1", "content2")
+    similarity = await model._cosine_similarity("content1", "content2")
     assert similarity == pytest.approx(0.0)
 
 
diff --git a/tests/prompting/shared/test_get_prompt.py b/tests/prompting/shared/test_get_prompt.py
new file mode 100644
index 000000000..9d089eb8c
--- /dev/null
+++ b/tests/prompting/shared/test_get_prompt.py
@@ -0,0 +1,29 @@
+from shared.prompts.test_time_inference import final_answer_prompt, intro_prompt, system_acceptance_prompt
+
+
+def test_intro_prompt():
+    """Test that intro_prompt returns the correct prompt."""
+    prompt = intro_prompt()
+    assert isinstance(prompt, str)
+    assert "You are a world-class expert in analytical reasoning" in prompt
+    assert "OUTPUT FORMAT:" in prompt
+    assert "REASONING PROCESS:" in prompt
+    assert "REQUIREMENTS:" in prompt
+    assert "CRITICAL THINKING CHECKLIST:" in prompt
+
+
+def test_system_acceptance_prompt():
+    """Test that system_acceptance_prompt returns the correct prompt."""
+    prompt = system_acceptance_prompt()
+    assert isinstance(prompt, str)
+    assert "I understand. I will now analyze the problem systematically" in prompt
+
+
+def test_final_answer_prompt():
+    """Test that final_answer_prompt returns the correct prompt."""
+    prompt = final_answer_prompt()
+    assert isinstance(prompt, str)
+    assert "Review your previous reasoning steps" in prompt
+    assert "Format your response as valid JSON" in prompt
+    assert '"title":' in prompt
+    assert '"content":' in prompt
diff --git a/validator_api/api.py b/validator_api/api.py
index 89eb5eda3..d31da3a0a 100644
--- a/validator_api/api.py
+++ b/validator_api/api.py
@@ -3,6 +3,7 @@
 
 import uvicorn
 from fastapi import FastAPI
+from loguru import logger
 
 from shared import settings
 
@@ -17,29 +18,62 @@
 
 @contextlib.asynccontextmanager
 async def lifespan(app: FastAPI):
+    if shared_settings.SCORE_ORGANICS:
+        scoring_task = asyncio.create_task(scoring_queue.scoring_queue.start())
     miner_task = asyncio.create_task(update_miner_availabilities_for_api.start())
-    scoring_task = asyncio.create_task(scoring_queue.scoring_queue.start())
     yield
     miner_task.cancel()
-    scoring_task.cancel()
+    if shared_settings.SCORE_ORGANICS:
+        scoring_task.cancel()
     try:
         await miner_task
-        await scoring_task
+        if shared_settings.SCORE_ORGANICS:
+            await scoring_task
     except asyncio.CancelledError:
         pass
 
 
-app = FastAPI(lifespan=lifespan)
+app = FastAPI(
+    title="Validator API",
+    description="API for interacting with the validator network and miners",
+    version="1.0.0",
+    docs_url="/docs",
+    redoc_url="/redoc",
+    openapi_url="/openapi.json",
+    openapi_tags=[
+        {
+            "name": "GPT Endpoints",
+            "description": "Endpoints for chat completions, web retrieval, and test time inference",
+        },
+        {
+            "name": "API Management",
+            "description": "Endpoints for API key management and validation",
+        },
+    ],
+    lifespan=lifespan,
+)
 app.include_router(gpt_router, tags=["GPT Endpoints"])
 app.include_router(api_management_router, tags=["API Management"])
 
 
-@app.get("/health")
+@app.get(
+    "/health",
+    summary="Health check endpoint",
+    description="Simple endpoint to check if the API is running",
+    tags=["Health"],
+    response_description="Status of the API",
+)
 async def health():
+    """
+    Health check endpoint to verify the API is operational.
+
+    Returns a simple JSON object with status "ok" if the API is running.
+    """
     return {"status": "ok"}
 
 
 async def main():
+    logger.info(f"Starting API with {shared_settings.WORKERS} worker(s).")
     config = uvicorn.Config(
         "validator_api.api:app",
         host=shared_settings.API_HOST,
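
A quick smoke test against the documented `/health` route (host and port are assumptions; substitute your `API_HOST`/`API_PORT`):

```python
import asyncio

import httpx


async def check_health(base_url: str = "http://localhost:8000") -> None:
    async with httpx.AsyncClient(base_url=base_url) as client:
        resp = await client.get("/health")
        resp.raise_for_status()
        print(resp.json())  # expected: {"status": "ok"}


asyncio.run(check_health())
```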
diff --git a/validator_api/api_management.py b/validator_api/api_management.py
index f3af427f0..2604e32f7 100644
--- a/validator_api/api_management.py
+++ b/validator_api/api_management.py
@@ -39,11 +39,30 @@ def validate_admin_key(admin_key: str = Header(...)):
         raise HTTPException(status_code=403, detail="Invalid admin key")
 
 
-# Dependency to validate API keys
-def validate_api_key(api_key: str = Header(...)):
-    if api_key not in _keys:
-        raise HTTPException(status_code=403, detail="Invalid API key")
-    return _keys[api_key]
+def validate_api_key(
+    api_key: str | None = Header(None),
+    authorization: str | None = Header(None),
+):
+    """
+    1) If 'api_key' header exists (the old style), validate it.
+    2) Else, if 'Authorization' header exists and starts with Bearer, extract token and validate.
+    3) Otherwise, raise a 403.
+    """
+
+    if api_key:
+        if api_key not in _keys:
+            raise HTTPException(status_code=403, detail="Invalid API key")
+        return _keys[api_key]
+
+    if authorization:
+        scheme, _, token = authorization.partition(" ")
+        if scheme.lower() != "bearer":
+            raise HTTPException(status_code=403, detail="Invalid authorization scheme")
+        if token not in _keys:
+            raise HTTPException(status_code=403, detail="Invalid API key")
+        return _keys[token]
+
+    raise HTTPException(status_code=403, detail="Missing API key")
 
 
 @router.post("/create-api-key/")
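
Both header styles are now accepted: FastAPI maps the `api_key` parameter to an `api-key` header, while OpenAI-style clients send `Authorization: Bearer`. A client sketch with placeholder URL and key:

```python
import httpx

BASE_URL = "http://localhost:8000"  # placeholder deployment
API_KEY = "your-api-key"            # placeholder key

payload = {"search_query": "latest advancements in quantum computing"}

# Legacy style: custom "api-key" header.
r1 = httpx.post(f"{BASE_URL}/web_retrieval", json=payload, headers={"api-key": API_KEY}, timeout=60)

# New style: standard Bearer token.
r2 = httpx.post(
    f"{BASE_URL}/web_retrieval",
    json=payload,
    headers={"Authorization": f"Bearer {API_KEY}"},
    timeout=60,
)
print(r1.status_code, r2.status_code)
```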
diff --git a/validator_api/chat_completion.py b/validator_api/chat_completion.py
index 774dd2b6b..368f8ba6d 100644
--- a/validator_api/chat_completion.py
+++ b/validator_api/chat_completion.py
@@ -98,6 +98,7 @@ async def stream_from_first_response(
 ) -> AsyncGenerator[str, None]:
     first_valid_response = None
     response_start_time = time.monotonic()
+
     try:
         # Keep looping until we find a valid response or run out of tasks
         while responses and first_valid_response is None:
@@ -245,11 +246,20 @@ async def chat_completion(
     collected_chunks_list = [[] for _ in uids]
     timings_list = [[] for _ in uids]
 
-    if not body.get("sampling_parameters"):
-        raise HTTPException(status_code=422, detail="Sampling parameters are required")
     timeout_seconds = max(
-        30, max(0, math.floor(math.log2(body["sampling_parameters"].get("max_new_tokens", 256) / 256))) * 10 + 30
+        30,
+        max(
+            0,
+            math.floor(
+                math.log2(
+                    body.get("sampling_parameters", shared_settings.SAMPLING_PARAMS).get("max_new_tokens", 256) / 256
+                )
+            ),
+        )
+        * 10
+        + 30,
     )
+
     if STREAM:
         # Create tasks for all miners
         response_tasks = [
@@ -297,7 +307,7 @@ async def chat_completion(
             raise HTTPException(status_code=502, detail="No valid response received")
 
         asyncio.create_task(
-            collect_remainin_nonstream_responses(
+            collect_remaining_nonstream_responses(
                 pending=pending,
                 collected_responses=collected_responses,
                 body=body,
@@ -308,7 +318,7 @@ async def chat_completion(
         return first_valid_response[0]  # Return only the response object, not the chunks
 
 
-async def collect_remainin_nonstream_responses(
+async def collect_remaining_nonstream_responses(
     pending: set[asyncio.Task],
     collected_responses: list,
     body: dict,
@@ -316,6 +326,7 @@ async def collect_remainin_nonstream_responses(
     timings_list: list,
 ):
     """Wait for all pending miner tasks to complete and append their responses to the scoring queue."""
+
     try:
         # Wait for all remaining tasks; allow exceptions to be returned.
         remaining_responses = await asyncio.gather(*pending, return_exceptions=True)
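
The timeout heuristic above works out to a 30 s floor plus 10 s per doubling of `max_new_tokens` over 256; a standalone check:

```python
import math


def stream_timeout(max_new_tokens: int) -> int:
    return max(30, max(0, math.floor(math.log2(max_new_tokens / 256))) * 10 + 30)


for tokens in (128, 256, 512, 1024, 4096):
    print(tokens, stream_timeout(tokens))  # 30, 30, 40, 50, 70
```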
diff --git a/validator_api/gpt_endpoints.py b/validator_api/gpt_endpoints.py
index 9dbdffb51..bc0d3e262 100644
--- a/validator_api/gpt_endpoints.py
+++ b/validator_api/gpt_endpoints.py
@@ -5,7 +5,7 @@
 import uuid
 
 import numpy as np
-from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi import APIRouter, Depends, HTTPException, status
 from loguru import logger
 from openai.types.chat.chat_completion_chunk import ChatCompletionChunk, Choice, ChoiceDelta
 from starlette.responses import StreamingResponse
@@ -18,6 +18,13 @@
 from validator_api.api_management import validate_api_key
 from validator_api.chat_completion import chat_completion
 from validator_api.mixture_of_miners import mixture_of_miners
+from validator_api.serializers import (
+    CompletionsRequest,
+    TestTimeInferenceRequest,
+    WebRetrievalRequest,
+    WebRetrievalResponse,
+    WebSearchResult,
+)
 from validator_api.test_time_inference import generate_response
 from validator_api.utils import filter_available_uids
 
@@ -25,15 +32,68 @@
 N_MINERS = 5
 
 
-@router.post("/v1/chat/completions")
-async def completions(request: Request, api_key: str = Depends(validate_api_key)):
-    """Main endpoint that handles both regular and mixture of miners chat completion."""
+@router.post(
+    "/v1/chat/completions",
+    summary="Chat completions endpoint",
+    description="Main endpoint that handles regular, multi-step reasoning, test-time inference, and mixture-of-miners chat completions.",
+    response_description="Streaming response with generated text",
+    status_code=status.HTTP_200_OK,
+    responses={
+        status.HTTP_200_OK: {
+            "description": "Successful response with streaming text",
+            "content": {"text/event-stream": {}},
+        },
+        status.HTTP_500_INTERNAL_SERVER_ERROR: {"description": "Internal server error or no available miners"},
+    },
+)
+async def completions(request: CompletionsRequest, api_key: str = Depends(validate_api_key)):
+    """
+    Chat completions endpoint that supports different inference modes.
+
+    This endpoint processes chat messages and returns generated completions using
+    different inference strategies based on the request parameters.
+
+    ## Inference Modes:
+    - Regular chat completion
+    - Multi Step Reasoning
+    - Test time inference
+    - Mixture of miners
+
+    ## Request Parameters:
+    - **uids** (List[int], optional): Specific miner UIDs to query. If not provided, miners will be selected automatically.
+    - **messages** (List[dict]): List of message objects with 'role' and 'content' keys. Required.
+    - **seed** (int, optional): Random seed for reproducible results.
+    - **task** (str, optional): Task identifier to filter available miners.
+    - **model** (str, optional): Model identifier to filter available miners.
+    - **test_time_inference** (bool, default=False): Enable step-by-step reasoning mode.
+    - **mixture** (bool, default=False): Enable mixture of miners mode.
+    - **sampling_parameters** (dict, optional): Parameters to control text generation.
+    - **inference_mode** (str, optional): Named mode such as "Reasoning-Fast", "Chain-of-Thought", or "Mixture-of-Agents".
+    - **json_format** (bool, default=False): Stream each reasoning step as a JSON object instead of markdown.
+
+    The endpoint selects miners based on the provided UIDs or filters available miners
+    based on task and model requirements.
+
+    Example request:
+    ```json
+    {
+      "messages": [
+        {"role": "user", "content": "Tell me about neural networks"}
+      ],
+      "model": "gpt-4",
+      "seed": 42
+    }
+    ```
+    """
     try:
-        body = await request.json()
+        body = request.model_dump()
+        if body.get("inference_mode") == "Reasoning-Fast":
+            body["task"] = "MultiStepReasoningTask"
+        if body.get("model") == "Default":
+            # "Default" maps to whatever model we designate as the standard; this could alternatively be set to None.
+            body["model"] = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
         body["seed"] = int(body.get("seed") or random.randint(0, 1000000))
         if body.get("uids"):
             try:
-                uids = [int(uid) for uid in body.get("uids")]
+                uids = list(map(int, body.get("uids")))
             except Exception:
                 logger.error(f"Error in uids: {body.get('uids')}")
         else:
@@ -43,10 +103,15 @@ async def completions(request: Request, api_key: str = Depends(validate_api_key)
         if not uids:
             raise HTTPException(status_code=500, detail="No available miners")
 
-        # Choose between regular completion and mixture of miners.
-        if body.get("test_time_inference", False):
-            return await test_time_inference(body["messages"], body.get("model", None), target_uids=uids)
-        if body.get("mixture", False):
+        if body.get("test_time_inference", False) or body.get("inference_mode", None) == "Chain-of-Thought":
+            test_time_request = TestTimeInferenceRequest(
+                messages=request.messages,
+                model=request.model,
+                uids=uids if uids else None,
+                json_format=request.json_format,
+            )
+            return await test_time_inference(test_time_request)
+        elif body.get("mixture", False) or body.get("inference_mode", None) == "Mixture-of-Agents":
             return await mixture_of_miners(body, uids=uids)
         else:
             return await chat_completion(body, uids=uids)
@@ -56,24 +121,66 @@ async def completions(request: Request, api_key: str = Depends(validate_api_key)
         return StreamingResponse(content="Internal Server Error", status_code=500)
 
 
-@router.post("/web_retrieval")
+@router.post(
+    "/web_retrieval",
+    response_model=WebRetrievalResponse,
+    summary="Web retrieval endpoint",
+    description="Retrieves information from the web based on a search query using multiple miners.",
+    response_description="List of unique web search results",
+    status_code=status.HTTP_200_OK,
+    responses={
+        status.HTTP_200_OK: {
+            "description": "Successful response with web search results",
+            "model": WebRetrievalResponse,
+        },
+        status.HTTP_500_INTERNAL_SERVER_ERROR: {
+            "description": "Internal server error, no available miners, or no successful miner responses"
+        },
+    },
+)
 async def web_retrieval(
-    search_query: str,
-    n_miners: int = 10,
-    n_results: int = 5,
-    max_response_time: int = 10,
+    request: WebRetrievalRequest,
     api_key: str = Depends(validate_api_key),
-    target_uids: list[str] = None,
 ):
-    if target_uids:
-        uids = target_uids
+    """
+    Web retrieval endpoint that queries multiple miners to search the web.
+
+    This endpoint distributes a search query to multiple miners, which perform web searches
+    and return relevant results. The results are deduplicated based on URLs before being returned.
+
+    ## Request Parameters:
+    - **search_query** (str): The query to search for on the web. Required.
+    - **n_miners** (int, default=3): Number of miners to query for results.
+    - **n_results** (int, default=1): Maximum number of results to return in the response.
+    - **max_response_time** (int, default=10): Maximum time to wait for responses in seconds.
+    - **uids** (List[int], optional): Optional list of specific miner UIDs to query.
+
+    ## Response:
+    Returns a list of unique web search results, each containing:
+    - **url** (str): The URL of the web page
+    - **content** (str, optional): The relevant content from the page
+    - **relevant** (str, optional): Information about why this result is relevant
+
+    Example request:
+    ```json
+    {
+      "search_query": "latest advancements in quantum computing",
+      "n_miners": 15,
+      "n_results": 10
+    }
+    ```
+    """
+    if request.uids:
+        uids = request.uids
         try:
-            uids = [int(uid) for uid in uids]
+            uids = list(map(int, uids))
         except Exception:
             logger.error(f"Error in uids: {uids}")
     else:
-        uids = filter_available_uids(task="WebRetrievalTask", test=shared_settings.API_TEST_MODE, n_miners=n_miners)
-        uids = random.sample(uids, min(len(uids), n_miners))
+        uids = filter_available_uids(
+            task="WebRetrievalTask", test=shared_settings.API_TEST_MODE, n_miners=request.n_miners
+        )
+        uids = random.sample(uids, min(len(uids), request.n_miners))
 
     if len(uids) == 0:
         raise HTTPException(status_code=500, detail="No available miners")
@@ -82,14 +189,15 @@ async def web_retrieval(
         "seed": random.randint(0, 1_000_000),
         "sampling_parameters": shared_settings.SAMPLING_PARAMS,
         "task": "WebRetrievalTask",
-        "target_results": n_results,
-        "timeout": max_response_time,
+        "target_results": request.n_results,
+        "timeout": request.max_response_time,
         "messages": [
-            {"role": "user", "content": search_query},
+            {"role": "user", "content": request.search_query},
         ],
     }
 
-    timeout_seconds = 30
+    timeout_seconds = 30  # TODO: We need to scale down this timeout
+    logger.debug(f"🔍 Querying miners: {uids} for web retrieval")
     stream_results = await query_miners(uids, body, timeout_seconds)
     results = [
         "".join(res.accumulated_chunks)
@@ -109,13 +217,56 @@ async def web_retrieval(
 
     collected_chunks_list = [res.accumulated_chunks if res and res.accumulated_chunks else [] for res in stream_results]
     asyncio.create_task(scoring_queue.scoring_queue.append_response(uids=uids, body=body, chunks=collected_chunks_list))
-    return loaded_results
+    loaded_results = [json.loads(r) if isinstance(r, str) else r for r in loaded_results]
+    flat_results = [item for sublist in loaded_results for item in sublist]
+    unique_results = []
+    seen_urls = set()
+
+    for result in flat_results:
+        if isinstance(result, dict) and "url" in result:
+            if result["url"] not in seen_urls:
+                seen_urls.add(result["url"])
+                # Convert dict to WebSearchResult
+                unique_results.append(WebSearchResult(**result))
+
+    return WebRetrievalResponse(results=unique_results)
+
 
+async def test_time_inference(request: TestTimeInferenceRequest):
+    """
+    Test-time inference handler (invoked via /v1/chat/completions) that provides step-by-step reasoning.
+
+    This endpoint streams the thinking process and reasoning steps during inference,
+    allowing visibility into how the model arrives at its conclusions. Each step of
+    the reasoning process is streamed as it becomes available.
+
+    ## Request Parameters:
+    - **messages** (List[dict]): List of message objects with 'role' and 'content' keys. Required.
+    - **model** (str, optional): Optional model identifier to use for inference.
+    - **uids** (List[int], optional): Optional list of specific miner UIDs to query.
+
+    ## Response:
+    The response is streamed as server-sent events (SSE) with each step of reasoning.
+    Each event contains:
+    - A step title/heading
+    - The content of the reasoning step
+    - Timing information (debug only)
+
+    Example request:
+    ```json
+    {
+      "messages": [
+        {"role": "user", "content": "Solve the equation: 3x + 5 = 14"}
+      ],
+      "model": "gpt-4"
+    }
+    ```
+    """
 
-@router.post("/test_time_inference")
-async def test_time_inference(messages: list[dict], model: str = None, target_uids: list[str] = None):
-    async def create_response_stream(messages):
-        async for steps, total_thinking_time in generate_response(messages, model=model, target_uids=target_uids):
+    async def create_response_stream(request):
+        async for steps, total_thinking_time in generate_response(
+            request.messages, model=request.model, uids=request.uids
+        ):
             if total_thinking_time is not None:
                 logger.debug(f"**Total thinking time: {total_thinking_time:.2f} seconds**")
             yield steps, total_thinking_time
@@ -124,16 +275,18 @@ async def create_response_stream(messages):
     async def stream_steps():
         try:
             i = 0
-            async for steps, thinking_time in create_response_stream(messages):
+            async for steps, thinking_time in create_response_stream(request):
                 i += 1
+                if request.json_format:
+                    choice = Choice(index=i, delta=ChoiceDelta(content=json.dumps(steps[-1])))
+                else:
+                    choice = Choice(index=i, delta=ChoiceDelta(content=f"## {steps[-1][0]}\n\n{steps[-1][1]}" + "\n\n"))
                 yield "data: " + ChatCompletionChunk(
                     id=str(uuid.uuid4()),
                     created=int(time.time()),
-                    model=model or "None",
+                    model=request.model or "None",
                     object="chat.completion.chunk",
-                    choices=[
-                        Choice(index=i, delta=ChoiceDelta(content=f"## {steps[-1][0]}\n\n{steps[-1][1]}" + "\n\n"))
-                    ],
+                    choices=[choice],
                 ).model_dump_json() + "\n\n"
         except Exception as e:
             logger.exception(f"Error during streaming: {e}")
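
A consumer-side sketch for the streamed reasoning steps (URL and key are placeholders): each SSE line is a `data: `-prefixed `ChatCompletionChunk`, and with `json_format=True` the delta content carries the raw step as JSON:

```python
import json

import httpx


def stream_reasoning(base_url: str, api_key: str) -> None:
    body = {
        "messages": [{"role": "user", "content": "Solve the equation: 3x + 5 = 14"}],
        "test_time_inference": True,
        "json_format": True,
    }
    with httpx.stream(
        "POST",
        f"{base_url}/v1/chat/completions",
        json=body,
        headers={"Authorization": f"Bearer {api_key}"},
        timeout=300,
    ) as resp:
        for line in resp.iter_lines():
            if line.startswith("data: "):
                chunk = json.loads(line[len("data: "):])
                print(chunk["choices"][0]["delta"]["content"])


stream_reasoning("http://localhost:8000", "your-api-key")  # placeholders
```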
diff --git a/validator_api/mixture_of_miners.py b/validator_api/mixture_of_miners.py
index b5742e185..e26c018d3 100644
--- a/validator_api/mixture_of_miners.py
+++ b/validator_api/mixture_of_miners.py
@@ -7,6 +7,7 @@
 from fastapi.responses import StreamingResponse
 from loguru import logger
 
+from shared.settings import shared_settings
 from shared.uids import get_uids
 from validator_api.chat_completion import chat_completion, get_response_from_miner
 
@@ -45,6 +46,7 @@ async def mixture_of_miners(body: dict[str, any], uids: list[int]) -> tuple | St
     if len(uids) == 0:
         raise HTTPException(status_code=503, detail="No available miners found")
 
+    body["sampling_parameters"] = body.get("sampling_parameters", shared_settings.SAMPLING_PARAMS)
     # Concurrently collect responses from all miners.
     timeout_seconds = max(
         30, max(0, math.floor(math.log2(body["sampling_parameters"].get("max_new_tokens", 256) / 256))) * 10 + 30
diff --git a/validator_api/scoring_queue.py b/validator_api/scoring_queue.py
index 781121319..92ad5b704 100644
--- a/validator_api/scoring_queue.py
+++ b/validator_api/scoring_queue.py
@@ -42,6 +42,7 @@ async def wait_for_next_execution(self, last_run_time) -> datetime.datetime:
 
     async def run_step(self):
         """Perform organic scoring: pop queued payload, forward to the validator API."""
+        logger.debug("Running scoring step")
         async with self._scoring_lock:
             if not self._scoring_queue:
                 return
@@ -57,6 +58,11 @@ async def run_step(self):
         except Exception as e:
             logger.exception(f"Could not find available validator scoring endpoint: {e}")
         try:
+            if hasattr(payload, "to_dict"):
+                payload = payload.to_dict()
+            elif isinstance(payload, BaseModel):
+                payload = payload.model_dump()
+
             timeout = httpx.Timeout(timeout=120.0, connect=60.0, read=30.0, write=30.0, pool=5.0)
             # Add required headers for signature verification
 
@@ -74,6 +80,8 @@ async def run_step(self):
                     # Raise an exception so that the retry logic in the except block handles it.
                     raise Exception(f"Non-200 response: {response.status_code} for uids {uids}")
                 logger.debug(f"Forwarding response completed with status {response.status_code}")
+        except httpx.ConnectError as e:
+            logger.warning(f"Couldn't connect to validator {url} for Scoring {uids}. Exception: {e}")
         except Exception as e:
             if scoring_payload.retries < self.max_scoring_retries:
                 scoring_payload.retries += 1
@@ -93,7 +101,7 @@ async def append_response(
             # logger.debug(f"Skipping forwarding for non-inference/web retrieval task: {body.get('task')}")
             return
 
-        uids = [int(u) for u in uids]
+        uids = list(map(int, uids))
         chunk_dict = {str(u): c for u, c in zip(uids, chunks)}
         if timings:
             timing_dict = {str(u): t for u, t in zip(uids, timings)}
diff --git a/validator_api/serializers.py b/validator_api/serializers.py
new file mode 100644
index 000000000..74e2706cb
--- /dev/null
+++ b/validator_api/serializers.py
@@ -0,0 +1,125 @@
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, Field
+
+
+class CompletionsRequest(BaseModel):
+    """Request model for the /v1/chat/completions endpoint."""
+
+    uids: Optional[List[int]] = Field(
+        default=None,
+        description="List of specific miner UIDs to query. If not provided, miners will be selected automatically.",
+        example=[1, 2, 3],
+    )
+    messages: List[Dict[str, str]] = Field(
+        ...,
+        description="List of message objects with 'role' and 'content' keys. Roles can be 'system', 'user', or 'assistant'.",
+        example=[{"role": "user", "content": "Tell me about neural networks"}],
+    )
+    seed: Optional[int] = Field(
+        default=None,
+        description="Random seed for reproducible results. If not provided, a random seed will be generated.",
+        example=42,
+    )
+    task: Optional[str] = Field(
+        default="InferenceTask", description="Task identifier to choose the inference type.", example="InferenceTask"
+    )
+    model: Optional[str] = Field(
+        default=None,
+        description="Model identifier to filter available miners.",
+        example="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4",
+    )
+    test_time_inference: bool = Field(
+        default=False, description="Enable step-by-step reasoning mode that shows the model's thinking process."
+    )
+    mixture: bool = Field(
+        default=False, description="Enable mixture of miners mode that combines responses from multiple miners."
+    )
+    sampling_parameters: Optional[Dict[str, Any]] = Field(
+        default={
+            "temperature": 0.7,
+            "top_p": 0.95,
+            "top_k": 50,
+            "max_new_tokens": 1024,
+            "do_sample": True,
+        },
+        description="Parameters to control text generation, such as temperature, top_p, etc.",
+        example={
+            "temperature": 0.7,
+            "top_p": 0.95,
+            "top_k": 50,
+            "max_new_tokens": 512,
+            "do_sample": True,
+        },
+    )
+    inference_mode: Optional[str] = Field(
+        default=None,
+        description="Inference mode to use for the task.",
+        example="Reasoning-Fast",
+    )
+    json_format: bool = Field(default=False, description="Enable JSON format for the response.", example=True)
+
+
+class WebRetrievalRequest(BaseModel):
+    """Request model for the /web_retrieval endpoint."""
+
+    uids: Optional[List[int]] = Field(
+        default=None,
+        description="List of specific miner UIDs to query. If not provided, miners will be selected automatically.",
+        example=[1, 2, 3],
+    )
+    search_query: str = Field(
+        ..., description="The query to search for on the web.", example="latest advancements in quantum computing"
+    )
+    n_miners: int = Field(default=3, description="Number of miners to query for results.", example=15, ge=1)
+    n_results: int = Field(
+        default=1, description="Maximum number of results to return in the response.", example=5, ge=1
+    )
+    max_response_time: int = Field(
+        default=10, description="Maximum time to wait for responses in seconds.", example=15, ge=1
+    )
+
+
+class WebSearchResult(BaseModel):
+    """Model for a single web search result."""
+
+    url: str = Field(..., description="The URL of the web page.", example="https://example.com/article")
+    content: Optional[str] = Field(
+        default=None,
+        description="The relevant content extracted from the page.",
+        example="Quantum computing has seen significant advancements in the past year...",
+    )
+    relevant: Optional[str] = Field(
+        default=None,
+        description="Information about why this result is relevant to the query.",
+        example="This article discusses the latest breakthroughs in quantum computing research.",
+    )
+
+
+class WebRetrievalResponse(BaseModel):
+    """Response model for the /web_retrieval endpoint."""
+
+    results: List[WebSearchResult] = Field(..., description="List of unique web search results.")
+
+    def to_dict(self):
+        # model_dump() already serializes nested WebSearchResult models; dict.update() would return None.
+        return self.model_dump()
+
+
+class TestTimeInferenceRequest(BaseModel):
+    """Request model for the /test_time_inference endpoint."""
+
+    uids: Optional[List[int]] = Field(
+        default=None,
+        description="List of specific miner UIDs to query. If not provided, miners will be selected automatically.",
+        example=[1, 2, 3],
+    )
+    messages: List[Dict[str, str]] = Field(
+        ...,
+        description="List of message objects with 'role' and 'content' keys. Roles can be 'system', 'user', or 'assistant'.",
+        example=[{"role": "user", "content": "Solve the equation: 3x + 5 = 14"}],
+    )
+    model: Optional[str] = Field(default=None, description="Model identifier to use for inference.", example="gpt-4")
+    json_format: bool = Field(default=False, description="Enable JSON format for the response.", example=True)
+
+    def to_dict(self):
+        # messages are plain dicts and model_dump() already covers them; dict.update() would return None.
+        return self.model_dump()
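
A quick look at the request models' defaults (values follow the Field definitions above):

```python
from validator_api.serializers import CompletionsRequest, WebRetrievalRequest

req = CompletionsRequest(messages=[{"role": "user", "content": "Tell me about neural networks"}])
body = req.model_dump()
print(body["sampling_parameters"]["max_new_tokens"])  # 1024
print(body["test_time_inference"], body["mixture"])   # False False

web_req = WebRetrievalRequest(search_query="latest advancements in quantum computing")
print(web_req.n_miners, web_req.n_results)            # 3 1
```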
diff --git a/validator_api/test_time_inference.py b/validator_api/test_time_inference.py
index 19cd8fae2..a60d78c51 100644
--- a/validator_api/test_time_inference.py
+++ b/validator_api/test_time_inference.py
@@ -8,6 +8,7 @@
 
 from prompting.llms.apis.llm_messages import LLMMessage, LLMMessages
 from prompting.llms.apis.llm_wrapper import LLMWrapper
+from shared.prompts.test_time_inference import final_answer_prompt, intro_prompt, system_acceptance_prompt
 from shared.timer import Timer
 from validator_api.chat_completion import chat_completion
 
@@ -37,12 +38,27 @@ def parse_multiple_json(api_response):
             # Replace escaped single quotes with actual single quotes
             json_str_clean = json_str.replace("\\'", "'")
 
+            # Remove or replace invalid control characters
+            json_str_clean = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F]", "", json_str_clean)
+
             # Parse the JSON string into a dictionary
             obj = json.loads(json_str_clean)
             parsed_objects.append(obj)
         except json.JSONDecodeError as e:
-            print(f"Failed to parse JSON object: {e}")
-            continue
+            logger.warning(f"Failed to parse JSON object: {e}")
+
+            # Try a more aggressive approach if standard cleaning failed
+            try:
+                clean_str = "".join(c if ord(c) >= 32 or c in ["\n", "\r", "\t"] else " " for c in json_str)
+                clean_str = re.sub(r"\s+", " ", clean_str)  # Normalize whitespace
+
+                # Try to parse again
+                obj = json.loads(clean_str)
+                parsed_objects.append(obj)
+                logger.info("Successfully parsed JSON after aggressive cleaning")
+            except json.JSONDecodeError:
+                # If still failing, log and continue
+                continue
 
     if len(parsed_objects) == 0:
         logger.error(
@@ -58,11 +74,12 @@ def parse_multiple_json(api_response):
             f"Invalid JSON object found in the response - field missing. The miner response was: {api_response}"
         )
         return None
+
     return parsed_objects
 
 
 async def make_api_call(
-    messages, model=None, is_final_answer: bool = False, use_miners: bool = True, target_uids: list[str] = None
+    messages, model=None, is_final_answer: bool = False, use_miners: bool = True, uids: list[int] | None = None
 ):
     async def single_attempt():
         try:
@@ -81,7 +98,7 @@ async def single_attempt():
                         "seed": random.randint(0, 1000000),
                     },
                     num_miners=3,
-                    uids=target_uids,
+                    uids=uids,
                 )
                 response_str = response.choices[0].message.content
             else:
@@ -147,74 +164,19 @@ async def single_attempt():
 
 
 async def generate_response(
-    original_messages: list[dict[str, str]], model: str = None, target_uids: list[str] = None, use_miners: bool = True
+    original_messages: list[dict[str, str]], model: str = None, uids: list[int] | None = None, use_miners: bool = True
 ):
     messages = [
         {
             "role": "system",
-            "content": """You are a world-class expert in analytical reasoning and problem-solving. Your task is to break down complex problems through rigorous step-by-step analysis, carefully examining each aspect before moving forward. For each reasoning step:
-
-OUTPUT FORMAT:
-Return a JSON object with these required fields:
-{
-    "title": "Brief, descriptive title of current reasoning phase",
-    "content": "Detailed explanation of your analysis",
-    "next_action": "continue" or "final_answer"
-}
-
-REASONING PROCESS:
-1. Initial Analysis
-   - Break down the problem into core components
-   - Identify key constraints and requirements
-   - List relevant domain knowledge and principles
-
-2. Multiple Perspectives
-   - Examine the problem from at least 3 different angles
-   - Consider both conventional and unconventional approaches
-   - Identify potential biases in initial assumptions
-
-3. Exploration & Validation
-   - Test preliminary conclusions against edge cases
-   - Apply domain-specific best practices
-   - Quantify confidence levels when possible (e.g., 90% certain)
-   - Document key uncertainties or limitations
-
-4. Critical Review
-   - Actively seek counterarguments to your reasoning
-   - Identify potential failure modes
-   - Consider alternative interpretations of the data/requirements
-   - Validate assumptions against provided context
-
-5. Synthesis & Refinement
-   - Combine insights from multiple approaches
-   - Strengthen weak points in the reasoning chain
-   - Address identified edge cases and limitations
-   - Build towards a comprehensive solution
-
-REQUIREMENTS:
-- Each step must focus on ONE specific aspect of reasoning
-- Explicitly state confidence levels and uncertainty
-- When evaluating options, use concrete criteria
-- Include specific examples or scenarios when relevant
-- Acknowledge limitations in your knowledge or capabilities
-- Maintain logical consistency across steps
-- Build on previous steps while avoiding redundancy
-
-CRITICAL THINKING CHECKLIST:
-✓ Have I considered non-obvious interpretations?
-✓ Are my assumptions clearly stated and justified?
-✓ Have I identified potential failure modes?
-✓ Is my confidence level appropriate given the evidence?
-✓ Have I adequately addressed counterarguments?
-
-Remember: Quality of reasoning is more important than speed. Take the necessary steps to build a solid analytical foundation before moving to conclusions.""",
+            "content": intro_prompt(),
         }
     ]
     messages += original_messages
     messages += [
         {
             "role": "assistant",
-            "content": "I understand. I will now analyze the problem systematically, following the structured reasoning process while maintaining high standards of analytical rigor and self-criticism.",
+            "content": system_acceptance_prompt(),
         }
     ]
 
@@ -224,12 +186,11 @@ async def generate_response(
 
     for _ in range(MAX_THINKING_STEPS):
         with Timer() as timer:
-            step_data = await make_api_call(messages, model=model, use_miners=use_miners, target_uids=target_uids)
+            step_data = await make_api_call(messages, model=model, use_miners=use_miners, uids=uids)
         thinking_time = timer.final_time
         total_thinking_time += thinking_time
 
         steps.append((f"Step {step_count}: {step_data['title']}", step_data["content"], thinking_time))
-
         messages.append({"role": "assistant", "content": json.dumps(step_data)})
 
         if step_data["next_action"] == "final_answer" or not step_data.get("next_action"):
@@ -238,24 +199,15 @@ async def generate_response(
         step_count += 1
         yield steps, None
 
-    final_answer_prompt = """Based on your thorough analysis, please provide your final answer. Your response should:
-
-        1. Clearly state your conclusion
-        2. Summarize the key supporting evidence
-        3. Acknowledge any remaining uncertainties
-        4. Include relevant caveats or limitations"""
-
     messages.append(
         {
             "role": "user",
-            "content": final_answer_prompt,
+            "content": final_answer_prompt(),
         }
     )
 
     start_time = time.time()
-    final_data = await make_api_call(
-        messages, model=model, is_final_answer=True, use_miners=use_miners, target_uids=target_uids
-    )
+    final_data = await make_api_call(messages, model=model, is_final_answer=True, use_miners=use_miners, uids=uids)
 
     end_time = time.time()
     thinking_time = end_time - start_time
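
The control-character stripping added to `parse_multiple_json` handles responses that embed raw control bytes inside JSON strings, which `json.loads` rejects by default; a small illustration:

```python
import json
import re

raw = '{"title": "Step 1", "content": "contains a raw \x07 bell byte", "next_action": "continue"}'

try:
    json.loads(raw)
except json.JSONDecodeError as e:
    print(f"before cleaning: {e}")

cleaned = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F]", "", raw)
print(json.loads(cleaned)["next_action"])  # continue
```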
diff --git a/validator_api/utils.py b/validator_api/utils.py
index f0b5eb947..ebe69fc6b 100644
--- a/validator_api/utils.py
+++ b/validator_api/utils.py
@@ -8,11 +8,29 @@
 from shared.uids import get_uids
 
 
+def read_fallback_uids() -> dict[str, dict]:
+    try:
+        from collections import defaultdict
+
+        uids = get_uids(sampling_mode="all")
+        return {
+            str(uid): {
+                "task_availabilities": defaultdict(lambda: True),
+                "llm_model_availabilities": defaultdict(lambda: True),
+            }
+            for uid in uids
+        }
+    except Exception as e:
+        logger.error(f"Failed to build fallback miner availabilities: {e}")
+        return {}
+
+
 class UpdateMinerAvailabilitiesForAPI(AsyncLoopRunner):
-    interval: int = 300
+    interval: int = 120
     miner_availabilities: dict[int, dict] = {}
 
     async def run_step(self):
+        logger.debug("Running update miner availabilities step")
         if settings.shared_settings.API_TEST_MODE:
             return
         try:
@@ -22,14 +40,12 @@ async def run_step(self):
                 json=get_uids(sampling_mode="all"),
                 timeout=15,
             )
-
             self.miner_availabilities = response.json()
         except Exception as e:
-            logger.exception(f"Error while updating miner availabilities for API: {e}")
+            logger.error(f"Failed updating miner availabilities for API, fallback to all uids: {e}")
+            self.miner_availabilities = read_fallback_uids()
         tracked_availabilities = [m for m in self.miner_availabilities.values() if m is not None]
-        logger.debug(
-            f"MINER AVAILABILITIES UPDATED, TRACKED: {len(tracked_availabilities)}, UNTRACKED: {len(self.miner_availabilities) - len(tracked_availabilities)}"
-        )
+        logger.info(f"Availabilities updated, tracked: {len(tracked_availabilities)}")
 
 
 update_miner_availabilities_for_api = UpdateMinerAvailabilitiesForAPI()
@@ -79,8 +95,11 @@ def filter_available_uids(
         filtered_uids.append(uid)
 
     if len(filtered_uids) == 0:
-        logger.error("Got empty list of available UIDs. Check VALIDATOR_API and SCORING_KEY in .env.api")
-        return filtered_uids
+        logger.error(
+            "Got an empty list of available UIDs, falling back to all uids. "
+            "Check VALIDATOR_API and SCORING_KEY in .env.api"
+        )
+        filtered_uids = get_uids(sampling_mode="top_incentive", k=n_miners)
 
     filtered_uids = random.sample(filtered_uids, min(len(filtered_uids), n_miners))
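
For reference, the fallback availability entries use `defaultdict(lambda: True)`, so any task or model name looks available until the real availability endpoint responds again:

```python
from collections import defaultdict

fallback_entry = {
    "task_availabilities": defaultdict(lambda: True),
    "llm_model_availabilities": defaultdict(lambda: True),
}
print(fallback_entry["task_availabilities"]["WebRetrievalTask"])     # True
print(fallback_entry["llm_model_availabilities"]["any-model-name"])  # True
```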