Using GPT-4o and other improvements. #26

Merged: 36 commits, Oct 4, 2024

Commits
ac75539
Computing/saving clip similarity between thor and blender renders if …
Lucaweihs Jul 30, 2024
2cfa4be
Updating thor commit to version that renders individual objects in a …
Lucaweihs Aug 9, 2024
d3d26d7
Changing empty house light intensity so objects are less overexposed.
Lucaweihs Aug 9, 2024
747186a
Added script to generate GPT annotations for thor objects.
Lucaweihs Aug 9, 2024
96fa553
Improving some prompts and adding new openai batch client/server class…
Lucaweihs Aug 9, 2024
070c8e1
Updating the version.
Lucaweihs Aug 9, 2024
347678f
Updating the version.
Lucaweihs Aug 9, 2024
05d2f39
Formatting and updating readme.md
Lucaweihs Aug 9, 2024
00f4050
Refactoring and unifying return values.
Lucaweihs Aug 14, 2024
70fb3e2
More error catching.
Lucaweihs Aug 14, 2024
c5cbc6d
Adding an unknown fail.
Lucaweihs Aug 14, 2024
bea5429
New gpt4o and other improvements/fixes.
Lucaweihs Aug 15, 2024
cec05dd
Requesting a "max_dimension" response qualitatively improves "height"…
Lucaweihs Aug 16, 2024
f2f0633
Formatting fix.
Lucaweihs Aug 16, 2024
eb779d7
Adding in the option to set an optimization timeout.
Lucaweihs Aug 16, 2024
336da95
Early exit when openai api key is not provided
Lucaweihs Aug 16, 2024
208b127
Formatting.
Lucaweihs Aug 16, 2024
5cf1f75
Downloading synset embeddings with locking.
Lucaweihs Aug 16, 2024
05ed247
Locking when loading clip model so multiple processes don't overwrite…
Lucaweihs Aug 16, 2024
8e838ab
Don't recompute synset embeddings if they can't be downloaded.
Lucaweihs Aug 16, 2024
d9f91f5
Fixing download_with_locking, we always have to acquire the lock to e…
Lucaweihs Aug 16, 2024
15024c8
Putting things on the right device.
Lucaweihs Aug 16, 2024
dc2c471
Minor prompt update and being more careful when loading synset embedd…
Lucaweihs Aug 19, 2024
f74fff6
Moving RequestStatus to a constants file.
Lucaweihs Aug 19, 2024
ab61025
Adding a "did_change" flag to the return from `async_annotate_asset` …
Lucaweihs Aug 19, 2024
e2a6b09
Using a WSGI server when not debugging.
Lucaweihs Aug 19, 2024
973013e
Fixing --debug option.
Lucaweihs Aug 19, 2024
e1f4d2e
Fixing wsgi server reference.
Lucaweihs Aug 19, 2024
4317885
Allowing for processing tarred assets.
Lucaweihs Sep 30, 2024
e64b478
Changing default chunk size during tar validation
Lucaweihs Sep 30, 2024
0edb800
Allowing for restarting holodeck feature generation after a prior run.
Lucaweihs Oct 3, 2024
06888cf
Better version of asset aggregation script.
Lucaweihs Oct 3, 2024
7032b2b
Better logging and error handling when aggregating asset annotations.
Lucaweihs Oct 3, 2024
ca93d48
Bugfix
Lucaweihs Oct 3, 2024
885c337
Circular reference fix.
Lucaweihs Oct 3, 2024
03c63cd
Updating base urls.
Lucaweihs Oct 4, 2024
8 changes: 7 additions & 1 deletion .gitignore
@@ -167,4 +167,10 @@ tmp
 */log.txt
 
 log.txt
-LICENSE.txt
+LICENSE.txt
+objathor/out/*
+*.jpg
+*.json
+*.json.gz
+examples
+*batch_server_data/
18 changes: 3 additions & 15 deletions README.md
@@ -7,7 +7,7 @@ Objaverse asset annotator and importer for use in THOR.
 Install ai2thor:
 
 ```bash
-pip install --extra-index-url https://ai2thor-pypi.allenai.org ai2thor==0+40679c517859e09c1f2a5e39b65ee7f33fcfdd48
+pip install --extra-index-url https://ai2thor-pypi.allenai.org ai2thor==0+455cf72a1c8e0759a452422f2128fbc93a3cb06b
 ```
 
 Install other dependencies:
@@ -60,18 +60,6 @@ During the first run, NLTK dependencies are automatically installed, but we can
 python -c "import nltk; nltk.download('punkt'); nltk.download('wordnet2022'); nltk.download('brown'); nltk.download('averaged_perceptron_tagger')"
 ```
 
-
-### Pre-generated synset definition embeddings for Annotation
-
-For automatic annotation to assign likely synsets given the automatically generated asset description, we can
-pre-install pre-generated embeddings for all synset definitions (this can be useful if we cannot write into our home
-directory at run time):
-
-```bash
-mkdir ~/.objathor_data
-curl https://prior-datasets.s3.us-east-2.amazonaws.com/vida-synset-embeddings/synset_definition_embeddings_single.pkl.gz -o ~/.objathor_data/synset_definition_embeddings_single.pkl.gz
-```
-
 ### AI2-THOR binary pre-downloading
 
 Assuming we're running on a remote Linux server, we can pre-download the THOR binaries with:
@@ -104,15 +92,15 @@ From the repository root run:
 ```
 python
 -m objathor.asset_conversion.pipeline_to_thor
---object_ids=000074a334c541878360457c672b6c2e
+--uids=000074a334c541878360457c672b6c2e
 --output_dir=<some_absolute_path>
 --extension=.msgpack.gz
 --annotations=<annotations_file_path>
 --live
 --blender_as_module
 ```
 
-Where `object_ids` is a string of comma separated list of `Objaverse` object ids to process.
+Where `uids` is a comma-separated list of `Objaverse` object ids to process.
 `output_dir` is an absolute path indicating where to write the output of the conversion.
 `annotations` is optional, and is the path to an annotations file as generated by the process described above.
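For reference, a directly runnable form of the updated command (a sketch based on the diff above; `--annotations` is optional and omitted here, and the output path is a placeholder):

```bash
python -m objathor.asset_conversion.pipeline_to_thor \
    --uids=000074a334c541878360457c672b6c2e \
    --output_dir=/absolute/path/to/output \
    --extension=.msgpack.gz \
    --live \
    --blender_as_module
```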
35 changes: 35 additions & 0 deletions objathor/annotation/annotation_utils.py
@@ -1,5 +1,25 @@
 from typing import TypedDict, List, Dict
 
+MODEL_STR_TO_PRICE_PER_1M_INPUT_TOKENS = {
+    # OpenAI models
+    "gpt-3.5-turbo-0301": 1.5,
+    "gpt-3.5-turbo-0125": 1.5,
+    "gpt-4-1106-preview": 10.0,
+    "gpt-4o-2024-05-13": 5.0,
+    "gpt-4o-2024-08-06": 2.5,
+    "gpt-4o-mini-2024-07-18": 0.15,
+}
+
+MODEL_STR_TO_PRICE_PER_1M_OUTPUT_TOKENS = {
+    # OpenAI models
+    "gpt-3.5-turbo-0301": 2.0,
+    "gpt-3.5-turbo-0125": 2.0,
+    "gpt-4-1106-preview": 30.0,
+    "gpt-4o-2024-05-13": 15.0,
+    "gpt-4o-2024-08-06": 10.0,
+    "gpt-4o-mini-2024-07-18": 0.6,
+}
+
 
 class LicenseInfo(TypedDict):
     """
@@ -114,3 +134,18 @@ class ObjectAnnotation(TypedDict):
     scale: float
     z_axis_scale: bool
     license_info: LicenseInfo
+
+
+def compute_llm_cost(input_tokens: int, output_tokens: int, model: str):
+    assert (
+        model in MODEL_STR_TO_PRICE_PER_1M_INPUT_TOKENS
+        and model in MODEL_STR_TO_PRICE_PER_1M_OUTPUT_TOKENS
+    ), f"model [{model}] must be in both MODEL_STR_TO_PRICE_PER_1M_INPUT_TOKENS and MODEL_STR_TO_PRICE_PER_1M_OUTPUT_TOKENS"
+
+    input_token_cost_per_1m = MODEL_STR_TO_PRICE_PER_1M_INPUT_TOKENS[model]
+    output_token_cost_per_1m = MODEL_STR_TO_PRICE_PER_1M_OUTPUT_TOKENS[model]
+
+    return (
+        input_tokens * input_token_cost_per_1m
+        + output_tokens * output_token_cost_per_1m
+    ) / 1e6
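For a quick sanity check of the new pricing helper, here is a hypothetical usage sketch (the token counts are invented; the import path follows this file's location in the repo):

```python
from objathor.annotation.annotation_utils import compute_llm_cost

# 1,000 input tokens and 500 output tokens with gpt-4o-2024-08-06:
# (1_000 * 2.5 + 500 * 10.0) / 1e6 = 0.0075 USD
cost = compute_llm_cost(input_tokens=1_000, output_tokens=500, model="gpt-4o-2024-08-06")
print(f"${cost:.4f}")  # $0.0075
```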
98 changes: 56 additions & 42 deletions objathor/annotation/embed_synset_definitions.py
@@ -1,12 +1,13 @@
 import os
 import random
-import urllib.request
 from typing import Dict
 
 import compress_pickle
 import numpy as np
+from filelock import FileLock
 from tqdm import tqdm
 
+from objathor.utils.download_utils import download_with_locking
 from objathor.utils.gpt_utils import get_embedding, get_embeddings_from_texts
 from objathor.utils.synsets import (
     all_synsets,
@@ -24,27 +25,38 @@
 
 
 def download_embeddings(
-    url: str = "https://prior-datasets.s3.us-east-2.amazonaws.com/vida-synset-embeddings/synset_definition_embeddings_with_lemmas__2024-01-22.pkl.gz",
+    url: str = "https://pub-daedd7738a984186a00f2ab264d06a07.r2.dev/misc/synset_definition_embeddings_with_lemmas__2024-01-22.pkl.gz",
+    retry_if_failure: bool = True,
 ):
     os.makedirs(OBJATHOR_DATA_DIR, exist_ok=True)
-    if not os.path.isfile(SYNSET_DEFINITION_EMB_FILE):
-        print(f"Downloading\n{url}\nto\n{SYNSET_DEFINITION_EMB_FILE}")
-
-        def report_hook(block, block_size, total_size, freq=1e7):
-            if (block * block_size) % freq < block_size:
-                print(f"{block * block_size / total_size * 100:.2f}% downloaded"),
-
-        urllib.request.urlretrieve(
-            url,
-            SYNSET_DEFINITION_EMB_FILE,
-            reporthook=report_hook,
-        )
-
-        print("Finished downloading")
-    assert os.path.isfile(SYNSET_DEFINITION_EMB_FILE)
+    lock_path = SYNSET_DEFINITION_EMB_FILE + ".lock"
+
+    download_with_locking(
+        url=url,
+        save_path=SYNSET_DEFINITION_EMB_FILE,
+        lock_path=lock_path,
+        desc="Downloading synset definition embeddings",
+    )
+    load_failure = False
+    with FileLock(lock_path):
+        try:
+            compress_pickle.load(SYNSET_DEFINITION_EMB_FILE)
+        except EOFError:
+            if retry_if_failure:
+                load_failure = True
+                try:
+                    os.remove(SYNSET_DEFINITION_EMB_FILE)
+                except FileNotFoundError:
+                    pass
+            else:
+                raise
+
+    if load_failure:
+        print("Failed to load embeddings, reattempting download...")
+        download_embeddings(url=url, retry_if_failure=False)
 
 
-def get_embeddings(
+def compute_synset_embeddings(
     fname: str = os.path.join(OBJATHOR_DATA_DIR, "synset_definition_embeddings.pkl.gz"),
 ) -> Dict[str, np.ndarray]:
     from nltk.corpus import wordnet2022 as wn
@@ -90,7 +102,7 @@ def get_embeddings(
     return data
 
 
-def get_embeddings_single(
+def _load_synset_embeddings(
     fname: str = SYNSET_DEFINITION_EMB_FILE,
 ) -> Dict[str, np.ndarray]:
     if not os.path.isfile(fname):
@@ -99,10 +111,11 @@ def get_embeddings_single(
         except (SystemExit, KeyboardInterrupt):
             raise
         except:
-            data = get_embeddings()
-            for key, value in data.items():
-                data[key] = value.astype(np.float32)
-            compress_pickle.dump(data, fname)
+            raise
+            # data = compute_synset_embeddings()
+            # for key, value in data.items():
+            #     data[key] = value.astype(np.float32)
+            # compress_pickle.dump(data, fname)
 
     return compress_pickle.load(fname)
@@ -255,21 +268,22 @@ def format_lemmas(lemmas):
 
 
 if __name__ == "__main__":
-    data = get_embeddings()
-    for key, value in data.items():
-        data[key] = value.astype(np.float32)
-
-    compress_pickle.dump(
-        data,
-        os.path.join(
-            OBJATHOR_DATA_DIR,
-            "synset_definition_embeddings_with_lemmas__2024-01-22.pkl.gz",
-        ),
-    )
-
-    # data = get_embeddings()
-    # data = get_embeddings_single()
-    # local_smoothing(data, "wardrobe.n.01")
-
-    # data = get_lemmas_definition_embeddings()
-    print("DONE")
+    download_embeddings()
+    # data = compute_synset_embeddings()
+    # for key, value in data.items():
+    #     data[key] = value.astype(np.float32)
+    #
+    # compress_pickle.dump(
+    #     data,
+    #     os.path.join(
+    #         OBJATHOR_DATA_DIR,
+    #         "synset_definition_embeddings_with_lemmas__2024-01-22.pkl.gz",
+    #     ),
+    # )
+    #
+    # # data = get_embeddings()
+    # # data = get_embeddings_single()
+    # # local_smoothing(data, "wardrobe.n.01")
+    #
+    # # data = get_lemmas_definition_embeddings()
+    # print("DONE")
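The download path above replaces a bare `urllib.request.urlretrieve` with `download_with_locking` plus a lock-guarded validate-and-retry step. Below is a minimal self-contained sketch of that pattern; `fetch_with_lock_sketch` is a hypothetical name, and the real `download_with_locking` lives in `objathor.utils.download_utils` and is not shown in this diff:

```python
import os
import urllib.request

import compress_pickle
from filelock import FileLock


def fetch_with_lock_sketch(url: str, save_path: str, retry: bool = True) -> None:
    lock_path = save_path + ".lock"

    # Always acquire the lock, even if the file already exists: another process
    # may still be mid-download (cf. commit d9f91f5 above).
    with FileLock(lock_path):
        if not os.path.isfile(save_path):
            urllib.request.urlretrieve(url, save_path)

    # Validate under the lock; a truncated compressed pickle raises EOFError.
    corrupt = False
    with FileLock(lock_path):
        try:
            compress_pickle.load(save_path)
        except EOFError:
            if not retry:
                raise
            corrupt = True
            os.remove(save_path)

    if corrupt:
        # Retry exactly once, mirroring retry_if_failure=False in the diff.
        fetch_with_lock_sketch(url, save_path, retry=False)
```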