Skip to content

Commit

Permalink
Reduce the size of the data used
Browse files Browse the repository at this point in the history
to improve speed, storage cost, and index shard size
  • Loading branch information
nownabe committed Apr 5, 2023
1 parent 0037de1 commit 0b1e6e6
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 19 deletions.
7 changes: 4 additions & 3 deletions TUTORIAL.ja.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,11 +125,12 @@ gcloud beta run jobs create \
--image "us-central1-docker.pkg.dev/<walkthrough-project-id />/vectorizer/vectorizer:v1" \
--cpu 4 \
--memory 2Gi \
--parallelism 5 \
--parallelism 2 \
--region us-central1 \
--service-account "vectorizer@<walkthrough-project-id />.iam.gserviceaccount.com" \
--tasks 5 \
--set-env-vars="DESTINATION_ROOT=gs://<walkthrough-project-id />-flowers/embeddings" \
--tasks 2 \
--set-env-vars "^@^DESTINATION_ROOT=gs://<walkthrough-project-id />-flowers/embeddings@FLOWERS=daisy,roses" \
--execute-now
```

Expand Down
7 changes: 4 additions & 3 deletions TUTORIAL.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,11 +125,12 @@ gcloud beta run jobs create \
--image "us-central1-docker.pkg.dev/<walkthrough-project-id />/vectorizer/vectorizer:v1" \
--cpu 4 \
--memory 2Gi \
--parallelism 5 \
--parallelism 2 \
--region us-central1 \
--service-account "vectorizer@<walkthrough-project-id />.iam.gserviceaccount.com" \
--tasks 5 \
--set-env-vars="DESTINATION_ROOT=gs://<walkthrough-project-id />-flowers/embeddings" \
--tasks 2 \
--set-env-vars "^@^DESTINATION_ROOT=gs://<walkthrough-project-id />-flowers/embeddings@FLOWERS=daisy,roses" \
--execute-now
```

Expand Down
17 changes: 4 additions & 13 deletions vectorizer/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ class SampleDataVectorizer:
BUCKET = "cloud-samples-data"
PREFIX = "ai-platform/flowers/"

FLOWERS = ["daisy", "dandelion", "roses", "sunflowers", "tulips"]

def __init__(self, flower: str, destination: str):
self._flower = flower
self._client = storage.Client()
Expand Down Expand Up @@ -80,16 +78,8 @@ def _vectorize(self, raw: tf.Tensor) -> list[float]:
return self._model.predict(np.array([image.numpy()]))[0].tolist()


def main(destination_root: str, task_index: int) -> None:
flower = SampleDataVectorizer.FLOWERS[task_index]

dir = "flowers"

if task_index == len(SampleDataVectorizer.FLOWERS) - 1:
# For updating indices
dir = flower

destination = os.path.join(destination_root, dir)
def main(destination_root: str, flower: str) -> None:
    """Vectorize the sample images for *flower* and upload the embeddings.

    Args:
        destination_root: GCS root URI for output, e.g. ``gs://bucket/index01/embeddings``.
        flower: Name of the flower category to process.
    """
    # Every task writes into the shared "flowers" directory under the root,
    # so all embeddings end up in one place regardless of which flower ran.
    target = os.path.join(destination_root, "flowers")
    SampleDataVectorizer(flower, target).vectorize_and_upload()
Expand All @@ -101,5 +91,6 @@ def main(destination_root: str, task_index: int) -> None:
logging.basicConfig(level=logging.INFO)

# Output root, e.g. gs://my-bucket/index01/embeddings
destination = os.environ["DESTINATION_ROOT"]
# FLOWERS is a comma-separated list; each Cloud Run task handles one entry.
flowers = os.environ["FLOWERS"].split(",")
# CLOUD_RUN_TASK_INDEX is injected by Cloud Run jobs; fall back to task 0
# when running outside Cloud Run (e.g. locally).
task_index = int(os.environ.get("CLOUD_RUN_TASK_INDEX", "0"))

main(destination, flowers[task_index])

0 comments on commit 0b1e6e6

Please sign in to comment.