Skip to content

Commit

Permalink
Reduce the size of the data used
Browse files Browse the repository at this point in the history
to improve speed, storage cost, and index shard size
  • Loading branch information
nownabe committed Apr 5, 2023
1 parent 0037de1 commit 0b1e6e6
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 19 deletions.
7 changes: 4 additions & 3 deletions TUTORIAL.ja.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,11 +125,12 @@ gcloud beta run jobs create \
--image "us-central1-docker.pkg.dev/<walkthrough-project-id />/vectorizer/vectorizer:v1" \
--cpu 4 \
--memory 2Gi \
--parallelism 5 \
--parallelism 2 \
--region us-central1 \
--service-account "vectorizer@<walkthrough-project-id />.iam.gserviceaccount.com" \
--tasks 5 \
--set-env-vars="DESTINATION_ROOT=gs://<walkthrough-project-id />-flowers/embeddings" \
--tasks 2 \
--set-env-vars "^@^DESTINATION_ROOT=gs://<walkthrough-project-id />-flowers/embeddings@FLOWERS=daisy,roses" \
--execute-now
```

Expand Down
7 changes: 4 additions & 3 deletions TUTORIAL.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,11 +125,12 @@ gcloud beta run jobs create \
--image "us-central1-docker.pkg.dev/<walkthrough-project-id />/vectorizer/vectorizer:v1" \
--cpu 4 \
--memory 2Gi \
--parallelism 5 \
--parallelism 2 \
--region us-central1 \
--service-account "vectorizer@<walkthrough-project-id />.iam.gserviceaccount.com" \
--tasks 5 \
--set-env-vars="DESTINATION_ROOT=gs://<walkthrough-project-id />-flowers/embeddings" \
--tasks 2 \
--set-env-vars "^@^DESTINATION_ROOT=gs://<walkthrough-project-id />-flowers/embeddings@FLOWERS=daisy,roses" \
--execute-now
```

Expand Down
17 changes: 4 additions & 13 deletions vectorizer/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ class SampleDataVectorizer:
BUCKET = "cloud-samples-data"
PREFIX = "ai-platform/flowers/"

FLOWERS = ["daisy", "dandelion", "roses", "sunflowers", "tulips"]

def __init__(self, flower: str, destination: str):
self._flower = flower
self._client = storage.Client()
Expand Down Expand Up @@ -80,16 +78,8 @@ def _vectorize(self, raw: tf.Tensor) -> list[float]:
return self._model.predict(np.array([image.numpy()]))[0].tolist()


def main(destination_root: str, task_index: int) -> None:
flower = SampleDataVectorizer.FLOWERS[task_index]

dir = "flowers"

if task_index == len(SampleDataVectorizer.FLOWERS) - 1:
# For updating indices
dir = flower

destination = os.path.join(destination_root, dir)
def main(destination_root: str, flower: str) -> None:
    """Vectorize the sample images for *flower* and upload the embeddings.

    Args:
        destination_root: GCS root URI for output, e.g. ``gs://bucket/index01/embeddings``.
        flower: Name of the flower category to process.
    """
    # Every task writes into the shared "flowers" directory under the root,
    # so all embeddings end up in one place regardless of which flower ran.
    target = os.path.join(destination_root, "flowers")
    SampleDataVectorizer(flower, target).vectorize_and_upload()
Expand All @@ -101,5 +91,6 @@ def main(destination_root: str, task_index: int) -> None:
logging.basicConfig(level=logging.INFO)

# Output root, e.g. gs://my-bucket/index01/embeddings
destination = os.environ["DESTINATION_ROOT"]
# FLOWERS is a comma-separated list; each Cloud Run task handles one entry.
flowers = os.environ["FLOWERS"].split(",")
# CLOUD_RUN_TASK_INDEX is injected by Cloud Run jobs; fall back to task 0
# when running outside Cloud Run (e.g. locally).
task_index = int(os.environ.get("CLOUD_RUN_TASK_INDEX", "0"))

main(destination, flowers[task_index])

0 comments on commit 0b1e6e6

Please sign in to comment.