Skip to content

Commit

Permalink
FIX: Tensorization script no longer worked for pngs where there is no… (
Browse files Browse the repository at this point in the history
#566)

* FIX: Tensorization script no longer worked for pngs where there is no ZIP_FOLDER, and could not handle it if the ZIP_FOLDER was not the first python arg

* FIX: Be more explicit with tensorization modes
  • Loading branch information
daniellepace authored May 7, 2024
1 parent 1cd3932 commit 3e8b662
Showing 1 changed file with 35 additions and 13 deletions.
48 changes: 35 additions & 13 deletions scripts/tensorize.sh
Original file line number Diff line number Diff line change
Expand Up @@ -119,25 +119,47 @@ START_TIME=$(date +%s)
# Bounds of the sample-id range handled by this job, copied from SAMPLE_IDS_START / SAMPLE_IDS_END.
MIN_SAMPLE_ID=$SAMPLE_IDS_START
MAX_SAMPLE_ID=$SAMPLE_IDS_END

# We want to get the zip folder that was passed to recipes.py - look for the --zip_folder argument and extract the value passed after that
# NOTE(review): this sed call has no -n/p and no leading '.*', so any text in front of --zip_folder is kept and,
# when the flag is absent, the whole argument string is echoed back unchanged.
# NOTE(review): this lookup and the unconditional find below look like the pre-change side of the diff; the
# TENSORIZE_MODE branches further down repeat the same work - confirm which copy is live before editing.
ZIP_FOLDER=$(echo ${PYTHON_ARGS} | sed 's/--zip_folder \([^ ]*\).*/\1/')
# Abort with a message on stderr when the extracted path does not exist.
# NOTE(review): $ZIP_FOLDER is unquoted inside [ ], so an empty or multi-word value does not reach the error branch.
if [ ! -e $ZIP_FOLDER ]; then
echo "ERROR: Zip folder passed was not valid, found $ZIP_FOLDER but expected folder path." 1>&2
exit 1
fi

# create a directory in the /tmp/ folder to store some utilities for use later
mkdir -p /tmp/ml4h
# Write out a file with the ids of every sample in the input folder
# The id is the part of each *.zip basename before the first '_'. Only ids strictly between MIN_SAMPLE_ID and
# MAX_SAMPLE_ID (both bounds excluded) are kept, then sorted and de-duplicated into sample_ids_trimmed.txt.
echo "Gathering list of input zips to process between $MIN_SAMPLE_ID and $MAX_SAMPLE_ID, this takes several seconds..."
find $ZIP_FOLDER -name '*.zip' | xargs -I {} basename {} | cut -d '_' -f 1 \
| awk -v min="$MIN_SAMPLE_ID" -v max="$MAX_SAMPLE_ID" '$1 > min && $1 < max' \
| sort | uniq > /tmp/ml4h/sample_ids_trimmed.txt


#######################################
# Print the value that follows a flag inside ${PYTHON_ARGS}.
# Globals:   PYTHON_ARGS (read)
# Arguments: $1 - flag name including its leading dashes, e.g. --zip_folder
# Outputs:   the flag's value on stdout; nothing when the flag is absent
#######################################
_ml4h_python_arg_value() {
  # Squeeze every run of whitespace to one space so the sed pattern sees a single separator,
  # without letting the shell glob-expand the arguments (which an unquoted echo would do).
  # Both "--flag value" and "--flag=value" are accepted; -n/p keeps the output empty on no match.
  printf '%s\n' "${PYTHON_ARGS}" \
    | tr -s '[:space:]' ' ' \
    | sed -n "s/.*$1[ =]\([^ ]*\).*/\1/p"
}

# Build /tmp/ml4h/sample_ids_trimmed.txt: the sorted, de-duplicated sample ids that lie strictly
# between MIN_SAMPLE_ID and MAX_SAMPLE_ID (both bounds excluded). Where the ids come from depends
# on TENSORIZE_MODE; any failure is reported on stderr and ends the script with status 1.
if [[ "$TENSORIZE_MODE" == "tensorize" ]]; then
  # We want the zip folder that was passed to recipes.py - look for the --zip_folder argument
  # and extract the value passed after it.
  ZIP_FOLDER=$(_ml4h_python_arg_value --zip_folder)

  # Write out a file with the ids of every sample in the input folder
  if [[ -e "$ZIP_FOLDER" ]]; then
    echo "Gathering list of input zips to process between $MIN_SAMPLE_ID and $MAX_SAMPLE_ID, this takes several seconds..."
    # sed strips the directory part of each path (one process for the whole list instead of one
    # basename per file); the sample id is whatever precedes the first '_' in the file name.
    find "$ZIP_FOLDER" -name '*.zip' \
      | sed 's|.*/||' \
      | cut -d '_' -f 1 \
      | awk -v min="$MIN_SAMPLE_ID" -v max="$MAX_SAMPLE_ID" '$1 > min && $1 < max' \
      | sort -u > /tmp/ml4h/sample_ids_trimmed.txt
  else
    echo "ERROR: Invalid zip folder, found zip folder $ZIP_FOLDER" >&2
    exit 1
  fi

elif [[ "$TENSORIZE_MODE" == "tensorize_pngs" ]]; then
  # If tensorizing pngs, we can look for sample_ids as the first column of the manifest file
  # given in the --app_csv argument.
  MANIFEST=$(_ml4h_python_arg_value --app_csv)

  # Write out a file with the ids of every sample in the manifest file
  if [[ -e "$MANIFEST" ]]; then
    echo "Gathering list of sample ids to process between $MIN_SAMPLE_ID and $MAX_SAMPLE_ID, this takes several seconds..."
    # grep -v drops every line whose first field contains "sample" (presumably the header row).
    # NOTE(review): awk splits on whitespace here, not commas - confirm the manifest's delimiter.
    awk '{print $1}' < "$MANIFEST" \
      | grep -v sample \
      | awk -v min="$MIN_SAMPLE_ID" -v max="$MAX_SAMPLE_ID" '$1 > min && $1 < max' \
      | sort -u > /tmp/ml4h/sample_ids_trimmed.txt
  else
    echo "ERROR: Invalid manifest, found manifest $MANIFEST" >&2
    exit 1
  fi
else
  echo "ERROR: The tensorize mode $TENSORIZE_MODE is not supported" >&2
  exit 1
fi

# Count the lines of the trimmed sample-id list and report how many samples this job covers.
# The cat | wc pipeline is kept so that an unreadable list still yields a count of 0.
NUM_SAMPLES_TO_PROCESS="$(
  cat /tmp/ml4h/sample_ids_trimmed.txt \
    | wc -l
)"
printf '%s\n' "Including $NUM_SAMPLES_TO_PROCESS samples in this tensorization job."


# Announce the sample-id range of the job; %b expands the leading \n into a blank line.
printf '%b\n' "\nLaunching job for sample IDs starting with $MIN_SAMPLE_ID and ending with $MAX_SAMPLE_ID via:"

# we need to run a command using xargs in parallel, and it gets rather complex and messy unless we can just run
Expand Down

0 comments on commit 3e8b662

Please sign in to comment.