Skip to content

Commit

Permalink
Merge pull request #1003 from IBM/fdedup-fixes
Browse files (browse the repository at this point in the history)
Fdedup package versioning and windows fixes
  • Loading branch information
touma-I authored Jan 31, 2025
2 parents 6262d77 + f827bb6 commit 00dc04e
Show file tree
Hide file tree
Showing 5 changed files with 13 additions and 5 deletions.
1 change: 1 addition & 0 deletions transforms/README-list.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ Note: This list includes the transforms that were part of the release starting w

### 1.0.1.dev1
Added Gneissweb transforms
fdedup fix for windows
### 1.0.1.dev0
PR #979 (code_profiler)
### 1.0.0.a6
Expand Down
2 changes: 1 addition & 1 deletion transforms/requirements-ray.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-data-prep-toolkit[ray]>=0.2.3
+data-prep-toolkit[ray]>=0.2.4.dev0
networkx==3.3
colorlog==6.8.2
func-timeout==4.3.5
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,14 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str
band = int(match.group(1))
segment = int(match.group(2))
else:
raise ValueError(f"Wrong folder_name {folder_name}, should be band=b/segment=s")
match = re.match(r"^band=(\d+)\\segment=(\d+)$", folder_name)
if match:
band = int(match.group(1))
segment = int(match.group(2))
else:
raise ValueError(
f"Wrong folder_name {folder_name}, should be either band=b/segment=s or band=b\\segment=s (windows)"
)
output_folder = TransformUtils.clean_path(self.data_access.output_folder)
output_path = os.path.join(output_folder, f"band_{band}_segment_{segment}.parquet")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,7 @@ def _emit_bands(self, int_id_column: str, minhashes: np.array, b: int, r: int, s
results = []
for band_index in range(b):
band_hash, _ = mmh3.hash64(
minhashes[band_index * r : (band_index + 1) * r],
minhashes[band_index * r : (band_index + 1) * r].tobytes(),
seed=seed,
signed=False,
)
Expand Down
4 changes: 2 additions & 2 deletions transforms/universal/fdedup/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
pyyaml>=6.0.2
boto3>=1.34.69
kubernetes>=30.1.0
-polars==1.9.0
+polars>=1.9.0, !=1.10.0, !=1.11.0, !=1.12.0
disjoint-set>=0.8.0
scipy>=1.12.1, <2.0.0
numpy<1.29.0
sentencepiece>=0.2.0
-mmh3>=4.1.0, <=5.0.1
+mmh3>=4.1.0

0 comments on commit 00dc04e

Please sign in to comment.