Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

create helper function #26225

Open
wants to merge 50 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
b513dd6
create helper function
smeet07 Apr 11, 2023
ae106c6
formatting
smeet07 Apr 11, 2023
8a0a130
formatting again
smeet07 Apr 11, 2023
5eee7a2
re assigning it back to the feature
smeet07 Apr 12, 2023
dc5a6c2
trailing whitespace
smeet07 Apr 12, 2023
809516c
whitespace changes
smeet07 Apr 12, 2023
3551952
changes
smeet07 Apr 12, 2023
cf4c32a
replacing whitespace by tabs
smeet07 Apr 12, 2023
e1a7798
Update criteo.py
smeet07 Apr 12, 2023
f59d992
sparse_to_dense syntax has been changed in tf2.0
smeet07 Apr 14, 2023
60acb95
write unit test to ensure fill_in_missing
smeet07 May 1, 2023
86e27d5
indentation
smeet07 May 1, 2023
80e9dbb
Create criteo_test.py
smeet07 May 11, 2023
5aa8380
Update criteo.py
smeet07 May 11, 2023
3a4ccaa
add license
smeet07 May 11, 2023
0c1c74d
import statements
smeet07 May 11, 2023
aedea5e
skip unit test
smeet07 May 17, 2023
eaf77e1
Update criteo_test.py
smeet07 May 17, 2023
4e9156d
Update criteo_test.py
smeet07 May 17, 2023
2868789
Update criteo_test.py
smeet07 May 18, 2023
09beedb
Update criteo_test.py
smeet07 May 18, 2023
04a236a
skipif syntax changes
smeet07 May 18, 2023
f948b4d
Update criteo_test.py
smeet07 May 18, 2023
cd243c2
Update criteo.py
smeet07 May 19, 2023
63eb2b4
absolute import
smeet07 May 30, 2023
a1f2452
whitespace changes
smeet07 Jun 25, 2023
147d385
linter changes
smeet07 Jun 25, 2023
b636e08
indentation
smeet07 Jun 25, 2023
40933c7
linter
smeet07 Jun 25, 2023
8f8d129
spacing
smeet07 Jun 25, 2023
2c1d191
Update criteo.py
smeet07 Jun 25, 2023
220ea01
Update criteo_test.py
smeet07 Jun 25, 2023
9762926
Update criteo.py
smeet07 Jun 25, 2023
04632ce
Update criteo_test.py
smeet07 Jun 25, 2023
c973320
Update criteo.py
smeet07 Jun 25, 2023
94bc2e7
fix import issue
smeet07 Jul 5, 2023
ccebff8
indentation
smeet07 Jul 5, 2023
bc378f0
spacing issues
smeet07 Jul 5, 2023
742bf3a
lint issues
smeet07 Jul 5, 2023
cf9ec23
add space
smeet07 Jul 7, 2023
e8dee56
add
smeet07 Jul 7, 2023
051172e
fix call
smeet07 Jul 8, 2023
24093cc
Update criteo_test.py
smeet07 Jul 8, 2023
2ea9726
Update criteo_test.py
smeet07 Jul 9, 2023
8832162
Update criteo_test.py
smeet07 Jul 11, 2023
5680b56
Update sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/…
smeet07 Jul 18, 2023
8bb2324
remove try block
smeet07 Jul 18, 2023
0079765
remove double except block
smeet07 Jul 18, 2023
6da2669
remove imports and type assignment
smeet07 Jul 18, 2023
5326583
remove trailing whitespace
smeet07 Jul 18, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,18 @@ def make_input_feature_spec(include_label=True):
return result


def fill_in_missing(feature, default_value=-1):
  """Densifies an optional sparse feature into one scalar per batch row.

  The input is rebuilt as a sparse tensor whose dense shape is
  `[feature.dense_shape[0], 1]`, converted to a dense tensor with
  `default_value` in the unset positions, and then squeezed along axis 1.

  Args:
    feature: object exposing `indices`, `values` and `dense_shape`
      (presumably a rank-2 `tf.SparseTensor` holding at most one value per
      row; confirm against callers).
    default_value: value written wherever `feature` has no entry.

  Returns:
    The dense tensor with axis 1 squeezed away.
  """
  # Keep the batch dimension of the input but pin the second dimension to 1.
  single_column = tf.sparse.SparseTensor(
      indices=feature.indices,
      values=feature.values,
      dense_shape=[feature.dense_shape[0], 1])
  dense = tf.sparse.to_dense(single_column, default_value=default_value)
  # Turn the batch of size-1 vectors into a batch of scalars.
  return tf.squeeze(dense, axis=1)


def make_preprocessing_fn(frequency_threshold):
"""Creates a preprocessing function for criteo.

Expand All @@ -132,15 +144,7 @@ def preprocessing_fn(inputs):
result = {'clicked': inputs['clicked']}
for name in _INTEGER_COLUMN_NAMES:
feature = inputs[name]
# TODO(https://github.com/apache/beam/issues/24902):
# Replace this boilerplate with a helper function.
# This is a SparseTensor because it is optional. Here we fill in a
# default value when it is missing.
feature = tft.sparse_tensor_to_dense_with_shape(
feature, [None, 1], default_value=-1)
# Reshaping from a batch of vectors of size 1 to a batch of scalars and
# adding a bucketized version.
feature = tf.squeeze(feature, axis=1)
feature = fill_in_missing(feature)
result[name] = feature
result[name + '_bucketized'] = tft.bucketize(feature, _NUM_BUCKETS)
for name in _CATEGORICAL_COLUMN_NAMES:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import unittest

import numpy as np
import pytest

try:
import tensorflow_transform as tft
import tensorflow as tf
smeet07 marked this conversation as resolved.
Show resolved Hide resolved
from apache_beam.testing.benchmarks.cloudml.criteo_tft.criteo import fill_in_missing
except ImportError:
tft = None

if not tft:
raise unittest.SkipTest('tensorflow_transform is not installed.')


@pytest.mark.uses_tft
smeet07 marked this conversation as resolved.
Show resolved Hide resolved
@unittest.skipIf(tft is None or tf is None, 'Missing dependencies. ')
class FillInMissingTest(unittest.TestCase):
  """Unit tests for the `fill_in_missing` helper."""
  def test_fill_in_missing(self):
    """Rows without a value are filled with the default, one scalar per row."""
    # fill_in_missing rebuilds its input with dense_shape [batch, 1], so
    # every index has to sit in column 0. Rows 0 and 2 carry a value; row 1
    # is missing and must come back as the default.
    indices = np.array([[0, 0], [2, 0]], dtype=np.int64)
    values = np.array([1, 4], dtype=np.int64)
    dense_shape = np.array([3, 1], dtype=np.int64)
    sparse_tensor = tf.sparse.SparseTensor(indices, values, dense_shape)

    # Fill in missing values with -1.
    filled_tensor = fill_in_missing(sparse_tensor, -1)

    # The helper squeezes axis 1, so the result holds one scalar per row.
    expected_output = np.array([1, -1, 4], dtype=np.int64)
    # assertEqual on multi-element arrays raises "truth value of an array is
    # ambiguous"; compare element-wise with NumPy's testing helper instead.
    np.testing.assert_array_equal(expected_output, filled_tensor.numpy())


if __name__ == '__main__':
unittest.main()
Loading