From 58ee493c1d9bd046840e82fcf62e01f4bdea28e9 Mon Sep 17 00:00:00 2001 From: Yash kalathiya Date: Sun, 1 Dec 2024 16:17:45 +0530 Subject: [PATCH 1/3] Updated readme and added notebook Signed-off-by: Yash kalathiya --- transforms/code/header_cleanser/README.md | 234 +++++++++++++++- .../notebooks/header_cleanser.ipynb | 251 ++++++++++++++++++ 2 files changed, 472 insertions(+), 13 deletions(-) create mode 100644 transforms/code/header_cleanser/notebooks/header_cleanser.ipynb diff --git a/transforms/code/header_cleanser/README.md b/transforms/code/header_cleanser/README.md index 5f9dea6b1..34b4f0094 100644 --- a/transforms/code/header_cleanser/README.md +++ b/transforms/code/header_cleanser/README.md @@ -1,13 +1,221 @@ -# Header Cleanser Transform -The Header cleanser transforms -Detect and remove license and copyright of input data. -Per the set of -[transform project conventions](../../README.md#transform-project-conventions) -the following runtimes are available: - -* [python](python/README.md) - provides the base python-based transformation -implementation. -* [ray](ray/README.md) - enables the running of the base python transformation -in a Ray runtime. -* [kfp_ray](kfp_ray/README.md) - enables running the ray docker image -in a kubernetes cluster using a generated `yaml` file. \ No newline at end of file +# Header Cleanser Transform + +The **Header Cleanser** module is a versatile tool designed to remove license and copyright headers from code files. It supports over 90 programming languages and utilizes the [ScanCode Toolkit](https://scancode-toolkit.readthedocs.io/en/stable/getting-started/install.html) to identify license and copyright information within the codebase. + +## Input and Output + +### Input +- **File Format**: Parquet file containing code. +- **Input Column**: The code should be in a column named `content`. 
+- **Sample Input**: + [Sample Input File](transforms/code/header_cleanser/python/test-data/input/test1.parquet) + +### Output +- **File Format**: Parquet file with the updated code in the same column. +- **Sample Output**: + [Sample Output File](transforms/code/header_cleanser/python/test-data/expected/license-and-copyright-local/test1.parquet) + +## Parameters + +The following parameters can be adjusted to control the behavior of the extraction: + +| Parameter Name | Default Value | Description | +|---------------------------|---------------|---------------------------------------------------------------------| +| `content_column_name` | `contents` | Specifies the column name that holds the code to be processed. | +| `copyright` | `true` | Set to `true` to remove copyright information from the code. | +| `license` | `true` | Set to `true` to remove license information from the code. | + +### CLI Syntax +When invoking the CLI, use the following syntax for these parameters: +``` +--header_cleanser_ +``` +For example: +``` +--header_cleanser_content_column_name='content' +``` + +## Example + +### Sample Input Code: +```java +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.jstevenperry.intro; + +import java.util.logging.Logger; + +// This is the main public class representing a Person +public class Person { + private static final Logger logger = Logger.getLogger(Person.class.getName()); + + private String name; + private int age; + private int height; + private int weight; + private String eyeColor; + private String gender; + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public int getAge() { + return age; + } + + public void setAge(int age) { + this.age = age; + } + + public int getHeight() { + return height; + } + + public void setHeight(int height) { + this.height = height; + } + + public int getWeight() { + return weight; + } + + public void setWeight(int weight) { + this.weight = weight; + } + + public String getEyeColor() { + return eyeColor; + } + + public void setEyeColor(String eyeColor) { + this.eyeColor = eyeColor; + } + + public String getGender() { + return gender; + } + + public void setGender(String gender) { + this.gender = gender; + } + + public Person(String name, int age, int height, int weight, String eyeColor, String gender) { + super(); + this.name = name; + this.age = age; + this.height = height; + this.weight = weight; + this.eyeColor = eyeColor; + this.gender = gender; + + logger.info("Created Person object with name '" + getName() + "'"); + } +} +``` + +### Sample Output (with default parameters): +```java +package com.jstevenperry.intro; + +import java.util.logging.Logger; + +/// This is the main public class representing a Person +public class Person { + + private static final Logger logger = Logger.getLogger(Person.class.getName()); + + private String name; + private int age; + private int height; + private int weight; + private String eyeColor; + private String gender; + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public int getAge() { + return age; + 
} + + public void setAge(int age) { + this.age = age; + } + + public int getHeight() { + return height; + } + + public void setHeight(int height) { + this.height = height; + } + + public int getWeight() { + return weight; + } + + public void setWeight(int weight) { + this.weight = weight; + } + + public String getEyeColor() { + return eyeColor; + } + + public void setEyeColor(String eyeColor) { + this.eyeColor = eyeColor; + } + + public String getGender() { + return gender; + } + + public void setGender(String gender) { + this.gender = gender; + } + + public Person(String name, int age, int height, int weight, String eyeColor, String gender) { + super(); + this.name = name; + this.age = age; + this.height = height; + this.weight = weight; + this.eyeColor = eyeColor; + this.gender = gender; + + logger.info("Created Person object with name '" + getName() + "'"); + } +} +``` + +## Different Runtimes + +- **[Python](python/README.md)**: Provides the base Python-based transformation implementation. +- **[Ray](ray/README.md)**: Enables running the base Python transformation in a Ray runtime. +- **[KFP Ray](kfp_ray/README.md)**: Enables running the Ray Docker image in a Kubernetes cluster using a generated YAML file. + +## Sample Notebook + +Check out the [example notebook](notebooks/header_cleanser.ipynb) for further details. 
+ diff --git a/transforms/code/header_cleanser/notebooks/header_cleanser.ipynb b/transforms/code/header_cleanser/notebooks/header_cleanser.ipynb new file mode 100644 index 000000000..8d6a248c7 --- /dev/null +++ b/transforms/code/header_cleanser/notebooks/header_cleanser.ipynb @@ -0,0 +1,251 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Header_cleanser Transform Sample Notebook***" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install data-prep-toolkit==0.2.2.dev2\n", + "!pip install 'data-prep-toolkit-transforms[header_cleanser]==0.2.2.dev2'\n", + "!pip install pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from data_processing.utils import ParamsUtils\n", + "from header_cleanser_transform_python import HeaderCleanserPythonTransformConfiguration" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Specify input/output folders and parameters***" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "\n", + "# create parameters\n", + "local_conf = {\n", + " \"input_folder\": \"path/to/your/input/folder\", # For the sample input files, refer to the 'python/test-data/input' folder\n", + " \"output_folder\": \"path/to/your/output/folder\",\n", + "}\n", + "\n", + "params = {\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " \"header_cleanser_contents_column_name\": \"contents\", #give your column name which contains code for header cleansing\n", + " \"header_cleanser_copyright\": True, #set true to remove copyright\n", + " \"header_cleanser_license\": True #set true to remove license\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Invoke the html2parquet transformation***" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "08:15:27 INFO - pipeline id pipeline_id\n", + "08:15:27 INFO - code location None\n", + "08:15:27 INFO - data factory data_ is using local data access: input_folder - input output_folder - output\n", + "08:15:27 INFO - data factory data_ max_files -1, n_sample -1\n", + "08:15:27 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "08:15:27 INFO - orchestrator header_cleanser started at 2024-12-01 08:15:27\n", + "08:15:27 INFO - Number of files is 1, source profile {'max_file_size': 0.016656875610351562, 'min_file_size': 0.016656875610351562, 'total_file_size': 0.016656875610351562}\n", + "08:15:31 INFO - Completed 1 files (100.0%) in 0.056 min\n", + "08:15:31 INFO - Done processing 1 files, waiting for flush() completion.\n", + "08:15:31 INFO - done flushing in 0.0 sec\n", + "08:15:31 INFO - Completed execution in 0.057 min, execution result 0\n" + ] + } + ], + "source": [ + "import sys\n", + "sys.argv = ParamsUtils.dict_to_req(d=(params)) \n", + "# create launcher\n", + "launcher = PythonTransformLauncher(HeaderCleanserPythonTransformConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Checking 
the output Parquet file***" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
contents
0<?xml version=\"1.0\" encoding=\"UTF-8\"?>\\n<spec ...
1*/\\n\\n/**\\n * Declare some internal variables...
2<?xml version=\"1.0\" encoding=\"UTF-8\"?>\\n\\n\\n<f...
3// Example 1 - Console Transport\\n// require W...
4\"\"\"Functions for downloading and reading MNIST...
5<?xml version=\"1.0\" encoding=\"UTF-8\"?>\\n\\n\\n<b...
6package com.jstevenperry.intro;\\n\\nimport java...
7#! \\n#\\n# Script to run the DataCreator progra...
8#!/bin/bash\\n\\n\\n# Exit immediately if any une...
9# This file installs package dependencies for ...
\n", + "
" + ], + "text/plain": [ + " contents\n", + "0 \\n\\n\\n\\n\\n\\n\\n\\n\\n\\tAIX64\\n\\tAIX64\\n\\tppc\\n\\taix\\n\\tSidecar\\n\\tdesktop (256M + big OS stack)\\n\\t100\\n\\t\\n\\t\\tpaul_church@ca.ibm.com\\n\\t\\n\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\n\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\n\\t\\n\\t\\t\\n\\t\\t\\n\\t\\n\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\
\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\n\\n'" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table.to_pandas()['contents'][0]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 9bdc7ddfd0d96cccc85e0ab95e3cc5b37ccd7d60 Mon Sep 17 00:00:00 2001 From: Yash kalathiya Date: Sat, 7 Dec 2024 13:00:21 +0530 Subject: [PATCH 2/3] updated readme and directory structure Signed-off-by: Yash kalathiya --- transforms/code/header_cleanser/README.md | 234 +-------------- .../header_cleanser/header_cleanser.ipynb | 282 ++++++++++++++++++ .../notebooks/header_cleanser.ipynb | 251 ---------------- .../code/header_cleanser/python/README.md | 224 ++++++++++++-- 4 files changed, 501 insertions(+), 490 deletions(-) create mode 100644 transforms/code/header_cleanser/header_cleanser.ipynb delete mode 100644 transforms/code/header_cleanser/notebooks/header_cleanser.ipynb diff --git a/transforms/code/header_cleanser/README.md b/transforms/code/header_cleanser/README.md index 34b4f0094..5f9dea6b1 100644 --- a/transforms/code/header_cleanser/README.md +++ b/transforms/code/header_cleanser/README.md @@ -1,221 +1,13 @@ -# Header Cleanser Transform - -The **Header Cleanser** module is a versatile tool designed to remove license and copyright headers from code files. 
It supports over 90 programming languages and utilizes the [ScanCode Toolkit](https://scancode-toolkit.readthedocs.io/en/stable/getting-started/install.html) to identify license and copyright information within the codebase. - -## Input and Output - -### Input -- **File Format**: Parquet file containing code. -- **Input Column**: The code should be in a column named `content`. -- **Sample Input**: - [Sample Input File](transforms/code/header_cleanser/python/test-data/input/test1.parquet) - -### Output -- **File Format**: Parquet file with the updated code in the same column. -- **Sample Output**: - [Sample Output File](transforms/code/header_cleanser/python/test-data/expected/license-and-copyright-local/test1.parquet) - -## Parameters - -The following parameters can be adjusted to control the behavior of the extraction: - -| Parameter Name | Default Value | Description | -|---------------------------|---------------|---------------------------------------------------------------------| -| `content_column_name` | `contents` | Specifies the column name that holds the code to be processed. | -| `copyright` | `true` | Set to `true` to remove copyright information from the code. | -| `license` | `true` | Set to `true` to remove license information from the code. | - -### CLI Syntax -When invoking the CLI, use the following syntax for these parameters: -``` ---header_cleanser_ -``` -For example: -``` ---header_cleanser_content_column_name='content' -``` - -## Example - -### Sample Input Code: -```java -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.jstevenperry.intro; - -import java.util.logging.Logger; - -// This is the main public class representing a Person -public class Person { - private static final Logger logger = Logger.getLogger(Person.class.getName()); - - private String name; - private int age; - private int height; - private int weight; - private String eyeColor; - private String gender; - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public int getAge() { - return age; - } - - public void setAge(int age) { - this.age = age; - } - - public int getHeight() { - return height; - } - - public void setHeight(int height) { - this.height = height; - } - - public int getWeight() { - return weight; - } - - public void setWeight(int weight) { - this.weight = weight; - } - - public String getEyeColor() { - return eyeColor; - } - - public void setEyeColor(String eyeColor) { - this.eyeColor = eyeColor; - } - - public String getGender() { - return gender; - } - - public void setGender(String gender) { - this.gender = gender; - } - - public Person(String name, int age, int height, int weight, String eyeColor, String gender) { - super(); - this.name = name; - this.age = age; - this.height = height; - this.weight = weight; - this.eyeColor = eyeColor; - this.gender = gender; - - logger.info("Created Person object with name '" + getName() + "'"); - } -} -``` - -### Sample Output (with default parameters): -```java -package com.jstevenperry.intro; - -import java.util.logging.Logger; - -/// This is the main public class representing a Person -public class Person { - - private static final Logger logger = Logger.getLogger(Person.class.getName()); - - private String name; - private int age; - private int height; - private int weight; - private String eyeColor; - private String gender; - - public String getName() { - return name; - 
} - - public void setName(String name) { - this.name = name; - } - - public int getAge() { - return age; - } - - public void setAge(int age) { - this.age = age; - } - - public int getHeight() { - return height; - } - - public void setHeight(int height) { - this.height = height; - } - - public int getWeight() { - return weight; - } - - public void setWeight(int weight) { - this.weight = weight; - } - - public String getEyeColor() { - return eyeColor; - } - - public void setEyeColor(String eyeColor) { - this.eyeColor = eyeColor; - } - - public String getGender() { - return gender; - } - - public void setGender(String gender) { - this.gender = gender; - } - - public Person(String name, int age, int height, int weight, String eyeColor, String gender) { - super(); - this.name = name; - this.age = age; - this.height = height; - this.weight = weight; - this.eyeColor = eyeColor; - this.gender = gender; - - logger.info("Created Person object with name '" + getName() + "'"); - } -} -``` - -## Different Runtimes - -- **[Python](python/README.md)**: Provides the base Python-based transformation implementation. -- **[Ray](ray/README.md)**: Enables running the base Python transformation in a Ray runtime. -- **[KFP Ray](kfp_ray/README.md)**: Enables running the Ray Docker image in a Kubernetes cluster using a generated YAML file. - -## Sample Notebook - -Check out the [example notebook](notebooks/header_cleanser.ipynb) for further details. - +# Header Cleanser Transform +The Header cleanser transforms +Detect and remove license and copyright of input data. +Per the set of +[transform project conventions](../../README.md#transform-project-conventions) +the following runtimes are available: + +* [python](python/README.md) - provides the base python-based transformation +implementation. +* [ray](ray/README.md) - enables the running of the base python transformation +in a Ray runtime. 
+* [kfp_ray](kfp_ray/README.md) - enables running the ray docker image +in a kubernetes cluster using a generated `yaml` file. \ No newline at end of file diff --git a/transforms/code/header_cleanser/header_cleanser.ipynb b/transforms/code/header_cleanser/header_cleanser.ipynb new file mode 100644 index 000000000..0a56c186d --- /dev/null +++ b/transforms/code/header_cleanser/header_cleanser.ipynb @@ -0,0 +1,282 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Header_cleanser Transform Sample Notebook***" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These pip installs need to be adapted to use the appropriate release level. Alternatively, the venv running the jupyter lab could be pre-configured with a requirements file that includes the right release. Example for transform developers working from git clone:\n", + "\n", + "make venv \\\n", + "source venv/bin/activate \\\n", + "pip install jupyterlab " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "# Users and application developers must use the right tag for the latest version from pypi\n", + "!pip install data-prep-toolkit==0.2.2.dev2\n", + "!pip install 'data-prep-toolkit-transforms[header_cleanser]==0.2.2.dev2'\n", + "!pip install pandas" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure the transform parameters. \n", + "* Define the transform parameters required for processing. Below are the parameters specific to the Header Cleanser Transform: \n", + "\n", + " * header_cleanser_contents_column_name: Column containing code to cleanse (default: contents).\n", + " * header_cleanser_copyright: Whether to remove copyright headers (default: True).\n", + " * header_cleanser_license: Whether to remove license headers (default: True)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from data_processing.utils import ParamsUtils\n", + "from header_cleanser_transform_python import HeaderCleanserPythonTransformConfiguration" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Specify input/output folders and parameters***" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Input/output configuration\n", + "local_conf = {\n", + " \"input_folder\": \"path/to/your/input/folder\", # Adjust path for input files\n", + " \"output_folder\": \"path/to/your/output/folder\", # Adjust path for output files\n", + "}\n", + "\n", + "# Parameters for the transform\n", + "params = {\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " \"header_cleanser_contents_column_name\": \"contents\",\n", + " \"header_cleanser_copyright\": True,\n", + " \"header_cleanser_license\": True\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Invoke the header_cleanser transformation***\n", + "* Launch the transform using the PythonTransformLauncher." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "11:05:41 INFO - pipeline id pipeline_id\n", + "11:05:41 INFO - code location None\n", + "11:05:41 INFO - data factory data_ is using local data access: input_folder - path/to/your/input/folder output_folder - path/to/your/output/folder\n", + "11:05:41 INFO - data factory data_ max_files -1, n_sample -1\n", + "11:05:41 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:05:41 INFO - orchestrator header_cleanser started at 2025-01-09 11:05:41\n", + "11:05:41 ERROR - No input files to process - exiting\n", + "11:05:41 INFO - Completed execution in 0.0 min, execution result 0\n" + ] + } + ], + "source": [ + "import sys\n", + "sys.argv = ParamsUtils.dict_to_req(d=(params)) \n", + "# create launcher\n", + "launcher = PythonTransformLauncher(HeaderCleanserPythonTransformConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Checking the output Parquet file***" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
contents
0<?xml version=\"1.0\" encoding=\"UTF-8\"?>\\n<!--\\n...
1/*\\n * Copyright 2018 Makoto Consulting Group,...
2<?xml version=\"1.0\" encoding=\"UTF-8\"?>\\n\\n<!--...
3/*\\n Copyright 2018 Makoto Consulting Group,...
4# Copyright 2016 The TensorFlow Authors. All R...
5<?xml version=\"1.0\" encoding=\"UTF-8\"?>\\n\\n<!--...
6/*\\n * Licensed under the Apache License, Vers...
7#! \\n#\\n# Script to run the DataCreator progra...
8#!/bin/bash\\n\\n###############################...
9# Copyright IBM Corp. and others 2018\\n#\\n# Th...
\n", + "
" + ], + "text/plain": [ + " contents\n", + "0 \\n\\n\\n\\tAIX64\\n\\tAIX64\\n\\tppc\\n\\taix\\n\\tSidecar\\n\\tdesktop (256M + big OS stack)\\n\\t100\\n\\t\\n\\t\\tpaul_church@ca.ibm.com\\n\\t\\n\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\n\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\n\\t\\n\\t\\t\\n\\t\\t\\n\\t\\n\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\
\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\n\\n'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table.to_pandas()['contents'][0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Notes for Users and Developers\n", + "1. Ensure that your input files are placed in the specified input_folder path.\n", + " * For sample input files, refer to the python/test-data/input folder.\n", + "2. Use the latest tagged version from PyPI for stability.\n", + "3. Transform parameters can be customized as per requirements. Update params accordingly." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/transforms/code/header_cleanser/notebooks/header_cleanser.ipynb b/transforms/code/header_cleanser/notebooks/header_cleanser.ipynb deleted file mode 100644 index 8d6a248c7..000000000 --- a/transforms/code/header_cleanser/notebooks/header_cleanser.ipynb +++ /dev/null @@ -1,251 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "***Header_cleanser Transform Sample Notebook***" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "%%capture\n", - "!pip install data-prep-toolkit==0.2.2.dev2\n", - "!pip install 'data-prep-toolkit-transforms[header_cleanser]==0.2.2.dev2'\n", - "!pip install pandas" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": 
{}, - "outputs": [], - "source": [ - "from data_processing.runtime.pure_python import PythonTransformLauncher\n", - "from data_processing.utils import ParamsUtils\n", - "from header_cleanser_transform_python import HeaderCleanserPythonTransformConfiguration" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "***Specify input/output folders and parameters***" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import ast\n", - "\n", - "# create parameters\n", - "local_conf = {\n", - " \"input_folder\": \"path/to/your/input/folder\", # For the sample input files, refer to the 'python/test-data/input' folder\n", - " \"output_folder\": \"path/to/your/output/folder\",\n", - "}\n", - "\n", - "params = {\n", - " # Data access. Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " \"header_cleanser_contents_column_name\": \"contents\", #give your column name which contains code for header cleansing\n", - " \"header_cleanser_copyright\": True, #set true to remove copyright\n", - " \"header_cleanser_license\": True #set true to remove license\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "***Invoke the html2parquet transformation***" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "08:15:27 INFO - pipeline id pipeline_id\n", - "08:15:27 INFO - code location None\n", - "08:15:27 INFO - data factory data_ is using local data access: input_folder - input output_folder - output\n", - "08:15:27 INFO - data factory data_ max_files -1, n_sample -1\n", - "08:15:27 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "08:15:27 INFO - orchestrator header_cleanser started at 
2024-12-01 08:15:27\n", - "08:15:27 INFO - Number of files is 1, source profile {'max_file_size': 0.016656875610351562, 'min_file_size': 0.016656875610351562, 'total_file_size': 0.016656875610351562}\n", - "08:15:31 INFO - Completed 1 files (100.0%) in 0.056 min\n", - "08:15:31 INFO - Done processing 1 files, waiting for flush() completion.\n", - "08:15:31 INFO - done flushing in 0.0 sec\n", - "08:15:31 INFO - Completed execution in 0.057 min, execution result 0\n" - ] - } - ], - "source": [ - "import sys\n", - "sys.argv = ParamsUtils.dict_to_req(d=(params)) \n", - "# create launcher\n", - "launcher = PythonTransformLauncher(HeaderCleanserPythonTransformConfiguration())\n", - "# launch\n", - "return_code = launcher.launch()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "***Checking the output Parquet file***" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
contents
0<?xml version=\"1.0\" encoding=\"UTF-8\"?>\\n<spec ...
1*/\\n\\n/**\\n * Declare some internal variables...
2<?xml version=\"1.0\" encoding=\"UTF-8\"?>\\n\\n\\n<f...
3// Example 1 - Console Transport\\n// require W...
4\"\"\"Functions for downloading and reading MNIST...
5<?xml version=\"1.0\" encoding=\"UTF-8\"?>\\n\\n\\n<b...
6package com.jstevenperry.intro;\\n\\nimport java...
7#! \\n#\\n# Script to run the DataCreator progra...
8#!/bin/bash\\n\\n\\n# Exit immediately if any une...
9# This file installs package dependencies for ...
\n", - "
" - ], - "text/plain": [ - " contents\n", - "0 \\n\\n\\n\\n\\n\\n\\n\\n\\n\\tAIX64\\n\\tAIX64\\n\\tppc\\n\\taix\\n\\tSidecar\\n\\tdesktop (256M + big OS stack)\\n\\t100\\n\\t\\n\\t\\tpaul_church@ca.ibm.com\\n\\t\\n\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\n\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\n\\t\\n\\t\\t\\n\\t\\t\\n\\t\\n\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\
\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\n\\n'" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table.to_pandas()['contents'][0]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/transforms/code/header_cleanser/python/README.md b/transforms/code/header_cleanser/python/README.md index 405ef9b55..0210d3669 100644 --- a/transforms/code/header_cleanser/python/README.md +++ b/transforms/code/header_cleanser/python/README.md @@ -4,11 +4,13 @@ Please see the set of for details on general project conventions, transform configuration, testing and IDE set up. -## Summary +## Contributors -This module is designed to detect and remove license and copyright information from code files. It leverages the [ScanCode Toolkit](https://pypi.org/project/scancode-toolkit/) to accurately identify and process licenses and copyrights in various programming languages. +- Yash Kalathiya (yashkalathiya164@gmail.com) -After locating the position of license or copyright in the input code/sample, this module delete/remove those lines and returns the updated code as parquet file. +## Description + +The **Header Cleanser** module is a versatile tool designed to remove license and copyright headers from code files. 
It supports over 90 programming languages and utilizes the [ScanCode Toolkit](https://scancode-toolkit.readthedocs.io/en/stable/getting-started/install.html) to identify license and copyright information within the codebase. ## Configuration and command line Options @@ -28,29 +30,215 @@ When running the transform with the Ray launcher (i.e. TransformLauncher), the following command line arguments are available in addition to the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md). * --header_cleanser_contents_column_name - set the contents_column_name configuration key. +* --header_cleanser_document_id_column_name - set the document_id_column_name configuration key. * --header_cleanser_license - set the license configuration key. * --header_cleanser_copyright - set the copyright configuration key. +* --header_cleanser_n_processes - set the n_processes configuration key. +* --header_cleanser_tmp_dir - set the tmp_dir configuration key. +* --header_cleanser_timeout - set the timeout configuration key. +* --header_cleanser_skip_timeout - set the skip_timeout configuration key. + +## Input and Output + +### Input +- **File Format**: Parquet file containing code. +- **Input Column**: The code should be in a column named `content`. +- **Sample Input**: + [Sample Input File](transforms/code/header_cleanser/python/test-data/input/test1.parquet) + +### Output +- **File Format**: Parquet file with the updated code in the same column. 
+- **Sample Output**: + [Sample Output File](transforms/code/header_cleanser/python/test-data/expected/license-and-copyright-local/test1.parquet) + +### CLI Syntax +When invoking the CLI, use the following syntax for these parameters: +``` +--header_cleanser_<parameter_name> +``` +For example: +``` +--header_cleanser_contents_column_name='contents' +``` + +## Example + +### Sample Input Code: +```java +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.jstevenperry.intro; + +import java.util.logging.Logger; + +// This is the main public class representing a Person +public class Person { + private static final Logger logger = Logger.getLogger(Person.class.getName()); + + private String name; + private int age; + private int height; + private int weight; + private String eyeColor; + private String gender; + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public int getAge() { + return age; + } + + public void setAge(int age) { + this.age = age; + } + + public int getHeight() { + return height; + } + + public void setHeight(int height) { + this.height = height; + } + + public int getWeight() { + return weight; + } + + public void setWeight(int weight) { + this.weight = weight; + } + + public String getEyeColor() { + return eyeColor; + } -### Running the samples -To run the samples, use the following `make` targets + public void setEyeColor(String eyeColor) { + 
this.eyeColor = eyeColor; + } -* 
`run-cli-sample` - runs src/header_cleanser_transform_python.py using command line args -* `run-local-python-sample` - runs src/header_cleanser_local_python.py -* `run-local-sample` - runs src/header_cleanser_local.py + public String getGender() { + return gender; + } -These targets will activate the virtual environment and set up any configuration needed. -Use the `-n` option of `make` to see the detail of what is done to run the sample. + public void setGender(String gender) { + this.gender = gender; + } -For example, -```shell -make run-cli-sample -... + public Person(String name, int age, int height, int weight, String eyeColor, String gender) { + super(); + this.name = name; + this.age = age; + this.height = height; + this.weight = weight; + this.eyeColor = eyeColor; + this.gender = gender; + + logger.info("Created Person object with name '" + getName() + "'"); + } +} ``` -Then -```shell -ls output + +### Sample Output (with default parameters): +```java +package com.jstevenperry.intro; + +import java.util.logging.Logger; + +/// This is the main public class representing a Person +public class Person { + + private static final Logger logger = Logger.getLogger(Person.class.getName()); + + private String name; + private int age; + private int height; + private int weight; + private String eyeColor; + private String gender; + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public int getAge() { + return age; + } + + public void setAge(int age) { + this.age = age; + } + + public int getHeight() { + return height; + } + + public void setHeight(int height) { + this.height = height; + } + + public int getWeight() { + return weight; + } + + public void setWeight(int weight) { + this.weight = weight; + } + + public String getEyeColor() { + return eyeColor; + } + + public void setEyeColor(String eyeColor) { + this.eyeColor = eyeColor; + } + + public String getGender() { + return gender; + } + + public 
void setGender(String gender) { + this.gender = gender; + } + + public Person(String name, int age, int height, int weight, String eyeColor, String gender) { + super(); + this.name = name; + this.age = age; + this.height = height; + this.weight = weight; + this.eyeColor = eyeColor; + this.gender = gender; + + logger.info("Created Person object with name '" + getName() + "'"); + } +} ``` -To see results of the transform. + +## Sample Notebook + +Check out the [example notebook](notebooks/header_cleanser.ipynb) for further details. + ### Transforming data using the transform image From 79ee4e304aad403bd4988cfc264b4d17217eb35d Mon Sep 17 00:00:00 2001 From: SHAHROKH DAIJAVAD Date: Thu, 9 Jan 2025 10:56:50 -0800 Subject: [PATCH 3/3] Fixed README for broken links and the notebook Signed-off-by: SHAHROKH DAIJAVAD --- transforms/code/header_cleanser/header_cleanser.ipynb | 4 +++- transforms/code/header_cleanser/python/README.md | 10 +++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/transforms/code/header_cleanser/header_cleanser.ipynb b/transforms/code/header_cleanser/header_cleanser.ipynb index 0a56c186d..0ca50ca5f 100644 --- a/transforms/code/header_cleanser/header_cleanser.ipynb +++ b/transforms/code/header_cleanser/header_cleanser.ipynb @@ -15,7 +15,8 @@ "\n", "make venv \\\n", "source venv/bin/activate \\\n", - "pip install jupyterlab " + "pip install jupyterlab \\\n", + "./python/venv/bin/jupyter lab" ] }, { @@ -26,6 +27,7 @@ "source": [ "%%capture\n", "# Users and application developers must use the right tag for the latest version from pypi\n", + "!pip install scancode-toolkit\n", "!pip install data-prep-toolkit==0.2.2.dev2\n", "!pip install 'data-prep-toolkit-transforms[header_cleanser]==0.2.2.dev2'\n", "!pip install pandas" diff --git a/transforms/code/header_cleanser/python/README.md b/transforms/code/header_cleanser/python/README.md index 0210d3669..1ed1b920c 100644 --- a/transforms/code/header_cleanser/python/README.md +++ 
b/transforms/code/header_cleanser/python/README.md @@ -1,6 +1,6 @@ # Header cleanser Please see the set of -[transform project conventions](../../../README.md) +[transform project conventions](../../../README.md#transform-project-conventions) for details on general project conventions, transform configuration, testing and IDE set up. @@ -21,7 +21,7 @@ The set of dictionary keys holding configuration for values are as follows: * copyright - write 'true' to remove copyright from input data else 'false'. by default set as 'true'. ## Running -You can run the [header_cleanser_local.py](src/header_cleanser_local.py) (python-only implementation) or [header_cleanser_local_ray.py](ray/src/header_cleanser_local_ray.py) (ray-based implementation) to transform the `test1.parquet` file in [test input data](test-data/input) to an `output` directory. The directory will contain both the new annotated `test1.parquet` file and the `metadata.json` file. +You can run the [header_cleanser_local.py](src/header_cleanser_local.py) (python-only implementation) or [header_cleanser_local_ray.py](../ray/src/header_cleanser_local_ray.py) (ray-based implementation) to transform the `test1.parquet` file in [test input data](test-data/input) to an `output` directory. The directory will contain both the new annotated `test1.parquet` file and the `metadata.json` file. ## Running @@ -44,12 +44,12 @@ the [python launcher](../../../../data-processing-lib/doc/python-launcher-option - **File Format**: Parquet file containing code. - **Input Column**: The code should be in a column named `content`. - **Sample Input**: - [Sample Input File](transforms/code/header_cleanser/python/test-data/input/test1.parquet) + [Sample Input File](./test-data/input/test1.parquet) ### Output - **File Format**: Parquet file with the updated code in the same column. 
- **Sample Output**: - [Sample Output File](transforms/code/header_cleanser/python/test-data/expected/license-and-copyright-local/test1.parquet) + [Sample Output File](./test-data/expected/license-and-copyright-local/test1.parquet) ### CLI Syntax When invoking the CLI, use the following syntax for these parameters: @@ -237,7 +237,7 @@ public class Person { ## Sample Notebook -Check out the [example notebook](notebooks/header_cleanser.ipynb) for further details. +Check out the [example notebook](../header_cleanser.ipynb) for further details. ### Transforming data using the transform image