diff --git a/transforms/code/header_cleanser/header_cleanser.ipynb b/transforms/code/header_cleanser/header_cleanser.ipynb new file mode 100644 index 000000000..0ca50ca5f --- /dev/null +++ b/transforms/code/header_cleanser/header_cleanser.ipynb @@ -0,0 +1,284 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Header_cleanser Transform Sample Notebook***" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "\n", + "make venv \\\n", + "source venv/bin/activate \\\n", + "pip install jupyterlab \\\n", + "./python/venv/bin/jupyter lab" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "# Users and application developers must use the right tag for the latest version from pypi\n", + "!pip install scancode-toolkit\n", + "!pip install data-prep-toolkit==0.2.2.dev2\n", + "!pip install 'data-prep-toolkit-transforms[header_cleanser]==0.2.2.dev2'\n", + "!pip install pandas" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure the transform parameters. \n", + "* Define the transform parameters required for processing. Below are the parameters specific to the Header Cleanser Transform: \n", + "\n", + " * header_cleanser_contents_column_name: Column containing code to cleanse (default: contents).\n", + " * header_cleanser_copyright: Whether to remove copyright headers (default: True).\n", + " * header_cleanser_license: Whether to remove license headers (default: True)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from data_processing.utils import ParamsUtils\n", + "from header_cleanser_transform_python import HeaderCleanserPythonTransformConfiguration" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Specify input/output folders and parameters***" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Input/output configuration\n", + "local_conf = {\n", + " \"input_folder\": \"path/to/your/input/folder\", # Adjust path for input files\n", + " \"output_folder\": \"path/to/your/output/folder\", # Adjust path for output files\n", + "}\n", + "\n", + "# Parameters for the transform\n", + "params = {\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " \"header_cleanser_contents_column_name\": \"contents\",\n", + " \"header_cleanser_copyright\": True,\n", + " \"header_cleanser_license\": True\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Invoke the header_cleanser transformation***\n", + "* Launch the transform using the PythonTransformLauncher." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "11:05:41 INFO - pipeline id pipeline_id\n", + "11:05:41 INFO - code location None\n", + "11:05:41 INFO - data factory data_ is using local data access: input_folder - path/to/your/input/folder output_folder - path/to/your/output/folder\n", + "11:05:41 INFO - data factory data_ max_files -1, n_sample -1\n", + "11:05:41 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:05:41 INFO - orchestrator header_cleanser started at 2025-01-09 11:05:41\n", + "11:05:41 ERROR - No input files to process - exiting\n", + "11:05:41 INFO - Completed execution in 0.0 min, execution result 0\n" + ] + } + ], + "source": [ + "import sys\n", + "sys.argv = ParamsUtils.dict_to_req(d=(params)) \n", + "# create launcher\n", + "launcher = PythonTransformLauncher(HeaderCleanserPythonTransformConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Checking the output Parquet file***" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
contents
0<?xml version=\"1.0\" encoding=\"UTF-8\"?>\\n<!--\\n...
1/*\\n * Copyright 2018 Makoto Consulting Group,...
2<?xml version=\"1.0\" encoding=\"UTF-8\"?>\\n\\n<!--...
3/*\\n Copyright 2018 Makoto Consulting Group,...
4# Copyright 2016 The TensorFlow Authors. All R...
5<?xml version=\"1.0\" encoding=\"UTF-8\"?>\\n\\n<!--...
6/*\\n * Licensed under the Apache License, Vers...
7#! \\n#\\n# Script to run the DataCreator progra...
8#!/bin/bash\\n\\n###############################...
9# Copyright IBM Corp. and others 2018\\n#\\n# Th...
\n", + "
" + ], + "text/plain": [ + " contents\n", + "0 \\n\\n\\n\\tAIX64\\n\\tAIX64\\n\\tppc\\n\\taix\\n\\tSidecar\\n\\tdesktop (256M + big OS stack)\\n\\t100\\n\\t\\n\\t\\tpaul_church@ca.ibm.com\\n\\t\\n\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\n\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\n\\t\\n\\t\\t\\n\\t\\t\\n\\t\\n\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\
\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\t\\n\\t\\n\\n'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table.to_pandas()['contents'][0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Notes for Users and Developers\n", + "1. Ensure that your input files are placed in the specified input_folder path.\n", + " * For sample input files, refer to the python/test-data/input folder.\n", + "2. Use the latest tagged version from PyPI for stability.\n", + "3. Transform parameters can be customized as per requirements. Update params accordingly." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/transforms/code/header_cleanser/python/README.md b/transforms/code/header_cleanser/python/README.md index 405ef9b55..1ed1b920c 100644 --- a/transforms/code/header_cleanser/python/README.md +++ b/transforms/code/header_cleanser/python/README.md @@ -1,14 +1,16 @@ # Header cleanser Please see the set of -[transform project conventions](../../../README.md) +[transform project conventions](../../../README.md#transform-project-conventions) for details on general project conventions, transform configuration, testing and IDE set up. -## Summary +## Contributors -This module is designed to detect and remove license and copyright information from code files. 
It leverages the [ScanCode Toolkit](https://pypi.org/project/scancode-toolkit/) to accurately identify and process licenses and copyrights in various programming languages. +- Yash Kalathiya (yashkalathiya164@gmail.com) -After locating the position of license or copyright in the input code/sample, this module delete/remove those lines and returns the updated code as parquet file. +## Description + +The **Header Cleanser** module is a versatile tool designed to remove license and copyright headers from code files. It supports over 90 programming languages and utilizes the [ScanCode Toolkit](https://scancode-toolkit.readthedocs.io/en/stable/getting-started/install.html) to identify license and copyright information within the codebase. ## Configuration and command line Options @@ -19,7 +21,7 @@ The set of dictionary keys holding configuration for values are as follows: * copyright - write 'true' to remove copyright from input data else 'false'. by default set as 'true'. ## Running -You can run the [header_cleanser_local.py](src/header_cleanser_local.py) (python-only implementation) or [header_cleanser_local_ray.py](ray/src/header_cleanser_local_ray.py) (ray-based implementation) to transform the `test1.parquet` file in [test input data](test-data/input) to an `output` directory. The directory will contain both the new annotated `test1.parquet` file and the `metadata.json` file. +You can run the [header_cleanser_local.py](src/header_cleanser_local.py) (python-only implementation) or [header_cleanser_local_ray.py](../ray/src/header_cleanser_local_ray.py) (ray-based implementation) to transform the `test1.parquet` file in [test input data](test-data/input) to an `output` directory. The directory will contain both the new annotated `test1.parquet` file and the `metadata.json` file. ## Running @@ -28,29 +30,215 @@ When running the transform with the Ray launcher (i.e. 
TransformLauncher), the following command line arguments are available in addition to the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md). * --header_cleanser_contents_column_name - set the contents_column_name configuration key. +* --header_cleanser_document_id_column_name - set the document_id_column_name configuration key. * --header_cleanser_license - set the license configuration key. * --header_cleanser_copyright - set the copyright configuration key. +* --header_cleanser_n_processes - set the n_processes configuration key. +* --header_cleanser_tmp_dir - set the tmp_dir configuration key. +* --header_cleanser_timeout - set the timeout configuration key. +* --header_cleanser_skip_timeout - set the skip_timeout configuration key. + +## Input and Output + +### Input +- **File Format**: Parquet file containing code. +- **Input Column**: The code should be in a column named `contents`. +- **Sample Input**: + [Sample Input File](./test-data/input/test1.parquet) + +### Output +- **File Format**: Parquet file with the updated code in the same column. +- **Sample Output**: + [Sample Output File](./test-data/expected/license-and-copyright-local/test1.parquet) + +### CLI Syntax +When invoking the CLI, use the following syntax for these parameters: +``` +--header_cleanser_<parameter_name> +``` +For example: +``` +--header_cleanser_contents_column_name='contents' +``` + +## Example + +### Sample Input Code: +```java +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.jstevenperry.intro; + +import java.util.logging.Logger; + +// This is the main public class representing a Person +public class Person { + private static final Logger logger = Logger.getLogger(Person.class.getName()); + + private String name; + private int age; + private int height; + private int weight; + private String eyeColor; + private String gender; + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public int getAge() { + return age; + } + + public void setAge(int age) { + this.age = age; + } + + public int getHeight() { + return height; + } + + public void setHeight(int height) { + this.height = height; + } + + public int getWeight() { + return weight; + } + + public void setWeight(int weight) { + this.weight = weight; + } + + public String getEyeColor() { + return eyeColor; + } -### Running the samples -To run the samples, use the following `make` targets + public void setEyeColor(String eyeColor) { + this.eyeColor = eyeColor; + } -* `run-cli-sample` - runs src/header_cleanser_transform_python.py using command line args -* `run-local-python-sample` - runs src/header_cleanser_local_python.py -* `run-local-sample` - runs src/header_cleanser_local.py + public String getGender() { + return gender; + } -These targets will activate the virtual environment and set up any configuration needed. -Use the `-n` option of `make` to see the detail of what is done to run the sample. + public void setGender(String gender) { + this.gender = gender; + } -For example, -```shell -make run-cli-sample -... 
+ public Person(String name, int age, int height, int weight, String eyeColor, String gender) { + super(); + this.name = name; + this.age = age; + this.height = height; + this.weight = weight; + this.eyeColor = eyeColor; + this.gender = gender; + + logger.info("Created Person object with name '" + getName() + "'"); + } +} ``` -Then -```shell -ls output + +### Sample Output (with default parameters): +```java +package com.jstevenperry.intro; + +import java.util.logging.Logger; + +/// This is the main public class representing a Person +public class Person { + + private static final Logger logger = Logger.getLogger(Person.class.getName()); + + private String name; + private int age; + private int height; + private int weight; + private String eyeColor; + private String gender; + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public int getAge() { + return age; + } + + public void setAge(int age) { + this.age = age; + } + + public int getHeight() { + return height; + } + + public void setHeight(int height) { + this.height = height; + } + + public int getWeight() { + return weight; + } + + public void setWeight(int weight) { + this.weight = weight; + } + + public String getEyeColor() { + return eyeColor; + } + + public void setEyeColor(String eyeColor) { + this.eyeColor = eyeColor; + } + + public String getGender() { + return gender; + } + + public void setGender(String gender) { + this.gender = gender; + } + + public Person(String name, int age, int height, int weight, String eyeColor, String gender) { + super(); + this.name = name; + this.age = age; + this.height = height; + this.weight = weight; + this.eyeColor = eyeColor; + this.gender = gender; + + logger.info("Created Person object with name '" + getName() + "'"); + } +} ``` -To see results of the transform. + +## Sample Notebook + +Check out the [example notebook](../header_cleanser.ipynb) for further details. 
+ ### Transforming data using the transform image