diff --git a/features/steps/cli_steps.py b/features/steps/cli_steps.py index 3db8716bd5..22b26a02a8 100644 --- a/features/steps/cli_steps.py +++ b/features/steps/cli_steps.py @@ -151,7 +151,7 @@ def create_config_file(context): context.root_project_dir = context.temp_dir / context.project_name context.package_name = context.project_name.replace("-", "_") config = { - "add_ons": "all", + "add_ons": "1-5", "project_name": context.project_name, "repo_name": context.project_name, "output_dir": str(context.temp_dir), diff --git a/kedro/framework/cli/starters.py b/kedro/framework/cli/starters.py index d61812c103..bdb4f49294 100644 --- a/kedro/framework/cli/starters.py +++ b/kedro/framework/cli/starters.py @@ -108,9 +108,10 @@ class KedroStarterSpec: # noqa: too-few-public-methods 3) Custom Logging: Provides more logging options\n 4) Documentation: Basic documentation setup with Sphinx\n 5) Data Structure: Provides a directory structure for storing data\n +6) Pyspark: Provides a basic PySpark set up\n Example usage:\n -kedro new --addons=lint,test,log,docs,data (or any subset of these options)\n +kedro new --addons=lint,test,log,docs,data,pyspark (or any subset of these options)\n kedro new --addons=all\n kedro new --addons=none """ @@ -121,6 +122,7 @@ class KedroStarterSpec: # noqa: too-few-public-methods "3": "Custom Logging", "4": "Documentation", "5": "Data Structure", + "6": "Pyspark", } # noqa: unused-argument @@ -210,7 +212,7 @@ def _validate_range(start, end): def _validate_selection(add_ons: list[str]): for add_on in add_ons: if int(add_on) < 1 or int(add_on) > len(ADD_ONS_DICT): - message = f"'{add_on}' is not a valid selection.\nPlease select from the available add-ons: 1, 2, 3, 4, 5." # nosec + message = f"'{add_on}' is not a valid selection.\nPlease select from the available add-ons: 1, 2, 3, 4, 5, 6." 
# nosec click.secho(message, fg="red", err=True) sys.exit(1) @@ -218,6 +220,9 @@ def _validate_selection(add_ons: list[str]): return list(ADD_ONS_DICT) if add_ons_str == "none": return [] + # Guard clause if add_ons_str is None, which can happen if prompts.yml is removed + if not add_ons_str: + return [] # pragma: no cover # Split by comma add_ons_choices = add_ons_str.split(",") @@ -318,7 +323,10 @@ def new(config_path, starter_alias, selected_addons, checkout, directory, **kwar config = _get_addons_from_cli_input(selected_addons, config) cookiecutter_args = _make_cookiecutter_args(config, checkout, directory) - _create_project(template_path, cookiecutter_args) + + project_template = fetch_template_based_on_add_ons(template_path, cookiecutter_args) + + _create_project(project_template, cookiecutter_args) @create_cli.group() @@ -366,7 +374,14 @@ def _get_addons_from_cli_input( Configuration for starting a new project, with the selected add-ons from the `--addons` flag. """ - string_to_number = {"lint": "1", "test": "2", "log": "3", "docs": "4", "data": "5"} + string_to_number = { + "lint": "1", + "test": "2", + "log": "3", + "docs": "4", + "data": "5", + "pyspark": "6", + } if selected_addons is not None: addons = selected_addons.split(",") @@ -393,14 +408,14 @@ def _select_prompts_to_display(prompts_required: dict, selected_addons: str) -> Returns: the prompts_required dictionary, with all the redundant information removed. 
def fetch_template_based_on_add_ons(
    template_path: str, cookiecutter_args: dict[str, Any]
) -> str:
    """Pick the cookiecutter template to render for the selected add-ons.

    When the add-ons stored under ``cookiecutter_args["extra_context"]``
    include ``"Pyspark"``, the project is generated from the
    ``spaceflights-pyspark`` directory of the ``kedro-starters`` repository
    instead of ``template_path``.

    Args:
        template_path: Template location to fall back to when the PySpark
            add-on is not selected.
        cookiecutter_args: Keyword arguments that will be passed on to
            cookiecutter. NOTE: this mapping is modified in place; its
            ``"directory"`` entry is overwritten with
            ``"spaceflights-pyspark"`` when the PySpark add-on is selected,
            and left untouched otherwise.

    Returns:
        The ``kedro-starters`` repository URL if the PySpark add-on is
        selected, otherwise ``template_path`` unchanged.
    """
    # Tolerate a missing or ``None`` "extra_context" instead of raising, so
    # that callers without add-on information simply keep their template.
    extra_context = cookiecutter_args.get("extra_context") or {}
    add_ons = extra_context.get("add_ons")

    # ``in`` is used on purpose: it matches both a list of add-on names and
    # a string that merely contains the name.
    if not add_ons or "Pyspark" not in add_ons:
        return template_path

    # Point cookiecutter at the PySpark starter inside the starters repo.
    cookiecutter_args["directory"] = "spaceflights-pyspark"
    return "git+https://github.com/kedro-org/kedro-starters.git"
@@ -524,8 +549,12 @@ def _create_project(template_path: str, cookiecutter_args: dict[str, Any]): ) add_ons = extra_context.get("add_ons") - # Only non-starter projects have configurable add-ons - if template_path == str(TEMPLATE_PATH): + # Only core template and spaceflights-pyspark have configurable add-ons + if ( + template_path == str(TEMPLATE_PATH) + or add_ons is not None + and "Pyspark" in add_ons + ): if add_ons == "[]": # TODO: This should be a list click.secho("\nYou have selected no add-ons") else: diff --git a/kedro/templates/project/hooks/post_gen_project.py b/kedro/templates/project/hooks/post_gen_project.py index 9e800b2249..4475c2f8f1 100644 --- a/kedro/templates/project/hooks/post_gen_project.py +++ b/kedro/templates/project/hooks/post_gen_project.py @@ -5,18 +5,19 @@ setup_template_add_ons, sort_requirements, ) -from kedro.framework.cli.starters import _parse_add_ons_input + def main(): current_dir = Path.cwd() requirements_file_path = current_dir / "requirements.txt" pyproject_file_path = current_dir / "pyproject.toml" + python_package_name = '{{ cookiecutter.python_package }}' # Get the selected add-ons from cookiecutter selected_add_ons = "{{ cookiecutter.add_ons }}" # Handle template directories and requirements according to selected add-ons - setup_template_add_ons(selected_add_ons, requirements_file_path, pyproject_file_path) + setup_template_add_ons(selected_add_ons, requirements_file_path, pyproject_file_path, python_package_name) # Sort requirements.txt file in alphabetical order sort_requirements(requirements_file_path) diff --git a/kedro/templates/project/hooks/utils.py b/kedro/templates/project/hooks/utils.py index 1411d6f8da..00a232bfa0 100644 --- a/kedro/templates/project/hooks/utils.py +++ b/kedro/templates/project/hooks/utils.py @@ -34,6 +34,7 @@ """ docs_pyproject_requirements = """ +[project.optional-dependencies] docs = [ "docutils<0.18.0", "sphinx~=3.4.3", @@ -48,7 +49,7 @@ """ -def setup_template_add_ons(selected_add_ons_list, 
requirements_file_path, pyproject_file_path): +def setup_template_add_ons(selected_add_ons_list, requirements_file_path, pyproject_file_path, python_package_name): """Removes directories and files related to unwanted addons from a Kedro project template. Adds the necessary requirements for the addons that were selected. @@ -96,6 +97,38 @@ def setup_template_add_ons(selected_add_ons_list, requirements_file_path, pyproj if data_path.exists(): shutil.rmtree(str(data_path)) + if "Pyspark" not in selected_add_ons_list: # If PySpark not selected + pass + else: # Use spaceflights-pyspark to create pyspark template + # Remove all .csv and .xlsx files from data/01_raw/ + raw_data_path = current_dir / "data/01_raw/" + if raw_data_path.exists() and raw_data_path.is_dir(): + for file_path in raw_data_path.glob("*.*"): + if file_path.suffix in [".csv", ".xlsx"]: + file_path.unlink() + + # Remove parameter files from conf/base/ + param_files = [ + "parameters_data_processing.yml", + "parameters_data_science.yml", + ] + conf_base_path = current_dir / "conf/base/" + if conf_base_path.exists() and conf_base_path.is_dir(): + for param_file in param_files: + file_path = conf_base_path / param_file + if file_path.exists(): + file_path.unlink() + + # Remove specific pipeline subdirectories + pipelines_path = current_dir / f"src/{python_package_name}/pipelines/" + for pipeline_subdir in ["data_science", "data_processing"]: + shutil.rmtree(pipelines_path / pipeline_subdir, ignore_errors=True) + + # Remove all test file from tests/pipelines/ + test_pipeline_path = current_dir / "tests/pipelines/test_data_science.py" + if test_pipeline_path.exists(): + test_pipeline_path.unlink() + def sort_requirements(requirements_file_path): """Sort the requirements.txt file in alphabetical order. 
diff --git a/kedro/templates/project/prompts.yml b/kedro/templates/project/prompts.yml index 2ef3c6c7d2..1cee3def2b 100644 --- a/kedro/templates/project/prompts.yml +++ b/kedro/templates/project/prompts.yml @@ -10,8 +10,9 @@ add_ons: 3) Custom Logging : Provides more logging options 4) Documentation: Provides basic documentations setup with Sphinx 5) Data Structure: Provides a directory structure for storing data + 6) PySpark : Provides a basic PySpark set up - Which add-ons would you like to include in your project? [1-5/1,3/all/none]: + Which add-ons would you like to include in your project? [1-6/1,3/all/none]: regex_validator: "^(all|none|(\\d(,\\d)*|(\\d-\\d)))$" error_message: | Invalid input. Please select valid options for add-ons using comma-separated values, ranges, or 'all/none'. diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/pyproject.toml b/kedro/templates/project/{{ cookiecutter.repo_name }}/pyproject.toml index 98963a7f3f..e4c007e3a4 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/pyproject.toml +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/pyproject.toml @@ -12,19 +12,6 @@ dynamic = ["dependencies", "version"] [project.entry-points."kedro.hooks"] -[project.optional-dependencies] -docs = [ - "docutils<0.18.0", - "sphinx~=3.4.3", - "sphinx_rtd_theme==0.5.1", - "nbsphinx==0.8.1", - "sphinx-autodoc-typehints==1.11.1", - "sphinx_copybutton==0.3.1", - "ipykernel>=5.3, <7.0", - "Jinja2<3.1.0", - "myst-parser~=0.17.2", -] - [tool.setuptools.dynamic] dependencies = {file = "requirements.txt"} version = {attr = "{{ cookiecutter.python_package }}.__version__"} diff --git a/tests/framework/cli/test_starters.py b/tests/framework/cli/test_starters.py index f0268e7e02..a6f99ea3f4 100644 --- a/tests/framework/cli/test_starters.py +++ b/tests/framework/cli/test_starters.py @@ -63,7 +63,14 @@ def _make_cli_prompt_input_without_addons( def _convert_addon_names_to_numbers(selected_addons: str): - string_to_number = 
{"lint": "1", "test": "2", "log": "3", "docs": "4", "data": "5"} + string_to_number = { + "lint": "1", + "test": "2", + "log": "3", + "docs": "4", + "data": "5", + "pyspark": "6", + } addons = selected_addons.split(",") for i in range(len(addons)): @@ -81,6 +88,7 @@ def _get_expected_files(add_ons: str): "3": 1, "4": 2, "5": 8, + "6": 2, } # files added to template by each add-on add_ons_list = _parse_add_ons_input(add_ons) @@ -269,7 +277,7 @@ def test_starter_list_with_invalid_starter_plugin( ("1,2,3", ["1", "2", "3"]), ("2-4", ["2", "3", "4"]), ("3-3", ["3"]), - ("all", ["1", "2", "3", "4", "5"]), + ("all", ["1", "2", "3", "4", "5", "6"]), ("none", []), ], ) @@ -291,12 +299,12 @@ def test_parse_add_ons_invalid_range(input, capsys): @pytest.mark.parametrize( "input,first_invalid", - [("0,3,5", "0"), ("1,3,6", "6"), ("0-4", "0"), ("3-6", "6")], + [("0,3,5", "0"), ("1,3,7", "7"), ("0-4", "0"), ("3-7", "7")], ) def test_parse_add_ons_invalid_selection(input, first_invalid, capsys): with pytest.raises(SystemExit): _parse_add_ons_input(input) - message = f"'{first_invalid}' is not a valid selection.\nPlease select from the available add-ons: 1, 2, 3, 4, 5." + message = f"'{first_invalid}' is not a valid selection.\nPlease select from the available add-ons: 1, 2, 3, 4, 5, 6." 
assert message in capsys.readouterr().err @@ -881,7 +889,7 @@ def test_directory_flag_with_starter_alias(self, fake_kedro_cli): class TestAddOnsFromUserPrompts: @pytest.mark.parametrize( "add_ons", - ["1", "2", "3", "4", "5", "none", "2,3,4", "3-5", "all"], + ["1", "2", "3", "4", "5", "6", "none", "2,3,4", "3-5", "all"], ) def test_valid_add_ons(self, fake_kedro_cli, add_ons): result = CliRunner().invoke( @@ -913,7 +921,7 @@ def test_invalid_add_ons(self, fake_kedro_cli): class TestAddOnsFromConfigFile: @pytest.mark.parametrize( "add_ons", - ["1", "2", "3", "4", "5", "none", "2,3,4", "3-5", "all"], + ["1", "2", "3", "4", "5", "6", "none", "2,3,4", "3-5", "all"], ) def test_valid_add_ons(self, fake_kedro_cli, add_ons): """Test project created from config.""" @@ -963,6 +971,7 @@ class TestAddOnsFromCLI: "log", "docs", "data", + "pyspark", "none", "test,log,docs", "test,data,lint", @@ -992,7 +1001,7 @@ def test_invalid_add_ons(self, fake_kedro_cli): assert result.exit_code != 0 assert ( - "Please select from the available add-ons: lint, test, log, docs, data, all, none" + "Please select from the available add-ons: lint, test, log, docs, data, pyspark, all, none" in result.output )