From fc51df839c80dd1ede4d44e6ff4f9e511378e75d Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Mon, 6 Jan 2025 22:01:23 +0530 Subject: [PATCH 01/10] Ignore scanning large data files Ignore scanning large data files which are larger than 1 MB to avoid crashing scans on memory spikes. Also rollback https://github.com/aboutcode-org/scancode.io/pull/1504 Reference: https://github.com/aboutcode-org/scancode-toolkit/issues/3711 Signed-off-by: Ayan Sinha Mahapatra --- scanpipe/pipes/flag.py | 1 + scanpipe/pipes/scancode.py | 29 +++++++++++++++++++---------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/scanpipe/pipes/flag.py b/scanpipe/pipes/flag.py index 89a99ef85..784ac3f97 100644 --- a/scanpipe/pipes/flag.py +++ b/scanpipe/pipes/flag.py @@ -43,6 +43,7 @@ IGNORED_DEFAULT_IGNORES = "ignored-default-ignores" IGNORED_DATA_FILE_NO_CLUES = "ignored-data-file-no-clues" IGNORED_DOC_FILE = "ignored-doc-file" +IGNORED_LARGE_DATA_FILE = "ignored-large-data-file" COMPLIANCE_LICENSES = "compliance-licenses" COMPLIANCE_SOURCEMIRROR = "compliance-sourcemirror" diff --git a/scanpipe/pipes/scancode.py b/scanpipe/pipes/scancode.py index 611f2d3b4..9dbea6d52 100644 --- a/scanpipe/pipes/scancode.py +++ b/scanpipe/pipes/scancode.py @@ -34,6 +34,7 @@ from django.apps import apps from django.conf import settings from django.db.models import ObjectDoesNotExist +from django.db.models import Q from commoncode import fileutils from commoncode.resource import VirtualCodebase @@ -58,6 +59,13 @@ Utilities to deal with ScanCode toolkit features and objects. 
""" +# skip large data files which are causing memory spikes +# See https://github.com/aboutcode-org/scancode-toolkit/issues/3711 +# TODO: this has to be removed once the issue is fixed +SKIP_DATA_FILE_EXTENSIONS = [".json", ".xml", ".yaml", ".yml"] +SKIP_DATA_FILE_SIZE = 1048576 + + scanpipe_app = apps.get_app_config("scanpipe") @@ -310,9 +318,17 @@ def scan_resources( if not scan_func_kwargs: scan_func_kwargs = {} - resource_count = resource_qs.count() + # Skip scannning data files larger than 1 MB + if not scan_func == scan_for_package_data: + resource_qs.filter( + Q(extension__in=SKIP_DATA_FILE_EXTENSIONS) + & Q(size__gte=SKIP_DATA_FILE_SIZE) + ).update(status=flag.IGNORED_LARGE_DATA_FILE) + scan_resource_qs = resource_qs.filter(~Q(status=flag.IGNORED_LARGE_DATA_FILE)) + + resource_count = scan_resource_qs.count() logger.info(f"Scan {resource_count} codebase resources with {scan_func.__name__}") - resource_iterator = resource_qs.iterator(chunk_size=2000) + resource_iterator = scan_resource_qs.iterator(chunk_size=2000) progress = LoopProgress(resource_count, logger=progress_logger) max_workers = get_max_workers(keep_available=1) @@ -350,14 +366,7 @@ def scan_resources( "Please ensure that there is at least 2 GB of available memory per " "CPU core for successful execution." ) - - resource.project.add_error( - exception=broken_pool_error, - model="scan_resources", - description=message, - object_instance=resource, - ) - continue + raise broken_pool_error from InsufficientResourcesError(message) save_func(resource, scan_results, scan_errors) From 6d6a6a1891fd88363b5a23e0a926acc13ebb332c Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Mon, 6 Jan 2025 23:57:53 +0530 Subject: [PATCH 02/10] Bump scancode-toolkit to version v32.3.1 Also remove platform constraints from rust-inspector and go-inspector. 
Signed-off-by: Ayan Sinha Mahapatra --- setup.cfg | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index 751a619b2..69bb0ff76 100644 --- a/setup.cfg +++ b/setup.cfg @@ -72,7 +72,7 @@ install_requires = # Docker container-inspector==33.0.0 # ScanCode-toolkit - scancode-toolkit[packages]==32.3.0 + scancode-toolkit[packages]==32.3.1 extractcode[full]==31.0.0 commoncode==32.1.0 packageurl-python==0.16.0 @@ -80,8 +80,8 @@ install_requires = fetchcode-container==1.2.3.210512; sys_platform == "linux" # Inspectors elf-inspector==0.0.1 - go-inspector==0.5.0; sys_platform == "linux" - rust-inspector==0.1.0; sys_platform == "linux" + go-inspector==0.5.0 + rust-inspector==0.1.0 python-inspector==0.12.1 source-inspector==0.5.1; sys_platform != "darwin" and platform_machine != "arm64" aboutcode-toolkit==11.0.0 From 0e005800b4a255e05a6debdb98de1e90c4aa18b8 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 7 Jan 2025 00:36:56 +0530 Subject: [PATCH 03/10] Increase size limit to skip scanning data file Signed-off-by: Ayan Sinha Mahapatra --- scanpipe/pipes/scancode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scanpipe/pipes/scancode.py b/scanpipe/pipes/scancode.py index 9dbea6d52..531f37390 100644 --- a/scanpipe/pipes/scancode.py +++ b/scanpipe/pipes/scancode.py @@ -63,7 +63,7 @@ # See https://github.com/aboutcode-org/scancode-toolkit/issues/3711 # TODO: this has to be removed once the issue is fixed SKIP_DATA_FILE_EXTENSIONS = [".json", ".xml", ".yaml", ".yml"] -SKIP_DATA_FILE_SIZE = 1048576 +SKIP_DATA_FILE_SIZE = 5242880 scanpipe_app = apps.get_app_config("scanpipe") @@ -318,7 +318,7 @@ def scan_resources( if not scan_func_kwargs: scan_func_kwargs = {} - # Skip scannning data files larger than 1 MB + # Skip scannning data files larger than 5 MB if not scan_func == scan_for_package_data: resource_qs.filter( Q(extension__in=SKIP_DATA_FILE_EXTENSIONS) From 89d3ee465f4c9b4a19cfa1d040a5c64bcd5f9602 
Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 7 Jan 2025 23:01:56 +0530 Subject: [PATCH 04/10] Add a scancodeio setting SCANCODEIO_SCAN_MAX_FILE_SIZE Signed-off-by: Ayan Sinha Mahapatra --- docs/application-settings.rst | 12 ++++++++++++ scancodeio/settings.py | 3 +++ scanpipe/pipes/flag.py | 18 +++++++++++++++++- scanpipe/pipes/scancode.py | 22 +++++++--------------- scanpipe/tests/pipes/test_scancode.py | 19 +++++++++++++++++++ 5 files changed, 58 insertions(+), 16 deletions(-) diff --git a/docs/application-settings.rst b/docs/application-settings.rst index 56d939bec..17528a736 100644 --- a/docs/application-settings.rst +++ b/docs/application-settings.rst @@ -165,6 +165,18 @@ The value unit is second and is defined as an integer:: Default: ``120`` (2 minutes) +SCANCODEIO_SCAN_MAX_FILE_SIZE +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Maximum file size allowed for a file to be scanned when scanning a codebase. + +The value unit is bytes and is defined as an integer, see the following +example of setting this at 5 MB:: + + SCANCODEIO_SCAN_MAX_FILE_SIZE=5242880 + +Default: ``None`` (all files will be scanned) + .. _scancodeio_settings_pipelines_dirs: SCANCODEIO_PIPELINES_DIRS diff --git a/scancodeio/settings.py b/scancodeio/settings.py index 517e8e613..7d09c9fa2 100644 --- a/scancodeio/settings.py +++ b/scancodeio/settings.py @@ -100,6 +100,9 @@ # Default to 2 minutes. SCANCODEIO_SCAN_FILE_TIMEOUT = env.int("SCANCODEIO_SCAN_FILE_TIMEOUT", default=120) +# Default to None which scans all files +SCANCODEIO_SCAN_MAX_FILE_SIZE = env.int("SCANCODEIO_SCAN_MAX_FILE_SIZE", default=None) + # List views pagination, controls the number of items displayed per page. 
# Syntax in .env: SCANCODEIO_PAGINATE_BY=project=10,project_error=10 SCANCODEIO_PAGINATE_BY = env.dict( diff --git a/scanpipe/pipes/flag.py b/scanpipe/pipes/flag.py index 784ac3f97..231398069 100644 --- a/scanpipe/pipes/flag.py +++ b/scanpipe/pipes/flag.py @@ -20,6 +20,8 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. +from django.db.models import Q + NO_STATUS = "" SCANNED = "scanned" @@ -43,7 +45,7 @@ IGNORED_DEFAULT_IGNORES = "ignored-default-ignores" IGNORED_DATA_FILE_NO_CLUES = "ignored-data-file-no-clues" IGNORED_DOC_FILE = "ignored-doc-file" -IGNORED_LARGE_DATA_FILE = "ignored-large-data-file" +IGNORED_BY_MAX_FILE_SIZE = "ignored-by-max-file-size" COMPLIANCE_LICENSES = "compliance-licenses" COMPLIANCE_SOURCEMIRROR = "compliance-sourcemirror" @@ -103,6 +105,20 @@ def flag_ignored_patterns(project, patterns): return update_count +def flag_and_ignore_files_over_max_size(resource_qs, file_size_limit): + """ + Flag codebase resources which are over the max file size for scanning + and return all other files within the file size limit. 
+ """ + if not file_size_limit: + return resource_qs + + resource_qs.filter(size__gte=file_size_limit).update( + status=IGNORED_BY_MAX_FILE_SIZE + ) + return resource_qs.filter(~Q(status=IGNORED_BY_MAX_FILE_SIZE)) + + def analyze_scanned_files(project): """Set the status for CodebaseResource to unknown or no license.""" scanned_files = project.codebaseresources.files().status(SCANNED) diff --git a/scanpipe/pipes/scancode.py b/scanpipe/pipes/scancode.py index 531f37390..596e69541 100644 --- a/scanpipe/pipes/scancode.py +++ b/scanpipe/pipes/scancode.py @@ -34,7 +34,6 @@ from django.apps import apps from django.conf import settings from django.db.models import ObjectDoesNotExist -from django.db.models import Q from commoncode import fileutils from commoncode.resource import VirtualCodebase @@ -59,12 +58,6 @@ Utilities to deal with ScanCode toolkit features and objects. """ -# skip large data files which are causing memory spikes -# See https://github.com/aboutcode-org/scancode-toolkit/issues/3711 -# TODO: this has to be removed once the issue is fixed -SKIP_DATA_FILE_EXTENSIONS = [".json", ".xml", ".yaml", ".yml"] -SKIP_DATA_FILE_SIZE = 5242880 - scanpipe_app = apps.get_app_config("scanpipe") @@ -318,17 +311,16 @@ def scan_resources( if not scan_func_kwargs: scan_func_kwargs = {} - # Skip scannning data files larger than 5 MB + # Skip scannning files larger than the specified max size if not scan_func == scan_for_package_data: - resource_qs.filter( - Q(extension__in=SKIP_DATA_FILE_EXTENSIONS) - & Q(size__gte=SKIP_DATA_FILE_SIZE) - ).update(status=flag.IGNORED_LARGE_DATA_FILE) - scan_resource_qs = resource_qs.filter(~Q(status=flag.IGNORED_LARGE_DATA_FILE)) + resource_qs = flag.flag_and_ignore_files_over_max_size( + resource_qs=resource_qs, + file_size_limit=settings.SCANCODEIO_SCAN_MAX_FILE_SIZE, + ) - resource_count = scan_resource_qs.count() + resource_count = resource_qs.count() logger.info(f"Scan {resource_count} codebase resources with {scan_func.__name__}") - 
resource_iterator = scan_resource_qs.iterator(chunk_size=2000) + resource_iterator = resource_qs.iterator(chunk_size=2000) progress = LoopProgress(resource_count, logger=progress_logger) max_workers = get_max_workers(keep_available=1) diff --git a/scanpipe/tests/pipes/test_scancode.py b/scanpipe/tests/pipes/test_scancode.py index 775e80bac..afbe0d07e 100644 --- a/scanpipe/tests/pipes/test_scancode.py +++ b/scanpipe/tests/pipes/test_scancode.py @@ -42,6 +42,7 @@ from scanpipe.models import DiscoveredPackage from scanpipe.models import Project from scanpipe.pipes import collect_and_create_codebase_resources +from scanpipe.pipes import flag from scanpipe.pipes import input from scanpipe.pipes import scancode from scanpipe.pipes.input import copy_input @@ -445,6 +446,24 @@ def test_scanpipe_pipes_scancode_run_scan_args(self, mock_run_scan): run_scan_kwargs = mock_run_scan.call_args.kwargs self.assertEqual(expected_processes, run_scan_kwargs.get("processes")) + def test_scanpipe_max_file_size_works(self): + with override_settings(SCANCODEIO_SCAN_MAX_FILE_SIZE=10000): + project1 = Project.objects.create(name="Analysis") + input_location = self.data / "d2d-rust" / "to-trustier-binary-linux.tar.gz" + project1.copy_input_from(input_location) + + run = project1.add_pipeline("scan_codebase") + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + self.assertEqual( + project1.codebaseresources.get( + path="to-trustier-binary-linux.tar.gz-extract/trustier" + ).status, + flag.IGNORED_BY_MAX_FILE_SIZE, + ) + def test_scanpipe_pipes_scancode_make_results_summary(self, regen=FIXTURES_REGEN): # Ensure the policies index is empty to avoid any side effect on results scanpipe_app.license_policies_index = None From 0ebe84b25bb40413dbe02a99b00e6b2b20c88b2e Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 7 Jan 2025 23:09:56 +0530 Subject: [PATCH 05/10] Use scancode with conda bugfix Signed-off-by: Ayan 
Sinha Mahapatra --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 69bb0ff76..5e2920f6e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -72,7 +72,7 @@ install_requires = # Docker container-inspector==33.0.0 # ScanCode-toolkit - scancode-toolkit[packages]==32.3.1 + scancode-toolkit[packages] @ git+https://github.com/nexB/scancode-toolkit.git@c9c023e7712044475578d03814f34bb03435d949 extractcode[full]==31.0.0 commoncode==32.1.0 packageurl-python==0.16.0 From ba93ac24c9557827d2852a58e7a7b1e2e4a7840c Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 7 Jan 2025 23:37:28 +0530 Subject: [PATCH 06/10] Address feedback Signed-off-by: Ayan Sinha Mahapatra --- scanpipe/pipes/flag.py | 4 +--- scanpipe/pipes/scancode.py | 9 ++++++--- scanpipe/tests/pipes/test_scancode.py | 8 +++----- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/scanpipe/pipes/flag.py b/scanpipe/pipes/flag.py index 231398069..814ba8d7a 100644 --- a/scanpipe/pipes/flag.py +++ b/scanpipe/pipes/flag.py @@ -20,7 +20,6 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. 
-from django.db.models import Q NO_STATUS = "" @@ -113,10 +112,9 @@ def flag_and_ignore_files_over_max_size(resource_qs, file_size_limit): if not file_size_limit: return resource_qs - resource_qs.filter(size__gte=file_size_limit).update( + return resource_qs.filter(size__gte=file_size_limit).update( status=IGNORED_BY_MAX_FILE_SIZE ) - return resource_qs.filter(~Q(status=IGNORED_BY_MAX_FILE_SIZE)) def analyze_scanned_files(project): diff --git a/scanpipe/pipes/scancode.py b/scanpipe/pipes/scancode.py index 596e69541..246e80b46 100644 --- a/scanpipe/pipes/scancode.py +++ b/scanpipe/pipes/scancode.py @@ -34,6 +34,7 @@ from django.apps import apps from django.conf import settings from django.db.models import ObjectDoesNotExist +from django.db.models import Q from commoncode import fileutils from commoncode.resource import VirtualCodebase @@ -313,14 +314,16 @@ def scan_resources( # Skip scannning files larger than the specified max size if not scan_func == scan_for_package_data: - resource_qs = flag.flag_and_ignore_files_over_max_size( + flag.flag_and_ignore_files_over_max_size( resource_qs=resource_qs, file_size_limit=settings.SCANCODEIO_SCAN_MAX_FILE_SIZE, ) - resource_count = resource_qs.count() + scan_resource_qs = resource_qs.filter(~Q(status=flag.IGNORED_BY_MAX_FILE_SIZE)) + + resource_count = scan_resource_qs.count() logger.info(f"Scan {resource_count} codebase resources with {scan_func.__name__}") - resource_iterator = resource_qs.iterator(chunk_size=2000) + resource_iterator = scan_resource_qs.iterator(chunk_size=2000) progress = LoopProgress(resource_count, logger=progress_logger) max_workers = get_max_workers(keep_available=1) diff --git a/scanpipe/tests/pipes/test_scancode.py b/scanpipe/tests/pipes/test_scancode.py index afbe0d07e..e15667443 100644 --- a/scanpipe/tests/pipes/test_scancode.py +++ b/scanpipe/tests/pipes/test_scancode.py @@ -457,12 +457,10 @@ def test_scanpipe_max_file_size_works(self): exitcode, out = pipeline.execute() self.assertEqual(0, 
exitcode, msg=out) - self.assertEqual( - project1.codebaseresources.get( - path="to-trustier-binary-linux.tar.gz-extract/trustier" - ).status, - flag.IGNORED_BY_MAX_FILE_SIZE, + resource1 = project1.codebaseresources.get( + path="to-trustier-binary-linux.tar.gz-extract/trustier" ) + self.assertEqual(resource1.status, flag.IGNORED_BY_MAX_FILE_SIZE) def test_scanpipe_pipes_scancode_make_results_summary(self, regen=FIXTURES_REGEN): # Ensure the policies index is empty to avoid any side effect on results From eb7157eff2b26d9d0218c0b45359916f5965f88e Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Mon, 20 Jan 2025 20:55:43 +0100 Subject: [PATCH 07/10] Bump scancode-toolkit to v32.3.2 Signed-off-by: Ayan Sinha Mahapatra --- .../data/manifests/openpdf-parent-1.3.11_scan_package.json | 2 +- scanpipe/tests/data/scancode/is-npm-1.0.0_scan_package.json | 2 +- .../tests/data/scancode/multiple-is-npm-1.0.0_scan_package.json | 2 +- setup.cfg | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scanpipe/tests/data/manifests/openpdf-parent-1.3.11_scan_package.json b/scanpipe/tests/data/manifests/openpdf-parent-1.3.11_scan_package.json index 5db7295b7..b8ff95d88 100644 --- a/scanpipe/tests/data/manifests/openpdf-parent-1.3.11_scan_package.json +++ b/scanpipe/tests/data/manifests/openpdf-parent-1.3.11_scan_package.json @@ -19,7 +19,7 @@ "errors": [], "warnings": [], "extra_data": { - "spdx_license_list_version": "3.25", + "spdx_license_list_version": "3.26", "files_count": 1 } } diff --git a/scanpipe/tests/data/scancode/is-npm-1.0.0_scan_package.json b/scanpipe/tests/data/scancode/is-npm-1.0.0_scan_package.json index 3081eadbd..24c18299c 100644 --- a/scanpipe/tests/data/scancode/is-npm-1.0.0_scan_package.json +++ b/scanpipe/tests/data/scancode/is-npm-1.0.0_scan_package.json @@ -19,7 +19,7 @@ "errors": [], "warnings": [], "extra_data": { - "spdx_license_list_version": "3.25", + "spdx_license_list_version": "3.26", "files_count": 3 } } diff --git 
a/scanpipe/tests/data/scancode/multiple-is-npm-1.0.0_scan_package.json b/scanpipe/tests/data/scancode/multiple-is-npm-1.0.0_scan_package.json index c560a27e6..6ed69bf23 100644 --- a/scanpipe/tests/data/scancode/multiple-is-npm-1.0.0_scan_package.json +++ b/scanpipe/tests/data/scancode/multiple-is-npm-1.0.0_scan_package.json @@ -19,7 +19,7 @@ "errors": [], "warnings": [], "extra_data": { - "spdx_license_list_version": "3.25", + "spdx_license_list_version": "3.26", "files_count": 6 } } diff --git a/setup.cfg b/setup.cfg index 5e2920f6e..e762a17b3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -72,7 +72,7 @@ install_requires = # Docker container-inspector==33.0.0 # ScanCode-toolkit - scancode-toolkit[packages] @ git+https://github.com/nexB/scancode-toolkit.git@c9c023e7712044475578d03814f34bb03435d949 + scancode-toolkit[packages]==32.3.2 extractcode[full]==31.0.0 commoncode==32.1.0 packageurl-python==0.16.0 From 653d804e561820432255aec2a7847c5951b061b4 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 21 Jan 2025 10:47:17 +0100 Subject: [PATCH 08/10] Add scan_max_file_size to project settings Signed-off-by: Ayan Sinha Mahapatra --- scanpipe/forms.py | 10 ++++++++++ scanpipe/models.py | 10 ++++++++++ scanpipe/pipes/scancode.py | 19 +++++++++++++++---- scanpipe/tests/test_forms.py | 2 ++ 4 files changed, 37 insertions(+), 4 deletions(-) diff --git a/scanpipe/forms.py b/scanpipe/forms.py index d8c539258..ad8cb5277 100644 --- a/scanpipe/forms.py +++ b/scanpipe/forms.py @@ -416,6 +416,7 @@ class ProjectSettingsForm(forms.ModelForm): "ignored_vulnerabilities", "policies", "attribution_template", + "scan_max_file_size", "product_name", "product_version", ] @@ -490,6 +491,15 @@ class ProjectSettingsForm(forms.ModelForm): ), widget=forms.Textarea(attrs={"class": "textarea is-dynamic", "rows": 3}), ) + scan_max_file_size = forms.IntegerField( + label="Max file size to scan", + required=False, + help_text=( + "Maximum file size in bytes which should be skipped from 
scanning." + "File size is in bytes. Example: 5 MB is 5242880 bytes." + ), + widget=forms.NumberInput(attrs={"class": "input"}), + ) product_name = forms.CharField( label="Product name", required=False, diff --git a/scanpipe/models.py b/scanpipe/models.py index e419a79e2..878056aca 100644 --- a/scanpipe/models.py +++ b/scanpipe/models.py @@ -918,6 +918,16 @@ def get_ignored_dependency_scopes_index(self): return dict(ignored_scope_index) + @cached_property + def get_scan_max_file_size(self): + """ + Return a the ``scan_max_file_size`` settings value defined in this + Project env. + """ + scan_max_file_size = self.get_env(field_name="scan_max_file_size") + if scan_max_file_size: + return scan_max_file_size + @cached_property def ignored_dependency_scopes_index(self): """ diff --git a/scanpipe/pipes/scancode.py b/scanpipe/pipes/scancode.py index 246e80b46..13468fd50 100644 --- a/scanpipe/pipes/scancode.py +++ b/scanpipe/pipes/scancode.py @@ -293,6 +293,7 @@ def scan_resources( save_func, scan_func_kwargs=None, progress_logger=None, + file_size_limit=None, ): """ Run the `scan_func` on the codebase resources of the provided `resource_qs`. 
@@ -313,10 +314,13 @@ def scan_resources( scan_func_kwargs = {} # Skip scannning files larger than the specified max size - if not scan_func == scan_for_package_data: - flag.flag_and_ignore_files_over_max_size( - resource_qs=resource_qs, - file_size_limit=settings.SCANCODEIO_SCAN_MAX_FILE_SIZE, + skipped_files_max_size = flag.flag_and_ignore_files_over_max_size( + resource_qs=resource_qs, + file_size_limit=file_size_limit, + ) + if file_size_limit and skipped_files_max_size: + logger.info( + f"Skipped {skipped_files_max_size} files over the size of {file_size_limit}" ) scan_resource_qs = resource_qs.filter(~Q(status=flag.IGNORED_BY_MAX_FILE_SIZE)) @@ -378,11 +382,18 @@ def scan_for_files(project, resource_qs=None, progress_logger=None): if resource_qs is None: resource_qs = project.codebaseresources.no_status() + # Get max file size limit set in project settings, or alternatively + # get it from scancodeio settings + file_size_limit = project.get_scan_max_file_size + if not file_size_limit: + file_size_limit = settings.SCANCODEIO_SCAN_MAX_FILE_SIZE + scan_resources( resource_qs=resource_qs, scan_func=scan_file, save_func=save_scan_file_results, progress_logger=progress_logger, + file_size_limit=file_size_limit, ) diff --git a/scanpipe/tests/test_forms.py b/scanpipe/tests/test_forms.py index b46e312b8..5d352ef0b 100644 --- a/scanpipe/tests/test_forms.py +++ b/scanpipe/tests/test_forms.py @@ -142,6 +142,7 @@ def test_scanpipe_forms_project_settings_form_update_project_settings(self): "ignored_vulnerabilities": None, "policies": "", "ignored_dependency_scopes": None, + "scan_max_file_size": None, "product_name": "", "product_version": "", "attribution_template": "", @@ -194,6 +195,7 @@ def test_scanpipe_forms_project_settings_form_ignored_dependency_scopes(self): {"package_type": "npm", "scope": "devDependencies"}, {"package_type": "pypi", "scope": "tests"}, ], + "scan_max_file_size": None, "attribution_template": "", "policies": "", "product_name": "", From 
74e1dd24ca82a821d3c7adc31f8fcac00117442c Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 21 Jan 2025 11:07:33 +0100 Subject: [PATCH 09/10] Update CHANGELOG and docs on project settings Signed-off-by: Ayan Sinha Mahapatra --- CHANGELOG.rst | 10 ++++++++++ docs/project-configuration.rst | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 043ca3723..ea42ec02a 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -43,6 +43,16 @@ v34.9.4 (unreleased) sheets with a dedicated VULNERABILITIES sheet. https://github.com/aboutcode-org/scancode.io/issues/1519 +- Update scancode-toolkit to v32.3.2. See CHANGELOG for updates: + https://github.com/aboutcode-org/scancode-toolkit/releases/tag/v32.3.2 + https://github.com/aboutcode-org/scancode-toolkit/releases/tag/v32.3.1 + +- Adds a project settings `scan_max_file_size` and a scancode.io settings field + `SCANCODEIO_SCAN_MAX_FILE_SIZE` to skip scanning files above a certain + file size (in bytes) as a temporary fix for large memory spikes while + scanning for licenses in certain large files. + https://github.com/aboutcode-org/scancode-toolkit/issues/3711 + v34.9.3 (2024-12-31) -------------------- diff --git a/docs/project-configuration.rst b/docs/project-configuration.rst index b41a55644..e7031d4f7 100644 --- a/docs/project-configuration.rst +++ b/docs/project-configuration.rst @@ -54,6 +54,7 @@ Content of a ``scancode-config.yml`` file: ignored_patterns: - '*.tmp' - 'tests/*' + scan_max_file_size: 5242880 ignored_dependency_scopes: - package_type: npm scope: devDependencies @@ -86,6 +87,24 @@ product_version The product version of this project, as specified within the DejaCode application. +scan_max_file_size +^^^^^^^^^^^^^^^^^^ + +Maximum file size allowed for a file to be scanned when scanning a codebase. 
+ +The value unit is bytes and is defined as an integer, see the following +example of setting this at 5 MB:: + + scan_max_file_size: 5242880 + +Default is ``None``, in which case all files will be scanned. + +.. note:: + This is the same as the scancodeio setting ``SCANCODEIO_SCAN_MAX_FILE_SIZE`` + set using the .env file, and the project setting ``scan_max_file_size`` takes + precedence over the scancodeio setting ``SCANCODEIO_SCAN_MAX_FILE_SIZE``. + + ignored_patterns ^^^^^^^^^^^^^^^^ From 782785c4c04f4c311607b746f74716601c003297 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 21 Jan 2025 11:15:52 +0100 Subject: [PATCH 10/10] Add scan_max_file_size to project settings UI Signed-off-by: Ayan Sinha Mahapatra --- scanpipe/templates/scanpipe/project_settings.html | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scanpipe/templates/scanpipe/project_settings.html b/scanpipe/templates/scanpipe/project_settings.html index 9bc32b230..562eeeaf3 100644 --- a/scanpipe/templates/scanpipe/project_settings.html +++ b/scanpipe/templates/scanpipe/project_settings.html @@ -114,6 +114,18 @@ +
+ +
+ {{ form.scan_max_file_size }} +
+
+ {{ form.scan_max_file_size.help_text|safe|linebreaksbr }} +
+
+