Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ignore large data files and bump scancode-toolkit #1508

Merged
merged 13 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions docs/application-settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,18 @@ The value unit is second and is defined as an integer::

Default: ``120`` (2 minutes)

SCANCODEIO_SCAN_MAX_FILE_SIZE
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Maximum file size allowed for a file to be scanned when scanning a codebase.

The value unit is bytes and is defined as an integer. The following example
sets the limit to 5 MB::

SCANCODEIO_SCAN_MAX_FILE_SIZE=5242880

Default: ``None`` (all files will be scanned)

.. _scancodeio_settings_pipelines_dirs:

SCANCODEIO_PIPELINES_DIRS
Expand Down
3 changes: 3 additions & 0 deletions scancodeio/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@
# Default to 2 minutes.
SCANCODEIO_SCAN_FILE_TIMEOUT = env.int("SCANCODEIO_SCAN_FILE_TIMEOUT", default=120)

# Maximum size, in bytes, of a file to be scanned.
# Default to None, which scans all files.
SCANCODEIO_SCAN_MAX_FILE_SIZE = env.int("SCANCODEIO_SCAN_MAX_FILE_SIZE", default=None)

# List views pagination, controls the number of items displayed per page.
# Syntax in .env: SCANCODEIO_PAGINATE_BY=project=10,project_error=10
SCANCODEIO_PAGINATE_BY = env.dict(
Expand Down
10 changes: 10 additions & 0 deletions scanpipe/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,7 @@ class ProjectSettingsForm(forms.ModelForm):
"ignored_vulnerabilities",
"policies",
"attribution_template",
"scan_max_file_size",
"product_name",
"product_version",
]
Expand Down Expand Up @@ -490,6 +491,15 @@ class ProjectSettingsForm(forms.ModelForm):
),
widget=forms.Textarea(attrs={"class": "textarea is-dynamic", "rows": 3}),
)
# Optional integer field for the project-level max file size setting.
# The adjacent help_text literals are concatenated by Python, so the first
# one must end with a space to keep the sentences separated when rendered.
scan_max_file_size = forms.IntegerField(
    label="Max file size to scan",
    required=False,
    help_text=(
        "Maximum file size allowed for a file to be scanned. "
        "File size is in bytes. Example: 5 MB is 5242880 bytes."
    ),
    widget=forms.NumberInput(attrs={"class": "input"}),
)
product_name = forms.CharField(
label="Product name",
required=False,
Expand Down
10 changes: 10 additions & 0 deletions scanpipe/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -918,6 +918,16 @@ def get_ignored_dependency_scopes_index(self):

return dict(ignored_scope_index)

@cached_property
def get_scan_max_file_size(self):
    """
    Return the ``scan_max_file_size`` settings value defined in this
    Project env, or ``None`` when that value is missing or falsy.
    """
    # ``or None`` collapses every falsy value (``None``, ``0``, ``""``) to None.
    return self.get_env(field_name="scan_max_file_size") or None

@cached_property
def ignored_dependency_scopes_index(self):
"""
Expand Down
15 changes: 15 additions & 0 deletions scanpipe/pipes/flag.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.


NO_STATUS = ""

SCANNED = "scanned"
Expand All @@ -43,6 +44,7 @@
IGNORED_DEFAULT_IGNORES = "ignored-default-ignores"
IGNORED_DATA_FILE_NO_CLUES = "ignored-data-file-no-clues"
IGNORED_DOC_FILE = "ignored-doc-file"
IGNORED_BY_MAX_FILE_SIZE = "ignored-by-max-file-size"

COMPLIANCE_LICENSES = "compliance-licenses"
COMPLIANCE_SOURCEMIRROR = "compliance-sourcemirror"
Expand Down Expand Up @@ -102,6 +104,19 @@ def flag_ignored_patterns(project, patterns):
return update_count


def flag_and_ignore_files_over_max_size(resource_qs, file_size_limit):
    """
    Flag the codebase resources of ``resource_qs`` whose ``size`` is greater
    than or equal to ``file_size_limit`` with the ``IGNORED_BY_MAX_FILE_SIZE``
    status, so that they can be excluded from scanning.

    Return the value of the ``update()`` call on the filtered resources
    (the count of flagged resources, assuming ``resource_qs`` is a Django
    QuerySet), or ``0`` when ``file_size_limit`` is not set, in which case
    nothing is flagged.
    """
    if not file_size_limit:
        # No limit configured: nothing to flag. Return a count rather than the
        # queryset so that the return type is the same on both code paths.
        return 0

    oversized_qs = resource_qs.filter(size__gte=file_size_limit)
    return oversized_qs.update(status=IGNORED_BY_MAX_FILE_SIZE)


def analyze_scanned_files(project):
"""Set the status for CodebaseResource to unknown or no license."""
scanned_files = project.codebaseresources.files().status(SCANNED)
Expand Down
35 changes: 25 additions & 10 deletions scanpipe/pipes/scancode.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from django.apps import apps
from django.conf import settings
from django.db.models import ObjectDoesNotExist
from django.db.models import Q

from commoncode import fileutils
from commoncode.resource import VirtualCodebase
Expand All @@ -58,6 +59,7 @@
Utilities to deal with ScanCode toolkit features and objects.
"""


scanpipe_app = apps.get_app_config("scanpipe")


Expand Down Expand Up @@ -291,6 +293,7 @@ def scan_resources(
save_func,
scan_func_kwargs=None,
progress_logger=None,
file_size_limit=None,
):
"""
Run the `scan_func` on the codebase resources of the provided `resource_qs`.
Expand All @@ -310,9 +313,21 @@ def scan_resources(
if not scan_func_kwargs:
scan_func_kwargs = {}

resource_count = resource_qs.count()
# Skip scanning files larger than the specified max size
skipped_files_max_size = flag.flag_and_ignore_files_over_max_size(
resource_qs=resource_qs,
file_size_limit=file_size_limit,
)
if file_size_limit and skipped_files_max_size:
logger.info(
f"Skipped {skipped_files_max_size} files over the size of {file_size_limit}"
)

scan_resource_qs = resource_qs.filter(~Q(status=flag.IGNORED_BY_MAX_FILE_SIZE))

resource_count = scan_resource_qs.count()
logger.info(f"Scan {resource_count} codebase resources with {scan_func.__name__}")
resource_iterator = resource_qs.iterator(chunk_size=2000)
resource_iterator = scan_resource_qs.iterator(chunk_size=2000)
progress = LoopProgress(resource_count, logger=progress_logger)
max_workers = get_max_workers(keep_available=1)

Expand Down Expand Up @@ -350,14 +365,7 @@ def scan_resources(
"Please ensure that there is at least 2 GB of available memory per "
"CPU core for successful execution."
)

resource.project.add_error(
exception=broken_pool_error,
model="scan_resources",
description=message,
object_instance=resource,
)
continue
raise broken_pool_error from InsufficientResourcesError(message)
AyanSinhaMahapatra marked this conversation as resolved.
Show resolved Hide resolved

save_func(resource, scan_results, scan_errors)

Expand All @@ -374,11 +382,18 @@ def scan_for_files(project, resource_qs=None, progress_logger=None):
if resource_qs is None:
resource_qs = project.codebaseresources.no_status()

# Get max file size limit set in project settings, or alternatively
# get it from scancodeio settings
file_size_limit = project.get_scan_max_file_size
if not file_size_limit:
file_size_limit = settings.SCANCODEIO_SCAN_MAX_FILE_SIZE

scan_resources(
resource_qs=resource_qs,
scan_func=scan_file,
save_func=save_scan_file_results,
progress_logger=progress_logger,
file_size_limit=file_size_limit,
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"errors": [],
"warnings": [],
"extra_data": {
"spdx_license_list_version": "3.25",
"spdx_license_list_version": "3.26",
"files_count": 1
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"errors": [],
"warnings": [],
"extra_data": {
"spdx_license_list_version": "3.25",
"spdx_license_list_version": "3.26",
"files_count": 3
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"errors": [],
"warnings": [],
"extra_data": {
"spdx_license_list_version": "3.25",
"spdx_license_list_version": "3.26",
"files_count": 6
}
}
Expand Down
17 changes: 17 additions & 0 deletions scanpipe/tests/pipes/test_scancode.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from scanpipe.models import DiscoveredPackage
from scanpipe.models import Project
from scanpipe.pipes import collect_and_create_codebase_resources
from scanpipe.pipes import flag
from scanpipe.pipes import input
from scanpipe.pipes import scancode
from scanpipe.pipes.input import copy_input
Expand Down Expand Up @@ -445,6 +446,22 @@ def test_scanpipe_pipes_scancode_run_scan_args(self, mock_run_scan):
run_scan_kwargs = mock_run_scan.call_args.kwargs
self.assertEqual(expected_processes, run_scan_kwargs.get("processes"))

def test_scanpipe_max_file_size_works(self):
    """
    Run ``scan_codebase`` with ``SCANCODEIO_SCAN_MAX_FILE_SIZE`` overridden
    and check that the extracted ``trustier`` resource gets the
    ``IGNORED_BY_MAX_FILE_SIZE`` status.
    """
    archive_location = self.data / "d2d-rust" / "to-trustier-binary-linux.tar.gz"
    expected_path = "to-trustier-binary-linux.tar.gz-extract/trustier"

    with override_settings(SCANCODEIO_SCAN_MAX_FILE_SIZE=10000):
        project1 = Project.objects.create(name="Analysis")
        project1.copy_input_from(archive_location)

        pipeline = project1.add_pipeline("scan_codebase").make_pipeline_instance()
        exitcode, out = pipeline.execute()
        self.assertEqual(0, exitcode, msg=out)

        resource1 = project1.codebaseresources.get(path=expected_path)
        self.assertEqual(resource1.status, flag.IGNORED_BY_MAX_FILE_SIZE)

def test_scanpipe_pipes_scancode_make_results_summary(self, regen=FIXTURES_REGEN):
# Ensure the policies index is empty to avoid any side effect on results
scanpipe_app.license_policies_index = None
Expand Down
2 changes: 2 additions & 0 deletions scanpipe/tests/test_forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ def test_scanpipe_forms_project_settings_form_update_project_settings(self):
"ignored_vulnerabilities": None,
"policies": "",
"ignored_dependency_scopes": None,
"scan_max_file_size": None,
"product_name": "",
"product_version": "",
"attribution_template": "",
Expand Down Expand Up @@ -194,6 +195,7 @@ def test_scanpipe_forms_project_settings_form_ignored_dependency_scopes(self):
{"package_type": "npm", "scope": "devDependencies"},
{"package_type": "pypi", "scope": "tests"},
],
"scan_max_file_size": None,
"attribution_template": "",
"policies": "",
"product_name": "",
Expand Down
6 changes: 3 additions & 3 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -72,16 +72,16 @@ install_requires =
# Docker
container-inspector==33.0.0
# ScanCode-toolkit
scancode-toolkit[packages]==32.3.0
scancode-toolkit[packages]==32.3.2
extractcode[full]==31.0.0
commoncode==32.1.0
packageurl-python==0.16.0
# FetchCode
fetchcode-container==1.2.3.210512; sys_platform == "linux"
# Inspectors
elf-inspector==0.0.1
go-inspector==0.5.0; sys_platform == "linux"
rust-inspector==0.1.0; sys_platform == "linux"
go-inspector==0.5.0
rust-inspector==0.1.0
python-inspector==0.12.1
source-inspector==0.5.1; sys_platform != "darwin" and platform_machine != "arm64"
aboutcode-toolkit==11.0.0
Expand Down