Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ignore large data files and bump scancode-toolkit #1508

Merged
merged 13 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add a scancodeio setting SCANCODEIO_SCAN_MAX_FILE_SIZE
Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
  • Loading branch information
AyanSinhaMahapatra committed Jan 7, 2025
commit 89d3ee465f4c9b4a19cfa1d040a5c64bcd5f9602
12 changes: 12 additions & 0 deletions docs/application-settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,18 @@ The value unit is second and is defined as an integer::

Default: ``120`` (2 minutes)

SCANCODEIO_SCAN_MAX_FILE_SIZE
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Maximum file size allowed for a file to be scanned when scanning a codebase.

The value is expressed in bytes and is defined as an integer. For example, to
set the limit to 5 MB::

SCANCODEIO_SCAN_MAX_FILE_SIZE=5242880

Default: ``None`` (all files will be scanned)

.. _scancodeio_settings_pipelines_dirs:

SCANCODEIO_PIPELINES_DIRS
Expand Down
3 changes: 3 additions & 0 deletions scancodeio/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@
# Default to 2 minutes.
SCANCODEIO_SCAN_FILE_TIMEOUT = env.int("SCANCODEIO_SCAN_FILE_TIMEOUT", default=120)

# Maximum size, in bytes, allowed for a file to be scanned.
# Default to None which scans all files
SCANCODEIO_SCAN_MAX_FILE_SIZE = env.int("SCANCODEIO_SCAN_MAX_FILE_SIZE", default=None)

# List views pagination, controls the number of items displayed per page.
# Syntax in .env: SCANCODEIO_PAGINATE_BY=project=10,project_error=10
SCANCODEIO_PAGINATE_BY = env.dict(
Expand Down
18 changes: 17 additions & 1 deletion scanpipe/pipes/flag.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from django.db.models import Q

NO_STATUS = ""

SCANNED = "scanned"
Expand All @@ -43,7 +45,7 @@
IGNORED_DEFAULT_IGNORES = "ignored-default-ignores"
IGNORED_DATA_FILE_NO_CLUES = "ignored-data-file-no-clues"
IGNORED_DOC_FILE = "ignored-doc-file"
IGNORED_LARGE_DATA_FILE = "ignored-large-data-file"
IGNORED_BY_MAX_FILE_SIZE = "ignored-by-max-file-size"

COMPLIANCE_LICENSES = "compliance-licenses"
COMPLIANCE_SOURCEMIRROR = "compliance-sourcemirror"
Expand Down Expand Up @@ -103,6 +105,20 @@ def flag_ignored_patterns(project, patterns):
return update_count


def flag_and_ignore_files_over_max_size(resource_qs, file_size_limit):
    """
    Flag codebase resources which are over the max file size for scanning
    and return all other files within the file size limit.

    ``resource_qs`` is the queryset of codebase resources to consider and
    ``file_size_limit`` is the maximum size allowed for a resource, compared
    against its ``size`` field. When ``file_size_limit`` is falsy (``None`` or
    ``0``), nothing is flagged and ``resource_qs`` is returned unchanged.
    """
    if not file_size_limit:
        return resource_qs

    # Strictly greater than: a resource whose size equals the limit is still
    # within the maximum allowed size and must not be ignored.
    resource_qs.filter(size__gt=file_size_limit).update(
        status=IGNORED_BY_MAX_FILE_SIZE
    )
    # Return only the resources that are not flagged as over the limit.
    return resource_qs.filter(~Q(status=IGNORED_BY_MAX_FILE_SIZE))


def analyze_scanned_files(project):
"""Set the status for CodebaseResource to unknown or no license."""
scanned_files = project.codebaseresources.files().status(SCANNED)
Expand Down
22 changes: 7 additions & 15 deletions scanpipe/pipes/scancode.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
from django.apps import apps
from django.conf import settings
from django.db.models import ObjectDoesNotExist
from django.db.models import Q

from commoncode import fileutils
from commoncode.resource import VirtualCodebase
Expand All @@ -59,12 +58,6 @@
Utilities to deal with ScanCode toolkit features and objects.
"""

# skip large data files which are causing memory spikes
# See https://github.com/aboutcode-org/scancode-toolkit/issues/3711
# TODO: this has to be removed once the issue is fixed
SKIP_DATA_FILE_EXTENSIONS = [".json", ".xml", ".yaml", ".yml"]
SKIP_DATA_FILE_SIZE = 5242880


scanpipe_app = apps.get_app_config("scanpipe")

Expand Down Expand Up @@ -318,17 +311,16 @@ def scan_resources(
if not scan_func_kwargs:
scan_func_kwargs = {}

# Skip scanning data files larger than 5 MB
# Skip scanning files larger than the specified max size
if not scan_func == scan_for_package_data:
resource_qs.filter(
Q(extension__in=SKIP_DATA_FILE_EXTENSIONS)
& Q(size__gte=SKIP_DATA_FILE_SIZE)
).update(status=flag.IGNORED_LARGE_DATA_FILE)
scan_resource_qs = resource_qs.filter(~Q(status=flag.IGNORED_LARGE_DATA_FILE))
resource_qs = flag.flag_and_ignore_files_over_max_size(
resource_qs=resource_qs,
file_size_limit=settings.SCANCODEIO_SCAN_MAX_FILE_SIZE,
)

resource_count = scan_resource_qs.count()
resource_count = resource_qs.count()
logger.info(f"Scan {resource_count} codebase resources with {scan_func.__name__}")
resource_iterator = scan_resource_qs.iterator(chunk_size=2000)
resource_iterator = resource_qs.iterator(chunk_size=2000)
progress = LoopProgress(resource_count, logger=progress_logger)
max_workers = get_max_workers(keep_available=1)

Expand Down
19 changes: 19 additions & 0 deletions scanpipe/tests/pipes/test_scancode.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from scanpipe.models import DiscoveredPackage
from scanpipe.models import Project
from scanpipe.pipes import collect_and_create_codebase_resources
from scanpipe.pipes import flag
from scanpipe.pipes import input
from scanpipe.pipes import scancode
from scanpipe.pipes.input import copy_input
Expand Down Expand Up @@ -445,6 +446,24 @@ def test_scanpipe_pipes_scancode_run_scan_args(self, mock_run_scan):
run_scan_kwargs = mock_run_scan.call_args.kwargs
self.assertEqual(expected_processes, run_scan_kwargs.get("processes"))

def test_scanpipe_max_file_size_works(self):
    """
    Run the ``scan_codebase`` pipeline with SCANCODEIO_SCAN_MAX_FILE_SIZE
    set to 10000 and check that the extracted ``trustier`` resource ends up
    with the ``IGNORED_BY_MAX_FILE_SIZE`` status.
    """
    archive_location = self.data / "d2d-rust" / "to-trustier-binary-linux.tar.gz"
    expected_path = "to-trustier-binary-linux.tar.gz-extract/trustier"

    with override_settings(SCANCODEIO_SCAN_MAX_FILE_SIZE=10000):
        project = Project.objects.create(name="Analysis")
        project.copy_input_from(archive_location)

        pipeline_run = project.add_pipeline("scan_codebase")
        exitcode, out = pipeline_run.make_pipeline_instance().execute()
        self.assertEqual(0, exitcode, msg=out)

        flagged_resource = project.codebaseresources.get(path=expected_path)
        self.assertEqual(flagged_resource.status, flag.IGNORED_BY_MAX_FILE_SIZE)

def test_scanpipe_pipes_scancode_make_results_summary(self, regen=FIXTURES_REGEN):
# Ensure the policies index is empty to avoid any side effect on results
scanpipe_app.license_policies_index = None
Expand Down