Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ignore large data files and bump scancode-toolkit #1508

Merged
merged 13 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Ignore scanning large data files
Ignore scanning large data files which are larger than 1 MB
to avoid crashing scans on memory spikes.
Also rollback #1504

Reference: aboutcode-org/scancode-toolkit#3711
Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
  • Loading branch information
AyanSinhaMahapatra committed Jan 6, 2025
commit fc51df839c80dd1ede4d44e6ff4f9e511378e75d
1 change: 1 addition & 0 deletions scanpipe/pipes/flag.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
IGNORED_DEFAULT_IGNORES = "ignored-default-ignores"
IGNORED_DATA_FILE_NO_CLUES = "ignored-data-file-no-clues"
IGNORED_DOC_FILE = "ignored-doc-file"
IGNORED_LARGE_DATA_FILE = "ignored-large-data-file"

COMPLIANCE_LICENSES = "compliance-licenses"
COMPLIANCE_SOURCEMIRROR = "compliance-sourcemirror"
Expand Down
29 changes: 19 additions & 10 deletions scanpipe/pipes/scancode.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from django.apps import apps
from django.conf import settings
from django.db.models import ObjectDoesNotExist
from django.db.models import Q

from commoncode import fileutils
from commoncode.resource import VirtualCodebase
Expand All @@ -58,6 +59,13 @@
Utilities to deal with ScanCode toolkit features and objects.
"""

# Temporary workaround: large data files with the extensions listed below are
# skipped because scanning them causes memory spikes.
# See https://github.com/aboutcode-org/scancode-toolkit/issues/3711
# TODO: remove this workaround once the issue above is fixed.
SKIP_DATA_FILE_EXTENSIONS = [".json", ".xml", ".yaml", ".yml"]
SKIP_DATA_FILE_SIZE = 1024 * 1024  # 1 MB


# Django application config object registered under the "scanpipe" label.
scanpipe_app = apps.get_app_config("scanpipe")


Expand Down Expand Up @@ -310,9 +318,17 @@ def scan_resources(
if not scan_func_kwargs:
scan_func_kwargs = {}

resource_count = resource_qs.count()
# Skip scanning data files that are 1 MB or larger
if not scan_func == scan_for_package_data:
resource_qs.filter(
Q(extension__in=SKIP_DATA_FILE_EXTENSIONS)
& Q(size__gte=SKIP_DATA_FILE_SIZE)
).update(status=flag.IGNORED_LARGE_DATA_FILE)
scan_resource_qs = resource_qs.filter(~Q(status=flag.IGNORED_LARGE_DATA_FILE))

resource_count = scan_resource_qs.count()
logger.info(f"Scan {resource_count} codebase resources with {scan_func.__name__}")
resource_iterator = resource_qs.iterator(chunk_size=2000)
resource_iterator = scan_resource_qs.iterator(chunk_size=2000)
progress = LoopProgress(resource_count, logger=progress_logger)
max_workers = get_max_workers(keep_available=1)

Expand Down Expand Up @@ -350,14 +366,7 @@ def scan_resources(
"Please ensure that there is at least 2 GB of available memory per "
"CPU core for successful execution."
)

resource.project.add_error(
exception=broken_pool_error,
model="scan_resources",
description=message,
object_instance=resource,
)
continue
raise broken_pool_error from InsufficientResourcesError(message)

save_func(resource, scan_results, scan_errors)

Expand Down