From 0be8ff938d3dd6a2317e267cf2d25500f66c02ca Mon Sep 17 00:00:00 2001 From: Thomas Druez Date: Fri, 3 Mar 2023 18:34:53 +0100 Subject: [PATCH] Add support for ScanCode.io results in the LoadInventory pipeline #609 Signed-off-by: Thomas Druez --- CHANGELOG.rst | 3 ++ scanpipe/models.py | 16 +++++++---- scanpipe/pipelines/load_inventory.py | 28 +++++++++++++------ scanpipe/pipelines/scan_package.py | 6 ++-- scanpipe/pipes/__init__.py | 22 ++++++++++++++- scanpipe/pipes/input.py | 10 +++++++ scanpipe/pipes/scancode.py | 21 ++++++++++++-- scanpipe/spdx/__init__.py | 14 +++++----- scanpipe/spdx/spdx-schema-2.3.json | 12 ++++---- scanpipe/spdx/test_spdx.py | 12 ++++---- ...asgiref-3.3.0_load_inventory_expected.json | 2 +- scanpipe/tests/test_pipes.py | 22 ++++++++++++++- 12 files changed, 125 insertions(+), 43 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 34b735c30..0ebedbc72 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,6 +4,9 @@ Changelog v33.0.0 (unreleased) -------------------- +- Add support for ScanCode.io results in the "load_inventory" pipeline. + https://github.com/nexB/scancode.io/issues/609 + - Add support for CycloneDX 1.4 to the "inspect-manifest" pipeline to import SBOM into a Project. https://github.com/nexB/scancode.io/issues/583 diff --git a/scanpipe/models.py b/scanpipe/models.py index 302c89057..060bd5087 100644 --- a/scanpipe/models.py +++ b/scanpipe/models.py @@ -1955,6 +1955,12 @@ def _regroup_numbered_lines(numbered_lines): for line_number, lines_group in groupby(numbered_lines, key=itemgetter(0)): yield line_number, "".join(line for _, line in lines_group) + def add_package(self, discovered_package): + """ + Assign the `discovered_package` to this `codebase_resource` instance. + """ + self.discovered_packages.add(discovered_package) + def create_and_add_package(self, package_data): """ Create a DiscoveredPackage instance using the `package_data` and assigns @@ -1976,10 +1982,8 @@ def create_and_add_package(self, package_data): **package_data, }, ) - return - - if package: - self.discovered_packages.add(package) + else: + self.add_package(package) return package @property @@ -2020,7 +2024,7 @@ def as_spdx(self): return spdx.File( spdx_id=self.spdx_id, name=f"./{self.path}", - checksums=[spdx.Checkum(algorithm="sha1", value=self.sha1)], + checksums=[spdx.Checksum(algorithm="sha1", value=self.sha1)], license_in_files=list(set(spdx_license_keys)), copyright_text=", ".join(copyrights), contributors=list(set(holders + authors)), @@ -2347,7 +2351,7 @@ def as_spdx(self): Return this DiscoveredPackage as an SPDX Package entry. """ checksums = [ - spdx.Checkum(algorithm=algorithm, value=checksum_value) + spdx.Checksum(algorithm=algorithm, value=checksum_value) for algorithm in ["sha1", "md5"] if (checksum_value := getattr(self, algorithm)) ] diff --git a/scanpipe/pipelines/load_inventory.py b/scanpipe/pipelines/load_inventory.py index a0d841062..af5e110dd 100644 --- a/scanpipe/pipelines/load_inventory.py +++ b/scanpipe/pipelines/load_inventory.py @@ -20,14 +20,18 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/nexB/scancode.io for support and download. +import json + from scanpipe.pipelines import Pipeline +from scanpipe.pipes import input from scanpipe.pipes import scancode class LoadInventory(Pipeline): """ - A pipeline to load one or more inventory of files and packages from a ScanCode JSON - scan results. (Presumably containing resource information and package scan data). + A pipeline to load one or more inventory from ScanCode-toolkit and ScanCode.io + JSON scan results. + An inventory is composed of packages, dependencies, and resources. """ @classmethod @@ -42,14 +46,20 @@ def get_scan_json_inputs(self): Locate all the ScanCode JSON scan results from the project's input/ directory. This includes all files with a .json extension. """ - self.input_locations = [ - str(scan_input.absolute()) - for scan_input in self.project.inputs(pattern="*.json") - ] + self.json_input_paths = self.project.inputs(pattern="*.json") def build_inventory_from_scans(self): """ - Process JSON scan results files to populate codebase resources and packages. + Process JSON scan results files to populate packages, dependencies, and + resources. """ - for input_location in self.input_locations: - scancode.create_inventory_from_scan(self.project, input_location) + for input_path in self.json_input_paths: + scan_data = json.loads(input_path.read_text()) + tool_name = input.get_tool_name_from_scan_headers(scan_data) + + if tool_name == "scancode-toolkit": + scancode.load_inventory_from_scan(self.project, input_path) + elif tool_name == "scanpipe": + scancode.load_inventory_from_scanpipe(self.project, scan_data) + else: + raise Exception(f"Input not supported: {str(input_path)} ") diff --git a/scanpipe/pipelines/scan_package.py b/scanpipe/pipelines/scan_package.py index 56e822c29..834c59aa7 100644 --- a/scanpipe/pipelines/scan_package.py +++ b/scanpipe/pipelines/scan_package.py @@ -44,7 +44,7 @@ def steps(cls): cls.collect_archive_information, cls.extract_archive_to_codebase_directory, cls.run_scancode, - cls.build_inventory_from_scan, + cls.load_inventory_from_scan, cls.make_summary_from_scan_results, ) @@ -112,11 +112,11 @@ def run_scancode(self): if not scan_output_path.exists(): raise FileNotFoundError("ScanCode output not available.") - def build_inventory_from_scan(self): + def load_inventory_from_scan(self): """ Process a JSON Scan results file to populate codebase resources and packages. """ - scancode.create_inventory_from_scan(self.project, self.scan_output_location) + scancode.load_inventory_from_scan(self.project, self.scan_output_location) def make_summary_from_scan_results(self): """ diff --git a/scanpipe/pipes/__init__.py b/scanpipe/pipes/__init__.py index e77723b64..933ab7604 100644 --- a/scanpipe/pipes/__init__.py +++ b/scanpipe/pipes/__init__.py @@ -77,9 +77,29 @@ def make_codebase_resource(project, location, **extra_fields): codebase_resource.save(save_error=False) +def update_or_create_resource(project, resource_data): + """ + Get, update or create a CodebaseResource then return it. + """ + resource_path = resource_data.pop("path") + for_packages = resource_data.pop("for_packages", []) + + codebase_resource, _ = CodebaseResource.objects.get_or_create( + project=project, + path=resource_path, + defaults=resource_data, + ) + + for package_uid in for_packages: + package = project.discoveredpackages.get(package_uid=package_uid) + codebase_resource.add_package(package) + + return codebase_resource + + def update_or_create_package(project, package_data, codebase_resource=None): """ - Get, update or create a DiscoveredPackage then returns it. + Get, update or create a DiscoveredPackage then return it. Use the `project` and `package_data` mapping to lookup and creates the DiscoveredPackage using its Package URL and package_uid as a unique key. """ diff --git a/scanpipe/pipes/input.py b/scanpipe/pipes/input.py index 87aa6504f..23de69199 100644 --- a/scanpipe/pipes/input.py +++ b/scanpipe/pipes/input.py @@ -47,3 +47,13 @@ def move_inputs(inputs, dest_path): for input_location in inputs: destination = dest_path / Path(input_location).name shutil.move(input_location, destination) + + +def get_tool_name_from_scan_headers(scan_data): + """ + Return the `tool_name` value of the first header in the provided `scan_data`. + """ + if headers := scan_data.get("headers", []): + first_header = headers[0] + tool_name = first_header.get("tool_name", "") + return tool_name diff --git a/scanpipe/pipes/scancode.py b/scanpipe/pipes/scancode.py index c39c8381c..3783bcec8 100644 --- a/scanpipe/pipes/scancode.py +++ b/scanpipe/pipes/scancode.py @@ -529,7 +529,7 @@ def set_codebase_resource_for_package(codebase_resource, discovered_package): Assign the `discovered_package` to the `codebase_resource` and set its status to "application-package". """ - codebase_resource.discovered_packages.add(discovered_package) + codebase_resource.add_package(discovered_package) codebase_resource.status = "application-package" codebase_resource.save() @@ -607,9 +607,9 @@ def make_results_summary(project, scan_results_location): return summary -def create_inventory_from_scan(project, input_location): +def load_inventory_from_scan(project, input_location): """ - Create CodebaseResource and DiscoveredPackage instances loaded from the scan + Create packages, dependencies, and resources loaded from the ScanCode-toolkit scan results located at `input_location`. """ scanned_codebase = get_virtual_codebase(project, input_location) @@ -618,3 +618,18 @@ def create_inventory_from_scan(project, input_location): create_discovered_dependencies( project, scanned_codebase, strip_datafile_path_root=True ) + + +def load_inventory_from_scanpipe(project, scan_data): + """ + Create packages, dependencies, and resources loaded from a ScanCode.io JSON output + provided as `scan_data`. + """ + for package_data in scan_data.get("packages", []): + pipes.update_or_create_package(project, package_data) + + for resource_data in scan_data.get("files", []): + pipes.update_or_create_resource(project, resource_data) + + for dependency_data in scan_data.get("dependencies", []): + pipes.update_or_create_dependencies(project, dependency_data) diff --git a/scanpipe/spdx/__init__.py b/scanpipe/spdx/__init__.py index ac807d341..989869098 100644 --- a/scanpipe/spdx/__init__.py +++ b/scanpipe/spdx/__init__.py @@ -58,8 +58,8 @@ version="3.3.5", license_concluded="LicenseRef-1", checksums=[ - spdx.Checkum(algorithm="SHA1", value="10c72b88de4c5f3095ebe20b4d8afbedb32b8f"), - spdx.Checkum(algorithm="MD5", value="56770c1a2df6e0dc51c491f0a5b9d865"), + spdx.Checksum(algorithm="SHA1", value="10c72b88de4c5f3095ebe20b4d8afbedb32b8f"), + spdx.Checksum(algorithm="MD5", value="56770c1a2df6e0dc51c491f0a5b9d865"), ], external_refs=[ spdx.ExternalRef( @@ -199,7 +199,7 @@ def get_creators_dict(creators_data): @dataclass -class Checkum: +class Checksum: """ The checksum provides a mechanism that can be used to verify that the contents of a File or Package have not changed. @@ -342,7 +342,7 @@ class Package: comment: str = "" license_comments: str = "" - checksums: List[Checkum] = field(default_factory=list) + checksums: List[Checksum] = field(default_factory=list) external_refs: List[ExternalRef] = field(default_factory=list) attribution_texts: List[str] = field(default_factory=list) @@ -426,7 +426,7 @@ def from_data(cls, data): license_comments=data.get("licenseComments"), attribution_texts=data.get("attributionTexts"), checksums=[ - Checkum.from_data(checksum_data) + Checksum.from_data(checksum_data) for checksum_data in data.get("checksums", []) ], external_refs=[ @@ -444,7 +444,7 @@ class File: spdx_id: str name: str - checksums: List[Checkum] = field(default_factory=list) + checksums: List[Checksum] = field(default_factory=list) license_concluded: str = "NOASSERTION" copyright_text: str = "NOASSERTION" @@ -490,7 +490,7 @@ def from_data(cls, data): spdx_id=data.get("SPDXID"), name=data.get("fileName"), checksums=[ - Checkum.from_data(checksum_data) + Checksum.from_data(checksum_data) for checksum_data in data.get("checksums", []) ], types=data.get("fileTypes"), diff --git a/scanpipe/spdx/spdx-schema-2.3.json b/scanpipe/spdx/spdx-schema-2.3.json index b1da40a57..ee61e6686 100644 --- a/scanpipe/spdx/spdx-schema-2.3.json +++ b/scanpipe/spdx/spdx-schema-2.3.json @@ -81,7 +81,7 @@ "type" : "object", "properties" : { "algorithm" : { - "description" : "Identifies the algorithm used to produce the subject Checkum. Currently, SHA-1 is the only supported algorithm. It is anticipated that other algorithms will be supported at a later time.", + "description" : "Identifies the algorithm used to produce the subject Checksum. Currently, SHA-1 is the only supported algorithm. It is anticipated that other algorithms will be supported at a later time.", "type" : "string", "enum" : [ "SHA1", "BLAKE3", "SHA3-384", "SHA256", "SHA384", "BLAKE2b-512", "BLAKE2b-256", "SHA3-512", "MD2", "ADLER32", "MD4", "SHA3-256", "BLAKE2b-384", "SHA512", "MD6", "MD5", "SHA224" ] }, @@ -92,7 +92,7 @@ }, "required" : [ "algorithm", "checksumValue" ], "additionalProperties" : false, - "description" : "A Checkum is value that allows the contents of a file to be authenticated. Even small changes to the content of the file will change its checksum. This class allows the results of a variety of checksum and cryptographic message digest algorithms to be represented." + "description" : "A Checksum is value that allows the contents of a file to be authenticated. Even small changes to the content of the file will change its checksum. This class allows the results of a variety of checksum and cryptographic message digest algorithms to be represented." }, "externalDocumentId" : { "description" : "externalDocumentId is a string containing letters, numbers, ., - and/or + which uniquely identifies an external document within this document.", @@ -281,7 +281,7 @@ "type" : "object", "properties" : { "algorithm" : { - "description" : "Identifies the algorithm used to produce the subject Checkum. Currently, SHA-1 is the only supported algorithm. It is anticipated that other algorithms will be supported at a later time.", + "description" : "Identifies the algorithm used to produce the subject Checksum. Currently, SHA-1 is the only supported algorithm. It is anticipated that other algorithms will be supported at a later time.", "type" : "string", "enum" : [ "SHA1", "BLAKE3", "SHA3-384", "SHA256", "SHA384", "BLAKE2b-512", "BLAKE2b-256", "SHA3-512", "MD2", "ADLER32", "MD4", "SHA3-256", "BLAKE2b-384", "SHA512", "MD6", "MD5", "SHA224" ] }, @@ -292,7 +292,7 @@ }, "required" : [ "algorithm", "checksumValue" ], "additionalProperties" : false, - "description" : "A Checkum is value that allows the contents of a file to be authenticated. Even small changes to the content of the file will change its checksum. This class allows the results of a variety of checksum and cryptographic message digest algorithms to be represented." + "description" : "A Checksum is value that allows the contents of a file to be authenticated. Even small changes to the content of the file will change its checksum. This class allows the results of a variety of checksum and cryptographic message digest algorithms to be represented." } }, "comment" : { @@ -500,7 +500,7 @@ "type" : "object", "properties" : { "algorithm" : { - "description" : "Identifies the algorithm used to produce the subject Checkum. Currently, SHA-1 is the only supported algorithm. It is anticipated that other algorithms will be supported at a later time.", + "description" : "Identifies the algorithm used to produce the subject Checksum. Currently, SHA-1 is the only supported algorithm. It is anticipated that other algorithms will be supported at a later time.", "type" : "string", "enum" : [ "SHA1", "BLAKE3", "SHA3-384", "SHA256", "SHA384", "BLAKE2b-512", "BLAKE2b-256", "SHA3-512", "MD2", "ADLER32", "MD4", "SHA3-256", "BLAKE2b-384", "SHA512", "MD6", "MD5", "SHA224" ] }, @@ -511,7 +511,7 @@ }, "required" : [ "algorithm", "checksumValue" ], "additionalProperties" : false, - "description" : "A Checkum is value that allows the contents of a file to be authenticated. Even small changes to the content of the file will change its checksum. This class allows the results of a variety of checksum and cryptographic message digest algorithms to be represented." + "description" : "A Checksum is value that allows the contents of a file to be authenticated. Even small changes to the content of the file will change its checksum. This class allows the results of a variety of checksum and cryptographic message digest algorithms to be represented." } }, "comment" : { diff --git a/scanpipe/spdx/test_spdx.py b/scanpipe/spdx/test_spdx.py index 5da6c4755..b8312c092 100644 --- a/scanpipe/spdx/test_spdx.py +++ b/scanpipe/spdx/test_spdx.py @@ -97,8 +97,8 @@ def setUp(self): "license_concluded": "LicenseRef-1", "release_date": "2000-01-01", "checksums": [ - spdx.Checkum(**self.checksum_sha1_data), - spdx.Checkum(**self.checksum_md5_data), + spdx.Checksum(**self.checksum_sha1_data), + spdx.Checksum(**self.checksum_md5_data), ], "external_refs": [spdx.ExternalRef(**self.external_ref_purl_data)], } @@ -135,7 +135,7 @@ def setUp(self): "name": "file.txt", "license_concluded": "LicenseRef-1", "checksums": [ - spdx.Checkum(**self.checksum_sha1_data), + spdx.Checksum(**self.checksum_sha1_data), ], "types": ["TEXT"], "comment": "comment", @@ -282,12 +282,12 @@ def test_spdx_creation_info_missing_data(self): assert "Missing values to build `creators` list." == str(error.exception) def test_spdx_checksum_as_dict(self): - checksum = spdx.Checkum(**self.checksum_sha1_data) + checksum = spdx.Checksum(**self.checksum_sha1_data) assert self.checksum_sha1_spdx_data == checksum.as_dict() def test_spdx_checksum_from_data(self): - assert spdx.Checkum.from_data({}) - checksum = spdx.Checkum.from_data(self.checksum_sha1_spdx_data) + assert spdx.Checksum.from_data({}) + checksum = spdx.Checksum.from_data(self.checksum_sha1_spdx_data) assert self.checksum_sha1_spdx_data == checksum.as_dict() def test_spdx_external_ref_as_dict(self): diff --git a/scanpipe/tests/data/asgiref-3.3.0_load_inventory_expected.json b/scanpipe/tests/data/asgiref-3.3.0_load_inventory_expected.json index b7d201ebf..07a40fef0 100644 --- a/scanpipe/tests/data/asgiref-3.3.0_load_inventory_expected.json +++ b/scanpipe/tests/data/asgiref-3.3.0_load_inventory_expected.json @@ -8,7 +8,7 @@ { "pipeline_name": "load_inventory", "status": "not_started", - "description": "A pipeline to load one or more inventory of files and packages from a ScanCode JSON\nscan results. (Presumably containing resource information and package scan data).", + "description": "A pipeline to load one or more inventory from ScanCode-toolkit and ScanCode.io\nJSON scan results.\nAn inventory is composed of packages, dependencies, and resources.", "scancodeio_version": "", "task_id": null, "task_start_date": null, diff --git a/scanpipe/tests/test_pipes.py b/scanpipe/tests/test_pipes.py index f511a2d66..d874435c0 100644 --- a/scanpipe/tests/test_pipes.py +++ b/scanpipe/tests/test_pipes.py @@ -44,6 +44,7 @@ from scanpipe.pipes import codebase from scanpipe.pipes import fetch from scanpipe.pipes import filename_now +from scanpipe.pipes import input from scanpipe.pipes import make_codebase_resource from scanpipe.pipes import resolve from scanpipe.pipes import rootfs @@ -98,6 +99,25 @@ def test_scanpipe_pipes_tag_not_analyzed_codebase_resources(self): def test_scanpipe_pipes_filename_now(self): self.assertEqual("2010-10-10-10-10-10", filename_now()) + def test_scanpipe_pipes_input_get_tool_name_from_scan_headers(self): + tool_name = input.get_tool_name_from_scan_headers(scan_data={}) + self.assertIsNone(tool_name) + + tool_name = input.get_tool_name_from_scan_headers(scan_data={"headers": []}) + self.assertIsNone(tool_name) + + input_location = self.data_location / "asgiref-3.3.0_scan.json" + tool_name = input.get_tool_name_from_scan_headers( + scan_data=json.loads(input_location.read_text()) + ) + self.assertEqual("scanpipe", tool_name) + + input_location = self.data_location / "asgiref-3.3.0_scancode_scan.json" + tool_name = input.get_tool_name_from_scan_headers( + scan_data=json.loads(input_location.read_text()) + ) + self.assertEqual("scancode-toolkit", tool_name) + def test_scanpipe_pipes_scancode_extract_archive(self): target = tempfile.mkdtemp() input_location = str(self.data_location / "archive.zip") @@ -485,7 +505,7 @@ def test_scanpipe_pipes_scancode_make_results_summary(self): def test_scanpipe_pipes_scancode_assemble_packages(self): project = Project.objects.create(name="Analysis") project_scan_location = self.data_location / "package_assembly_codebase.json" - scancode.create_inventory_from_scan(project, project_scan_location) + scancode.load_inventory_from_scan(project, project_scan_location) self.assertEqual(0, project.discoveredpackages.count()) scancode.assemble_packages(project)