Skip to content

Commit

Permalink
Add support for ScanCode.io results in the LoadInventory pipeline #609
Browse files Browse the repository at this point in the history
Signed-off-by: Thomas Druez <[email protected]>
  • Loading branch information
tdruez committed Mar 3, 2023
1 parent dd9cac3 commit 0be8ff9
Show file tree
Hide file tree
Showing 12 changed files with 125 additions and 43 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ Changelog
v33.0.0 (unreleased)
--------------------

- Add support for ScanCode.io results in the "load_inventory" pipeline.
https://github.com/nexB/scancode.io/issues/609

- Add support for CycloneDX 1.4 to the "inspect-manifest" pipeline to import SBOM into
a Project.
https://github.com/nexB/scancode.io/issues/583
Expand Down
16 changes: 10 additions & 6 deletions scanpipe/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1955,6 +1955,12 @@ def _regroup_numbered_lines(numbered_lines):
for line_number, lines_group in groupby(numbered_lines, key=itemgetter(0)):
yield line_number, "".join(line for _, line in lines_group)

def add_package(self, discovered_package):
    """
    Attach the given `discovered_package` to this `codebase_resource` instance
    by adding it to `self.discovered_packages`.
    """
    related_packages = self.discovered_packages
    related_packages.add(discovered_package)

def create_and_add_package(self, package_data):
"""
Create a DiscoveredPackage instance using the `package_data` and assign
Expand All @@ -1976,10 +1982,8 @@ def create_and_add_package(self, package_data):
**package_data,
},
)
return

if package:
self.discovered_packages.add(package)
else:
self.add_package(package)
return package

@property
Expand Down Expand Up @@ -2020,7 +2024,7 @@ def as_spdx(self):
return spdx.File(
spdx_id=self.spdx_id,
name=f"./{self.path}",
checksums=[spdx.Checkum(algorithm="sha1", value=self.sha1)],
checksums=[spdx.Checksum(algorithm="sha1", value=self.sha1)],
license_in_files=list(set(spdx_license_keys)),
copyright_text=", ".join(copyrights),
contributors=list(set(holders + authors)),
Expand Down Expand Up @@ -2347,7 +2351,7 @@ def as_spdx(self):
Return this DiscoveredPackage as an SPDX Package entry.
"""
checksums = [
spdx.Checkum(algorithm=algorithm, value=checksum_value)
spdx.Checksum(algorithm=algorithm, value=checksum_value)
for algorithm in ["sha1", "md5"]
if (checksum_value := getattr(self, algorithm))
]
Expand Down
28 changes: 19 additions & 9 deletions scanpipe/pipelines/load_inventory.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,18 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.

import json

from scanpipe.pipelines import Pipeline
from scanpipe.pipes import input
from scanpipe.pipes import scancode


class LoadInventory(Pipeline):
"""
A pipeline to load one or more inventory of files and packages from a ScanCode JSON
scan results. (Presumably containing resource information and package scan data).
A pipeline to load one or more inventory from ScanCode-toolkit and ScanCode.io
JSON scan results.
An inventory is composed of packages, dependencies, and resources.
"""

@classmethod
Expand All @@ -42,14 +46,20 @@ def get_scan_json_inputs(self):
Locate all the ScanCode JSON scan results from the project's input/ directory.
This includes all files with a .json extension.
"""
self.input_locations = [
str(scan_input.absolute())
for scan_input in self.project.inputs(pattern="*.json")
]
self.json_input_paths = self.project.inputs(pattern="*.json")

def build_inventory_from_scans(self):
"""
Process JSON scan results files to populate codebase resources and packages.
Process JSON scan results files to populate packages, dependencies, and
resources.
"""
for input_location in self.input_locations:
scancode.create_inventory_from_scan(self.project, input_location)
for input_path in self.json_input_paths:
scan_data = json.loads(input_path.read_text())
tool_name = input.get_tool_name_from_scan_headers(scan_data)

if tool_name == "scancode-toolkit":
scancode.load_inventory_from_scan(self.project, input_path)
elif tool_name == "scanpipe":
scancode.load_inventory_from_scanpipe(self.project, scan_data)
else:
raise Exception(f"Input not supported: {str(input_path)} ")
6 changes: 3 additions & 3 deletions scanpipe/pipelines/scan_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def steps(cls):
cls.collect_archive_information,
cls.extract_archive_to_codebase_directory,
cls.run_scancode,
cls.build_inventory_from_scan,
cls.load_inventory_from_scan,
cls.make_summary_from_scan_results,
)

Expand Down Expand Up @@ -112,11 +112,11 @@ def run_scancode(self):
if not scan_output_path.exists():
raise FileNotFoundError("ScanCode output not available.")

def build_inventory_from_scan(self):
def load_inventory_from_scan(self):
"""
Process a JSON Scan results file to populate codebase resources and packages.
"""
scancode.create_inventory_from_scan(self.project, self.scan_output_location)
scancode.load_inventory_from_scan(self.project, self.scan_output_location)

def make_summary_from_scan_results(self):
"""
Expand Down
22 changes: 21 additions & 1 deletion scanpipe/pipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,29 @@ def make_codebase_resource(project, location, **extra_fields):
codebase_resource.save(save_error=False)


def update_or_create_resource(project, resource_data):
    """
    Return the CodebaseResource of `project` stored at the "path" found in
    `resource_data`, creating it when it does not exist yet.

    The "path" and "for_packages" entries are popped from `resource_data` (the
    mapping is modified in place); the remaining entries are provided as the
    `defaults` of the `get_or_create` call.
    Each package_uid listed in "for_packages" is looked up in the `project`
    discovered packages and assigned to the resource.
    """
    # "path" is mandatory: a missing key raises a KeyError before any query.
    path = resource_data.pop("path")
    package_uids = resource_data.pop("for_packages", [])

    # NOTE(review): with Django's `get_or_create`, `defaults` presumably apply
    # only on creation, an existing resource being returned as-is -- confirm.
    resource, _created = CodebaseResource.objects.get_or_create(
        project=project,
        path=path,
        defaults=resource_data,
    )

    for uid in package_uids:
        resource.add_package(project.discoveredpackages.get(package_uid=uid))

    return resource


def update_or_create_package(project, package_data, codebase_resource=None):
"""
Get, update or create a DiscoveredPackage then returns it.
Get, update or create a DiscoveredPackage then return it.
Use the `project` and `package_data` mapping to look up and create the
DiscoveredPackage using its Package URL and package_uid as a unique key.
"""
Expand Down
10 changes: 10 additions & 0 deletions scanpipe/pipes/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,13 @@ def move_inputs(inputs, dest_path):
for input_location in inputs:
destination = dest_path / Path(input_location).name
shutil.move(input_location, destination)


def get_tool_name_from_scan_headers(scan_data):
    """
    Return the `tool_name` value of the first header in the provided `scan_data`.

    An empty string is returned when the first header has no `tool_name` entry,
    and None is returned when `scan_data` has no (or empty) "headers".
    """
    headers = scan_data.get("headers", [])
    if not headers:
        return None

    return headers[0].get("tool_name", "")
21 changes: 18 additions & 3 deletions scanpipe/pipes/scancode.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,7 +529,7 @@ def set_codebase_resource_for_package(codebase_resource, discovered_package):
Assign the `discovered_package` to the `codebase_resource` and set its
status to "application-package".
"""
codebase_resource.discovered_packages.add(discovered_package)
codebase_resource.add_package(discovered_package)
codebase_resource.status = "application-package"
codebase_resource.save()

Expand Down Expand Up @@ -607,9 +607,9 @@ def make_results_summary(project, scan_results_location):
return summary


def create_inventory_from_scan(project, input_location):
def load_inventory_from_scan(project, input_location):
"""
Create CodebaseResource and DiscoveredPackage instances loaded from the scan
Create packages, dependencies, and resources loaded from the ScanCode-toolkit scan
results located at `input_location`.
"""
scanned_codebase = get_virtual_codebase(project, input_location)
Expand All @@ -618,3 +618,18 @@ def create_inventory_from_scan(project, input_location):
create_discovered_dependencies(
project, scanned_codebase, strip_datafile_path_root=True
)


def load_inventory_from_scanpipe(project, scan_data):
    """
    Load the packages, resources, and dependencies of a ScanCode.io JSON output,
    provided as `scan_data`, into the `project`.

    The "packages", "files", and "dependencies" sections are processed in that
    order; a section missing from `scan_data` is handled as an empty list.
    """
    # Packages are loaded first: `update_or_create_resource` looks up the
    # packages referenced by each resource "for_packages" using their package_uid.
    for package_entry in scan_data.get("packages", []):
        pipes.update_or_create_package(project, package_entry)

    for file_entry in scan_data.get("files", []):
        pipes.update_or_create_resource(project, file_entry)

    for dependency_entry in scan_data.get("dependencies", []):
        pipes.update_or_create_dependencies(project, dependency_entry)
14 changes: 7 additions & 7 deletions scanpipe/spdx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@
version="3.3.5",
license_concluded="LicenseRef-1",
checksums=[
spdx.Checkum(algorithm="SHA1", value="10c72b88de4c5f3095ebe20b4d8afbedb32b8f"),
spdx.Checkum(algorithm="MD5", value="56770c1a2df6e0dc51c491f0a5b9d865"),
spdx.Checksum(algorithm="SHA1", value="10c72b88de4c5f3095ebe20b4d8afbedb32b8f"),
spdx.Checksum(algorithm="MD5", value="56770c1a2df6e0dc51c491f0a5b9d865"),
],
external_refs=[
spdx.ExternalRef(
Expand Down Expand Up @@ -199,7 +199,7 @@ def get_creators_dict(creators_data):


@dataclass
class Checkum:
class Checksum:
"""
The checksum provides a mechanism that can be used to verify that the contents of
a File or Package have not changed.
Expand Down Expand Up @@ -342,7 +342,7 @@ class Package:
comment: str = ""
license_comments: str = ""

checksums: List[Checkum] = field(default_factory=list)
checksums: List[Checksum] = field(default_factory=list)
external_refs: List[ExternalRef] = field(default_factory=list)
attribution_texts: List[str] = field(default_factory=list)

Expand Down Expand Up @@ -426,7 +426,7 @@ def from_data(cls, data):
license_comments=data.get("licenseComments"),
attribution_texts=data.get("attributionTexts"),
checksums=[
Checkum.from_data(checksum_data)
Checksum.from_data(checksum_data)
for checksum_data in data.get("checksums", [])
],
external_refs=[
Expand All @@ -444,7 +444,7 @@ class File:

spdx_id: str
name: str
checksums: List[Checkum] = field(default_factory=list)
checksums: List[Checksum] = field(default_factory=list)

license_concluded: str = "NOASSERTION"
copyright_text: str = "NOASSERTION"
Expand Down Expand Up @@ -490,7 +490,7 @@ def from_data(cls, data):
spdx_id=data.get("SPDXID"),
name=data.get("fileName"),
checksums=[
Checkum.from_data(checksum_data)
Checksum.from_data(checksum_data)
for checksum_data in data.get("checksums", [])
],
types=data.get("fileTypes"),
Expand Down
12 changes: 6 additions & 6 deletions scanpipe/spdx/spdx-schema-2.3.json
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@
"type" : "object",
"properties" : {
"algorithm" : {
"description" : "Identifies the algorithm used to produce the subject Checkum. Currently, SHA-1 is the only supported algorithm. It is anticipated that other algorithms will be supported at a later time.",
"description" : "Identifies the algorithm used to produce the subject Checksum. Currently, SHA-1 is the only supported algorithm. It is anticipated that other algorithms will be supported at a later time.",
"type" : "string",
"enum" : [ "SHA1", "BLAKE3", "SHA3-384", "SHA256", "SHA384", "BLAKE2b-512", "BLAKE2b-256", "SHA3-512", "MD2", "ADLER32", "MD4", "SHA3-256", "BLAKE2b-384", "SHA512", "MD6", "MD5", "SHA224" ]
},
Expand All @@ -92,7 +92,7 @@
},
"required" : [ "algorithm", "checksumValue" ],
"additionalProperties" : false,
"description" : "A Checkum is value that allows the contents of a file to be authenticated. Even small changes to the content of the file will change its checksum. This class allows the results of a variety of checksum and cryptographic message digest algorithms to be represented."
"description" : "A Checksum is value that allows the contents of a file to be authenticated. Even small changes to the content of the file will change its checksum. This class allows the results of a variety of checksum and cryptographic message digest algorithms to be represented."
},
"externalDocumentId" : {
"description" : "externalDocumentId is a string containing letters, numbers, ., - and/or + which uniquely identifies an external document within this document.",
Expand Down Expand Up @@ -281,7 +281,7 @@
"type" : "object",
"properties" : {
"algorithm" : {
"description" : "Identifies the algorithm used to produce the subject Checkum. Currently, SHA-1 is the only supported algorithm. It is anticipated that other algorithms will be supported at a later time.",
"description" : "Identifies the algorithm used to produce the subject Checksum. Currently, SHA-1 is the only supported algorithm. It is anticipated that other algorithms will be supported at a later time.",
"type" : "string",
"enum" : [ "SHA1", "BLAKE3", "SHA3-384", "SHA256", "SHA384", "BLAKE2b-512", "BLAKE2b-256", "SHA3-512", "MD2", "ADLER32", "MD4", "SHA3-256", "BLAKE2b-384", "SHA512", "MD6", "MD5", "SHA224" ]
},
Expand All @@ -292,7 +292,7 @@
},
"required" : [ "algorithm", "checksumValue" ],
"additionalProperties" : false,
"description" : "A Checkum is value that allows the contents of a file to be authenticated. Even small changes to the content of the file will change its checksum. This class allows the results of a variety of checksum and cryptographic message digest algorithms to be represented."
"description" : "A Checksum is value that allows the contents of a file to be authenticated. Even small changes to the content of the file will change its checksum. This class allows the results of a variety of checksum and cryptographic message digest algorithms to be represented."
}
},
"comment" : {
Expand Down Expand Up @@ -500,7 +500,7 @@
"type" : "object",
"properties" : {
"algorithm" : {
"description" : "Identifies the algorithm used to produce the subject Checkum. Currently, SHA-1 is the only supported algorithm. It is anticipated that other algorithms will be supported at a later time.",
"description" : "Identifies the algorithm used to produce the subject Checksum. Currently, SHA-1 is the only supported algorithm. It is anticipated that other algorithms will be supported at a later time.",
"type" : "string",
"enum" : [ "SHA1", "BLAKE3", "SHA3-384", "SHA256", "SHA384", "BLAKE2b-512", "BLAKE2b-256", "SHA3-512", "MD2", "ADLER32", "MD4", "SHA3-256", "BLAKE2b-384", "SHA512", "MD6", "MD5", "SHA224" ]
},
Expand All @@ -511,7 +511,7 @@
},
"required" : [ "algorithm", "checksumValue" ],
"additionalProperties" : false,
"description" : "A Checkum is value that allows the contents of a file to be authenticated. Even small changes to the content of the file will change its checksum. This class allows the results of a variety of checksum and cryptographic message digest algorithms to be represented."
"description" : "A Checksum is value that allows the contents of a file to be authenticated. Even small changes to the content of the file will change its checksum. This class allows the results of a variety of checksum and cryptographic message digest algorithms to be represented."
}
},
"comment" : {
Expand Down
12 changes: 6 additions & 6 deletions scanpipe/spdx/test_spdx.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,8 @@ def setUp(self):
"license_concluded": "LicenseRef-1",
"release_date": "2000-01-01",
"checksums": [
spdx.Checkum(**self.checksum_sha1_data),
spdx.Checkum(**self.checksum_md5_data),
spdx.Checksum(**self.checksum_sha1_data),
spdx.Checksum(**self.checksum_md5_data),
],
"external_refs": [spdx.ExternalRef(**self.external_ref_purl_data)],
}
Expand Down Expand Up @@ -135,7 +135,7 @@ def setUp(self):
"name": "file.txt",
"license_concluded": "LicenseRef-1",
"checksums": [
spdx.Checkum(**self.checksum_sha1_data),
spdx.Checksum(**self.checksum_sha1_data),
],
"types": ["TEXT"],
"comment": "comment",
Expand Down Expand Up @@ -282,12 +282,12 @@ def test_spdx_creation_info_missing_data(self):
assert "Missing values to build `creators` list." == str(error.exception)

def test_spdx_checksum_as_dict(self):
checksum = spdx.Checkum(**self.checksum_sha1_data)
checksum = spdx.Checksum(**self.checksum_sha1_data)
assert self.checksum_sha1_spdx_data == checksum.as_dict()

def test_spdx_checksum_from_data(self):
assert spdx.Checkum.from_data({})
checksum = spdx.Checkum.from_data(self.checksum_sha1_spdx_data)
assert spdx.Checksum.from_data({})
checksum = spdx.Checksum.from_data(self.checksum_sha1_spdx_data)
assert self.checksum_sha1_spdx_data == checksum.as_dict()

def test_spdx_external_ref_as_dict(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
{
"pipeline_name": "load_inventory",
"status": "not_started",
"description": "A pipeline to load one or more inventory of files and packages from a ScanCode JSON\nscan results. (Presumably containing resource information and package scan data).",
"description": "A pipeline to load one or more inventory from ScanCode-toolkit and ScanCode.io\nJSON scan results.\nAn inventory is composed of packages, dependencies, and resources.",
"scancodeio_version": "",
"task_id": null,
"task_start_date": null,
Expand Down
Loading

0 comments on commit 0be8ff9

Please sign in to comment.