Skip to content

Commit

Permalink
Modify pipelines to get purls from package_data (#904)
Browse files Browse the repository at this point in the history
* Modify pipelines to get purls from package_data

Signed-off-by: Ayan Sinha Mahapatra <[email protected]>

* Only get purls from package data if no packages

Signed-off-by: Ayan Sinha Mahapatra <[email protected]>

* Add docstrings and tests

Signed-off-by: Ayan Sinha Mahapatra <[email protected]>

* Only submit unique purls

Signed-off-by: Ayan Sinha Mahapatra <[email protected]>

* Add docstrings and comments from feedback

Signed-off-by: Ayan Sinha Mahapatra <[email protected]>

* Update test expectations

Update the test expectations, which changed after modifying the
populate_purldb pipeline to only send unique purls.

Signed-off-by: Ayan Sinha Mahapatra <[email protected]>

* Address review comments

Signed-off-by: Ayan Sinha Mahapatra <[email protected]>

---------

Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
  • Loading branch information
AyanSinhaMahapatra authored Sep 11, 2023
1 parent b50763a commit ade2953
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 11 deletions.
23 changes: 22 additions & 1 deletion scanpipe/pipelines/populate_purldb.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

from scanpipe.pipelines import Pipeline
from scanpipe.pipes import purldb
from scanpipe.pipes import scancode


class PopulatePurlDB(Pipeline):
Expand All @@ -32,6 +33,7 @@ def steps(cls):
return (
cls.populate_purldb_with_discovered_packages,
cls.populate_purldb_with_discovered_dependencies,
cls.populate_purldb_with_detected_purls,
)

def populate_purldb_with_discovered_packages(self):
Expand All @@ -48,12 +50,31 @@ def populate_purldb_with_discovered_dependencies(self):
package_type="DiscoveredDependency",
)

def populate_purldb_with_detected_purls(self):
    """
    Feed PurlDB with the purls found in resource-level package data.

    This step only does work when the project has neither discovered
    packages nor discovered dependencies; otherwise it returns without
    submitting anything.
    """
    # `or` short-circuits: the dependency query is skipped as soon as a
    # package is known to exist.
    has_packages_or_dependencies = (
        self.project.discoveredpackages.exists()
        or self.project.discovereddependencies.exists()
    )
    if has_packages_or_dependencies:
        return

    # Even when there are no packages/dependencies, resource level
    # package data could be detected (i.e. when we detect packages,
    # but skip the assembly step that creates
    # package/dependency instances)
    packages = scancode.get_packages_with_purl_from_resources(self.project)
    self.feed_purldb(
        packages=list(packages),
        package_type="DiscoveredPackage",
    )

def feed_purldb(self, packages, package_type):
"""Feed PurlDB with list of PURLs for indexing."""
if not purldb.is_available():
raise Exception("PurlDB is not available.")

package_urls = [package.purl for package in packages]
package_urls = list(set([package.purl for package in packages]))
self.log(f"Populating PurlDB with {len(package_urls):,d} {package_type}")

response = purldb.submit_purls(purls=package_urls)
Expand Down
14 changes: 13 additions & 1 deletion scanpipe/pipelines/scan_codebase_packages.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,15 @@
# Visit https://github.com/nexB/scancode.io for support and download.

from scanpipe.pipelines.scan_codebase import ScanCodebase
from scanpipe.pipes import scancode


class ScanCodebasePackages(ScanCodebase):
"""Scan a codebase for packages only."""
"""
Scan a codebase for package data only for the purpose of getting
all purls (without creating package/dependency instances by
package assembly).
"""

@classmethod
def steps(cls):
Expand All @@ -36,3 +41,10 @@ def steps(cls):
cls.flag_ignored_resources,
cls.scan_for_application_packages,
)

def scan_for_application_packages(self):
    """Detect package data on the project resources, without assembly."""
    # Assembly is deliberately disabled: this pipeline only records the
    # package_data detected on each resource, so that all the purls of a
    # codebase can be collected, and creates no Package/Dependency instances.
    project = self.project
    scancode.scan_for_application_packages(project, assemble=False)
34 changes: 29 additions & 5 deletions scanpipe/pipes/scancode.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,11 +343,16 @@ def scan_for_files(project, resource_qs=None, progress_logger=None):
)


def scan_for_application_packages(project, progress_logger=None):
def scan_for_application_packages(project, assemble=True, progress_logger=None):
"""
Run a package scan on files without a status for a `project`,
then create DiscoveredPackage and DiscoveredDependency instances
from the detected package data
Run a package scan on resources without a status for a `project`,
and add them in their respective `package_data` attribute.
Then create DiscoveredPackage and DiscoveredDependency instances
from the detected package data optionally. If the `assemble` argument
is set to `True`, DiscoveredPackage and DiscoveredDependency instances
are created and added to the project by assembling resource level
package_data, and resources which belong in the DiscoveredPackage
instance, are assigned to that package.
Multiprocessing is enabled by default on this pipe, the number of processes can be
controlled through the SCANCODEIO_PROCESSES setting.
Expand All @@ -365,7 +370,8 @@ def scan_for_application_packages(project, progress_logger=None):

# Iterate through CodebaseResources with Package data and handle them using
# the proper Package handler from packagedcode.
assemble_packages(project=project)
if assemble:
assemble_packages(project=project)


def add_resource_to_package(package_uid, resource, project):
Expand Down Expand Up @@ -436,6 +442,24 @@ def assemble_packages(project):
logger.info(f"Unknown Package assembly item type: {item!r}")


def get_packages_with_purl_from_resources(project):
    """
    Yield Dependency and PackageData objects built from the package_data
    detected in the project resources.

    For each package_data mapping, one Dependency object is yielded per entry
    of its "dependencies" list, followed by one PackageData object for the
    mapping itself. Callers read the `purl` attribute of the yielded objects.
    """
    for resource in project.codebaseresources.has_package_data():
        for package_mapping in resource.package_data:
            # The "dependencies" key may be missing or null: fall back to an
            # empty list instead of failing on iterating over None.
            dependencies = package_mapping.get("dependencies") or []
            datasource_id = package_mapping.get("datasource_id")

            for dependency in dependencies:
                yield packagedcode_models.Dependency.from_dependent_package(
                    dependent_package=dependency,
                    datafile_path=resource.path,
                    datasource_id=datasource_id,
                    package_uid=None,
                )
            yield packagedcode_models.PackageData.from_dict(mapping=package_mapping)


def get_pretty_params(args):
"""Format provided ``args`` for the ``pretty_params`` run_scan argument."""
return {f"--{key.replace('_', '-')}": value for key, value in args.items()}
Expand Down
17 changes: 17 additions & 0 deletions scanpipe/tests/pipes/test_scancode.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,23 @@ def test_scanpipe_pipes_scancode_virtual_codebase(self):
self.assertEqual(1, DiscoveredPackage.objects.count())
self.assertEqual(1, DiscoveredDependency.objects.count())

def test_scanpipe_pipes_scancode_get_packages_with_purl_from_resources(self):
    """Check that purls are still obtained from resources without packages."""
    project = Project.objects.create(name="Analysis")
    filename = "package_assembly_codebase.json"
    project_scan_location = self.data_location / "scancode" / filename
    input.load_inventory_from_toolkit_scan(project, project_scan_location)

    # Drop the loaded packages so that only the resource-level package_data
    # is left as a source of purls.
    project.discoveredpackages.all().delete()
    self.assertEqual(0, project.discoveredpackages.count())

    packages = list(scancode.get_packages_with_purl_from_resources(project))

    # Only the yielded objects that carry a purl are of interest here.
    package_purls = [package.purl for package in packages if package.purl]
    self.assertEqual(1, len(package_purls))
    # assertIn reports the list content on failure, unlike assertTrue(x in y).
    self.assertIn("pkg:npm/[email protected]", package_purls)

def test_scanpipe_pipes_scancode_run_scancode(self):
project = Project.objects.create(name="name with space")
output = scancode.run_scan(
Expand Down
26 changes: 22 additions & 4 deletions scanpipe/tests/test_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,24 @@ def test_scanpipe_scan_codebase_pipeline_integration_test(self):
expected_file = self.data_location / "is-npm-1.0.0_scan_codebase.json"
self.assertPipelineResultEqual(expected_file, result_file)

def test_scanpipe_scan_codebase_packages_does_not_create_packages(self):
    """Run scan_codebase_packages and check that nothing gets assembled."""
    project = Project.objects.create(name="Analysis")
    project.copy_input_from(self.data_location / "is-npm-1.0.0.tgz")

    run = project.add_pipeline("scan_codebase_packages")
    exitcode, out = run.make_pipeline_instance().execute()
    self.assertEqual(0, exitcode, msg=out)

    # Resources are collected, but no package or dependency instance exists.
    self.assertEqual(6, project.codebaseresources.count())
    self.assertEqual(0, project.discoveredpackages.count())
    self.assertEqual(0, project.discovereddependencies.count())

def test_scanpipe_scan_codebase_can_process_wheel(self):
pipeline_name = "scan_codebase"
project1 = Project.objects.create(name="Analysis")
Expand Down Expand Up @@ -956,9 +974,9 @@ def mock_request_post_return(url, data, timeout):
exitcode, out = pipeline.execute()
self.assertEqual(0, exitcode, msg=out)

self.assertIn("Populating PurlDB with 2 DiscoveredPackage", run.log)
self.assertIn("Successfully queued 2 PURLs for indexing in PurlDB", run.log)
self.assertIn("Populating PurlDB with 1 DiscoveredPackage", run.log)
self.assertIn("Successfully queued 1 PURLs for indexing in PurlDB", run.log)
self.assertIn("1 PURLs were already present in PurlDB index queue", run.log)
self.assertIn("Couldn't index 1 unsupported PURLs", run.log)
self.assertIn("Populating PurlDB with 4 DiscoveredDependency", run.log)
self.assertIn("Successfully queued 4 PURLs for indexing in PurlDB", run.log)
self.assertIn("Populating PurlDB with 2 DiscoveredDependency", run.log)
self.assertIn("Successfully queued 2 PURLs for indexing in PurlDB", run.log)

0 comments on commit ade2953

Please sign in to comment.