diff --git a/scanpipe/pipelines/populate_purldb.py b/scanpipe/pipelines/populate_purldb.py
index b525fcf54..5e28469b9 100644
--- a/scanpipe/pipelines/populate_purldb.py
+++ b/scanpipe/pipelines/populate_purldb.py
@@ -22,6 +22,7 @@
 
 from scanpipe.pipelines import Pipeline
 from scanpipe.pipes import purldb
+from scanpipe.pipes import scancode
 
 
 class PopulatePurlDB(Pipeline):
@@ -32,6 +33,7 @@ def steps(cls):
         return (
             cls.populate_purldb_with_discovered_packages,
             cls.populate_purldb_with_discovered_dependencies,
+            cls.populate_purldb_with_detected_purls,
         )
 
     def populate_purldb_with_discovered_packages(self):
@@ -48,12 +50,34 @@ def populate_purldb_with_discovered_dependencies(self):
             package_type="DiscoveredDependency",
         )
 
+    def populate_purldb_with_detected_purls(self):
+        """Add PURLs detected in resource-level package data to PurlDB."""
+        no_packages_and_no_dependencies = all(
+            [
+                not self.project.discoveredpackages.exists(),
+                not self.project.discovereddependencies.exists(),
+            ]
+        )
+        # Even when there are no packages/dependencies, resource level
+        # package data could be detected (i.e. when we detect packages,
+        # but skip the assembly step that creates
+        # package/dependency instances)
+        if no_packages_and_no_dependencies:
+            packages = scancode.get_packages_with_purl_from_resources(self.project)
+            self.feed_purldb(
+                packages=list(packages),
+                package_type="DiscoveredPackage",
+            )
+
     def feed_purldb(self, packages, package_type):
         """Feed PurlDB with list of PURLs for indexing."""
         if not purldb.is_available():
             raise Exception("PurlDB is not available.")
 
-        package_urls = [package.purl for package in packages]
+        # Drop empty PURLs and duplicates while keeping first-seen order.
+        package_urls = list(
+            dict.fromkeys(package.purl for package in packages if package.purl)
+        )
         self.log(f"Populating PurlDB with {len(package_urls):,d} {package_type}")
 
         response = purldb.submit_purls(purls=package_urls)
diff --git a/scanpipe/pipelines/scan_codebase_packages.py b/scanpipe/pipelines/scan_codebase_packages.py
index fb184599c..d9924eff9 100644
--- a/scanpipe/pipelines/scan_codebase_packages.py
+++ b/scanpipe/pipelines/scan_codebase_packages.py
@@ -21,10 +21,15 @@
 # Visit https://github.com/nexB/scancode.io for support and download.
 
 from scanpipe.pipelines.scan_codebase import ScanCodebase
+from scanpipe.pipes import scancode
 
 
 class ScanCodebasePackages(ScanCodebase):
-    """Scan a codebase for packages only."""
+    """
+    Scan a codebase for package data only for the purpose of getting
+    all purls (without creating package/dependency instances by
+    package assembly).
+    """
 
     @classmethod
     def steps(cls):
@@ -36,3 +41,10 @@ def steps(cls):
             cls.flag_ignored_resources,
             cls.scan_for_application_packages,
         )
+
+    def scan_for_application_packages(self):
+        """Scan unknown resources for packages information."""
+        # `assemble` is set to False because here in this pipeline we
+        # only detect package_data in resources without creating
+        # Package/Dependency instances, to get all the purls from a codebase.
+        scancode.scan_for_application_packages(self.project, assemble=False)
diff --git a/scanpipe/pipes/scancode.py b/scanpipe/pipes/scancode.py
index e23a3906c..1a4323a1f 100644
--- a/scanpipe/pipes/scancode.py
+++ b/scanpipe/pipes/scancode.py
@@ -343,11 +343,16 @@ def scan_for_files(project, resource_qs=None, progress_logger=None):
     )
 
 
-def scan_for_application_packages(project, progress_logger=None):
+def scan_for_application_packages(project, assemble=True, progress_logger=None):
     """
-    Run a package scan on files without a status for a `project`,
-    then create DiscoveredPackage and DiscoveredDependency instances
-    from the detected package data
+    Run a package scan on resources without a status for a `project`,
+    and add them in their respective `package_data` attribute.
+    Then create DiscoveredPackage and DiscoveredDependency instances
+    from the detected package data optionally. If the `assemble` argument
+    is set to `True`, DiscoveredPackage and DiscoveredDependency instances
+    are created and added to the project by assembling resource level
+    package_data, and resources which belong in the DiscoveredPackage
+    instance, are assigned to that package.
 
     Multiprocessing is enabled by default on this pipe, the number of processes can be
     controlled through the SCANCODEIO_PROCESSES setting.
@@ -365,7 +370,8 @@ def scan_for_application_packages(project, progress_logger=None):
 
     # Iterate through CodebaseResources with Package data and handle them using
     # the proper Package handler from packagedcode.
-    assemble_packages(project=project)
+    if assemble:
+        assemble_packages(project=project)
 
 
 def add_resource_to_package(package_uid, resource, project):
@@ -436,6 +442,24 @@ def assemble_packages(project):
                     logger.info(f"Unknown Package assembly item type: {item!r}")
 
 
+def get_packages_with_purl_from_resources(project):
+    """
+    Yield Dependency or PackageData objects created from detected package_data
+    in all the project resources. Both Dependency and PackageData objects have
+    the `purl` attribute with a valid purl.
+    """
+    for resource in project.codebaseresources.has_package_data():
+        for package_mapping in resource.package_data:
+            for dependency in package_mapping.get("dependencies") or []:
+                yield packagedcode_models.Dependency.from_dependent_package(
+                    dependent_package=dependency,
+                    datafile_path=resource.path,
+                    datasource_id=package_mapping.get("datasource_id"),
+                    package_uid=None,
+                )
+            yield packagedcode_models.PackageData.from_dict(mapping=package_mapping)
+
+
 def get_pretty_params(args):
     """Format provided ``args`` for the ``pretty_params`` run_scan argument."""
     return {f"--{key.replace('_', '-')}": value for key, value in args.items()}
diff --git a/scanpipe/tests/pipes/test_scancode.py b/scanpipe/tests/pipes/test_scancode.py
index 0d625a7be..9632cbd97 100644
--- a/scanpipe/tests/pipes/test_scancode.py
+++ b/scanpipe/tests/pipes/test_scancode.py
@@ -397,6 +397,23 @@ def test_scanpipe_pipes_scancode_virtual_codebase(self):
         self.assertEqual(1, DiscoveredPackage.objects.count())
         self.assertEqual(1, DiscoveredDependency.objects.count())
 
+    def test_scanpipe_pipes_scancode_get_packages_with_purl_from_resources(self):
+        project = Project.objects.create(name="Analysis")
+        filename = "package_assembly_codebase.json"
+        project_scan_location = self.data_location / "scancode" / filename
+        input.load_inventory_from_toolkit_scan(project, project_scan_location)
+
+        project.discoveredpackages.all().delete()
+        self.assertEqual(0, project.discoveredpackages.count())
+
+        packages = list(scancode.get_packages_with_purl_from_resources(project))
+
+        package_purl_exists = [True for package in packages if package.purl]
+        package_purls = [package.purl for package in packages]
+        self.assertTrue(package_purl_exists)
+        self.assertEqual(len(package_purl_exists), 1)
+        self.assertIn("pkg:npm/test@0.1.0", package_purls)
+
     def test_scanpipe_pipes_scancode_run_scancode(self):
         project = Project.objects.create(name="name with space")
         output = scancode.run_scan(
diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py
index 55a83408c..bc4ce2bdb 100644
--- a/scanpipe/tests/test_pipelines.py
+++ b/scanpipe/tests/test_pipelines.py
@@ -445,6 +445,24 @@ def test_scanpipe_scan_codebase_pipeline_integration_test(self):
         expected_file = self.data_location / "is-npm-1.0.0_scan_codebase.json"
         self.assertPipelineResultEqual(expected_file, result_file)
 
+    def test_scanpipe_scan_codebase_packages_does_not_create_packages(self):
+        pipeline_name = "scan_codebase_packages"
+        project1 = Project.objects.create(name="Analysis")
+
+        filename = "is-npm-1.0.0.tgz"
+        input_location = self.data_location / filename
+        project1.copy_input_from(input_location)
+
+        run = project1.add_pipeline(pipeline_name)
+        pipeline = run.make_pipeline_instance()
+
+        exitcode, out = pipeline.execute()
+        self.assertEqual(0, exitcode, msg=out)
+
+        self.assertEqual(6, project1.codebaseresources.count())
+        self.assertEqual(0, project1.discoveredpackages.count())
+        self.assertEqual(0, project1.discovereddependencies.count())
+
     def test_scanpipe_scan_codebase_can_process_wheel(self):
         pipeline_name = "scan_codebase"
         project1 = Project.objects.create(name="Analysis")
@@ -956,9 +974,9 @@ def mock_request_post_return(url, data, timeout):
         exitcode, out = pipeline.execute()
         self.assertEqual(0, exitcode, msg=out)
 
-        self.assertIn("Populating PurlDB with 2 DiscoveredPackage", run.log)
-        self.assertIn("Successfully queued 2 PURLs for indexing in PurlDB", run.log)
+        self.assertIn("Populating PurlDB with 1 DiscoveredPackage", run.log)
+        self.assertIn("Successfully queued 1 PURLs for indexing in PurlDB", run.log)
         self.assertIn("1 PURLs were already present in PurlDB index queue", run.log)
         self.assertIn("Couldn't index 1 unsupported PURLs", run.log)
-        self.assertIn("Populating PurlDB with 4 DiscoveredDependency", run.log)
-        self.assertIn("Successfully queued 4 PURLs for indexing in PurlDB", run.log)
+        self.assertIn("Populating PurlDB with 2 DiscoveredDependency", run.log)
+        self.assertIn("Successfully queued 2 PURLs for indexing in PurlDB", run.log)