Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update debian package manifest parsing #3647

Merged
merged 8 commits into from
Feb 19, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ commoncode==31.0.3
construct==2.10.68
container-inspector==31.1.0
cryptography==37.0.4
debian-inspector==31.0.0
debian-inspector==31.1.0
dockerfile-parse==1.2.0
dparse2==0.7.0
extractcode==31.0.0
Expand Down
2 changes: 1 addition & 1 deletion setup-mini.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ install_requires =
colorama >= 0.3.9
commoncode >= 31.0.2
container-inspector >= 31.0.0
debian-inspector >= 31.0.0
debian-inspector >= 31.1.0
dparse2 >= 0.7.0
fasteners
fingerprints >= 0.6.0
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ install_requires =
colorama >= 0.3.9
commoncode >= 31.0.3
container-inspector >= 31.0.0
debian-inspector >= 31.0.0
debian-inspector >= 31.1.0
dparse2 >= 0.7.0
fasteners
fingerprints >= 0.6.0
Expand Down
8 changes: 6 additions & 2 deletions src/packagedcode/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,13 +208,17 @@
debian_copyright.DebianCopyrightFileInPackageHandler,
debian_copyright.DebianCopyrightFileInSourceHandler,

# TODO: consider activating? debian_copyright.StandaloneDebianCopyrightFileHandler,

debian.DebianDistrolessInstalledDatabaseHandler,

debian.DebianInstalledFilelistHandler,
debian.DebianInstalledMd5sumFilelistHandler,
debian.DebianInstalledStatusDatabaseHandler,
debian.DebianControlFileInSourceHandler,
debian.DebianDscFileHandler,
debian.DebianSourcePackageTarballHandler,
debian.DebianSourcePackageMetadataTarballHandler,
debian.DebianDebPackageHandler,
debian_copyright.StandaloneDebianCopyrightFileHandler
]

if on_linux:
Expand Down
122 changes: 102 additions & 20 deletions src/packagedcode/debian.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import os
import logging
from collections import Counter
from pathlib import Path

from commoncode import fileutils
Expand Down Expand Up @@ -137,6 +138,7 @@ def parse(cls, location):
debian_data=get_paragraph_data_from_file(location=location),
datasource_id=cls.datasource_id,
package_type=cls.default_package_type,
distro='debian',
)

@classmethod
Expand All @@ -157,15 +159,19 @@ class DebianControlFileInSourceHandler(models.DatafileHandler):

@classmethod
def parse(cls, location):
# TODO: we cannot know the distro from the name only
# NOTE: a control file in a source repo or debian.tar tarball can contain more than one package
debian_packages = []
for debian_data in get_paragraphs_data_from_file(location=location):
yield build_package_data(
debian_data,
datasource_id=cls.datasource_id,
package_type=cls.default_package_type,
debian_packages.append(
build_package_data(
debian_data=debian_data,
datasource_id=cls.datasource_id,
package_type=cls.default_package_type,
)
)

yield from populate_debian_namespace(debian_packages)

@classmethod
def assign_package_to_resources(cls, package, resource, codebase, package_adder):
# two levels up
Expand All @@ -191,11 +197,19 @@ def parse(cls, location):
location=location,
remove_pgp_signature=True,
)
yield build_package_data(

package_data_from_file = build_package_data_from_package_filename(
filename=os.path.basename(location),
datasource_id=cls.datasource_id,
package_type=cls.default_package_type,
)
package_data = build_package_data(
debian_data=debian_data,
datasource_id=cls.datasource_id,
package_type=cls.default_package_type,
)
package_data.update_purl_fields(package_data=package_data_from_file)
yield package_data

@classmethod
def assign_package_to_resources(cls, package, resource, codebase, package_adder):
Expand All @@ -214,13 +228,18 @@ class DebianInstalledStatusDatabaseHandler(models.DatafileHandler):
def parse(cls, location):
# note that we do not know yet the distro at this stage
# we could get it... but we get that later during assemble()
for debian_data in get_paragraphs_data_from_file(location):
yield build_package_data(
debian_data,
datasource_id=cls.datasource_id,
package_type=cls.default_package_type,
debian_packages = []
for debian_data in get_paragraphs_data_from_file(location=location):
debian_packages.append(
build_package_data(
debian_data=debian_data,
datasource_id=cls.datasource_id,
package_type=cls.default_package_type,
)
)

yield from populate_debian_namespace(debian_packages)

@classmethod
def assemble(cls, package_data, resource, codebase, package_adder):
# get the root resource of the rootfs
Expand Down Expand Up @@ -260,7 +279,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):

# We only need to adjust the md5sum/list path in the case of `same`
qualifiers = package_data.qualifiers or {}
architecture = qualifiers.get('architecture')
architecture = qualifiers.get('arch')

multi_arch = package_data.extra_data.get('multi_arch')

Expand Down Expand Up @@ -305,6 +324,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):
package.update(
package_data=package_data,
datafile_path=res.path,
check_compatible=False,
replace=False,
include_version=False,
include_qualifiers=False,
Expand Down Expand Up @@ -379,14 +399,18 @@ def parse(cls, location):
rootfs installation. distroless is derived from Debian but each package
has its own status file.
"""
for debian_data in get_paragraphs_data_from_file(location):
yield build_package_data(
debian_data,
datasource_id=cls.datasource_id,
package_type=cls.default_package_type,
distro='distroless',
debian_packages = []
for debian_data in get_paragraphs_data_from_file(location=location):
debian_packages.append(
build_package_data(
debian_data=debian_data,
datasource_id=cls.datasource_id,
package_type=cls.default_package_type,
)
)

yield from populate_debian_namespace(debian_packages)

@classmethod
def assemble(cls, package_data, resource, codebase, package_adder):
# get the root resource of the rootfs
Expand Down Expand Up @@ -523,6 +547,9 @@ def build_package_data_from_package_filename(filename, datasource_id, package_ty
"""

# TODO: we cannot know the distro from the name only
# A PURL without a namespace is invalid, so we need to
# have a default value for this
distro = 'debian'
deb = DebArchive.from_filename(filename=filename)

if deb.architecture:
Expand All @@ -538,6 +565,7 @@ def build_package_data_from_package_filename(filename, datasource_id, package_ty
datasource_id=datasource_id,
type=package_type,
name=deb.name,
namespace=distro,
version=version,
qualifiers=qualifiers,
)
Expand Down Expand Up @@ -598,7 +626,7 @@ def build_package_data(debian_data, datasource_id, package_type='deb', distro=No
qualifiers = {}
architecture = debian_data.get('architecture')
if architecture:
qualifiers['architecture'] = architecture
qualifiers['arch'] = architecture

extra_data = {}
# Multi-Arch can be: "foreign", "same", "allowed", "all", "optional" or
Expand Down Expand Up @@ -628,13 +656,27 @@ def build_package_data(debian_data, datasource_id, package_type='deb', distro=No
if keyword:
keywords.append(keyword)

# Get distro/namespace information from clues in package data
if not distro:
if version:
for clue, namespace in version_clues_for_namespace.items():
if clue in version:
distro = namespace
break

if maintainer:
for clue, namespace in maintainer_clues_for_namespace.items():
if clue in maintainer:
distro = namespace
break

source_packages = []
source = debian_data.get('source')
if source:
source_pkg_purl = PackageURL(
type=package_type,
name=source,
namespace=distro
namespace=distro,
).to_string()

source_packages.append(source_pkg_purl)
Expand All @@ -656,6 +698,46 @@ def build_package_data(debian_data, datasource_id, package_type='deb', distro=No
)


def populate_debian_namespace(packages):
    """
    For an iterable of debian `packages`, yield each package after setting
    the namespace of every package that has none to the most frequently
    occurring non-empty namespace among `packages`, or to the default
    'debian' namespace when no package has a namespace.

    Packages that already have a namespace are yielded unchanged. When
    several namespaces are equally frequent, the one seen first is used.
    """
    if not packages:
        return

    # Materialize once: the packages are walked twice (count, then yield),
    # which would silently exhaust a generator or other one-shot iterable.
    packages = list(packages)

    # Only count real namespaces: counting missing ones would let "no
    # namespace" win the vote and discard the clues found in other packages.
    namespaces_with_count = Counter(
        package.namespace
        for package in packages
        if package.namespace
    )
    distro = max(
        namespaces_with_count,
        key=namespaces_with_count.get,
        default='debian',
    )

    for package in packages:
        if not package.namespace:
            package.namespace = distro
        yield package


# Mapping of {substring looked up in a package version: namespace}.
version_clues_for_namespace = dict(
    deb='debian',
    ubuntu='ubuntu',
)


# Mapping of {substring looked up in a package maintainer: namespace},
# grouped by namespace. The insertion order of the clues is preserved.
maintainer_clues_for_namespace = {
    **dict.fromkeys(
        (
            'packages.debian.org',
            'lists.debian.org',
            'lists.alioth.debian.org',
            '@debian.org',
            'debian-init-diversity@',
        ),
        'debian',
    ),
    **dict.fromkeys(
        (
            'lists.ubuntu.com',
            '@canonical.com',
        ),
        'ubuntu',
    ),
}


ignored_root_dirs = {
'/.',
'/bin',
Expand Down
56 changes: 56 additions & 0 deletions src/packagedcode/debian_copyright.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
from debian_inspector.copyright import CopyrightLicenseParagraph
from debian_inspector.copyright import CopyrightHeaderParagraph
from debian_inspector.copyright import DebianCopyright
from debian_inspector.package import CodeMetadata
from debian_inspector.version import Version as DebVersion
from license_expression import ExpressionError
from license_expression import LicenseSymbolLike
from license_expression import Licensing
Expand Down Expand Up @@ -263,11 +265,65 @@ class StandaloneDebianCopyrightFileHandler(BaseDebianCopyrightFileHandler):
'*_copyright',
)

@classmethod
def is_datafile(cls, location, filetypes=tuple()):
    """
    Return a truthy value when the file at `location` is recognized by
    the parent class check and is not recognized as a datafile by either
    DebianCopyrightFileInPackageHandler or
    DebianCopyrightFileInSourceHandler.
    """
    recognized = super().is_datafile(location, filetypes=filetypes)
    if not recognized:
        # hand back the parent's own falsy value unchanged
        return recognized

    # the more specific copyright handlers take precedence over this one
    if DebianCopyrightFileInPackageHandler.is_datafile(location):
        return False

    return not DebianCopyrightFileInSourceHandler.is_datafile(location)

@classmethod
def assemble(cls, package_data, resource, codebase, package_adder):
    """
    Yield everything the parent class assemble() yields for the same
    arguments: there is no assembly step specific to this handler.
    """
    parent_assemble = super().assemble
    yield from parent_assemble(package_data, resource, codebase, package_adder)

@classmethod
def parse(cls, location):
    """
    Yield a single PackageData with the license/copyright information
    collected as for other copyright files. When purl fields can also be
    built from the copyright filename (as for a file obtained from an
    upstream metadata archive), they are used to update the purl fields
    of the yielded PackageData.
    """
    # keep only the last PackageData produced by the parent parser
    parsed_packages = list(super().parse(location))
    package_data = parsed_packages.pop()

    filename = os.path.basename(location)
    filename_package_data = build_package_data_from_metadata_filename(
        filename=filename,
        datasource_id=cls.datasource_id,
        package_type=cls.default_package_type,
    )
    # nothing to update when no package data could be built from the filename
    if filename_package_data:
        package_data.update_purl_fields(package_data=filename_package_data)

    yield package_data


def build_package_data_from_metadata_filename(filename, datasource_id, package_type):
    """
    Return a PackageData built from the `filename` of a Debian package
    metadata file, using `datasource_id` and `package_type` as-is.
    Return None when CodeMetadata.from_filename() raises a ValueError
    for this `filename`.
    """
    try:
        metadata = CodeMetadata.from_filename(filename=filename)
    except ValueError:
        return

    # TODO: we cannot know the distro from the name only.
    # A PURL without a namespace is invalid, so we fall back
    # to 'debian' as the default namespace.
    default_namespace = 'debian'

    # store the version as a plain string rather than as a DebVersion object
    parsed_version = metadata.version
    version = (
        str(parsed_version)
        if isinstance(parsed_version, DebVersion)
        else parsed_version
    )

    return models.PackageData(
        datasource_id=datasource_id,
        type=package_type,
        name=metadata.name,
        namespace=default_namespace,
        version=version,
    )


class NotReallyStructuredCopyrightFile(Exception):
"""
Expand Down
Loading
Loading