Skip to content

Commit

Permalink
Refactor java to class matching #659
Browse files Browse the repository at this point in the history
Signed-off-by: Thomas Druez <[email protected]>
  • Loading branch information
tdruez committed Apr 11, 2023
1 parent faba7cf commit 73b99be
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 21 deletions.
91 changes: 72 additions & 19 deletions scanpipe/pipes/d2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.

from collections import defaultdict
from pathlib import Path

from scanpipe import pipes
Expand All @@ -31,8 +32,7 @@

def checksum_match(project, checksum_field):
"""Match using checksum."""
project_files = project.codebaseresources.files().has_no_relation().not_empty()

project_files = project.codebaseresources.files().not_empty()
from_resources = project_files.from_codebase().has_value(checksum_field)
to_resources = project_files.to_codebase().has_value(checksum_field)

Expand All @@ -48,54 +48,107 @@ def checksum_match(project, checksum_field):
)


def count_similar_segments_reverse(path1, path2):
    """
    Count the number of similar path segments between two paths,
    starting from the rightmost segment.

    Both paths are split on "/" and compared pairwise from the right.
    Counting stops at the first pair of segments that differ, or as soon as
    the shorter path is exhausted.

    Empty segments (produced by an empty path, a leading or trailing slash,
    or a doubled slash) are ignored so that they are never counted as a
    matching segment.
    """
    segments1 = [segment for segment in path1.split("/") if segment]
    segments2 = [segment for segment in path2.split("/") if segment]

    count = 0
    # zip() stops at the shorter sequence, which covers the "one path is
    # exhausted" case without mutating the lists.
    for segment1, segment2 in zip(reversed(segments1), reversed(segments2)):
        if segment1 != segment2:
            break
        count += 1

    return count


def java_to_class_match(project):
    """
    Match a .java source to its compiled .class

    For each "from" file whose name ends with ".java", taken from the
    ``has_no_relation()`` queryset, the "to" files carrying the same name
    with a ".class" extension are looked up. The candidates are grouped by
    the number of trailing directory segments they share with the source
    file, and only the group with the highest count is kept. Each kept
    candidate is passed to ``pipes.make_relationship`` with a ``path_score``
    entry in ``extra_data``.
    """
    from_extension = ".java"
    to_extension = ".class"

    project_files = project.codebaseresources.files()
    from_resources = project_files.from_codebase().has_no_relation()
    to_resources = project_files.to_codebase()

    for resource in from_resources.filter(name__endswith=from_extension):
        # Swap the trailing extension only: ``str.replace`` would also rewrite
        # a ".java" located earlier in the file name.
        to_name = resource.name[: -len(from_extension)] + to_extension
        name_matches = to_resources.filter(name=to_name)
        path_parts = Path(resource.path.lstrip("/")).parts
        # Directory of the source file, identical for every candidate below.
        resource_directory = resource.path.rpartition("/")[0]

        match_by_similarity_count = defaultdict(list)
        for match in name_matches:
            match_directory = match.path.rpartition("/")[0]
            similarity_count = count_similar_segments_reverse(
                resource_directory, match_directory
            )
            match_by_similarity_count[similarity_count].append(match)

        if not match_by_similarity_count:
            continue

        max_similarity_count = max(match_by_similarity_count)
        best_matches = match_by_similarity_count[max_similarity_count]
        # NOTE(review): the "+ 1" presumably accounts for the matching file
        # name and the "- 1" for the codebase root segment; confirm.
        path_score = f"{max_similarity_count + 1}/{len(path_parts) - 1}"
        for match in best_matches:
            pipes.make_relationship(
                from_resource=resource,
                to_resource=match,
                relationship=CodebaseRelation.Relationship.COMPILED_TO,
                match_type="java_to_class",
                extra_data={
                    "path_score": path_score,
                },
            )


# TODO: Remove duplication with java_to_class_match
def java_to_inner_class_match(project):
    """
    Match a .java source to compiled $.class

    For each "to" file whose name contains "$" and ends with ".class", taken
    from the ``has_no_relation()`` queryset, the "from" files named after the
    part of the name located before the first "$", plus ".java", are looked
    up. The candidates are grouped by the number of trailing directory
    segments they share with the class file, and only the group with the
    highest count is kept. Each kept candidate is passed to
    ``pipes.make_relationship`` as the ``from_resource``, with a
    ``path_score`` entry in ``extra_data``.
    """
    from_extension = ".java"
    to_extension = ".class"

    project_files = project.codebaseresources.files()
    from_resources = project_files.from_codebase()
    to_resources = project_files.to_codebase().has_no_relation()

    inner_classes = to_resources.filter(
        name__contains="$",
        name__endswith=to_extension,
    )
    for resource in inner_classes:
        # "Outer$Inner.class" -> "Outer.java"
        from_name = resource.name.split("$")[0] + from_extension
        name_matches = from_resources.filter(name=from_name)
        path_parts = Path(resource.path.lstrip("/")).parts
        # Directory of the class file, identical for every candidate below.
        resource_directory = resource.path.rpartition("/")[0]

        match_by_similarity_count = defaultdict(list)
        for match in name_matches:
            match_directory = match.path.rpartition("/")[0]
            similarity_count = count_similar_segments_reverse(
                resource_directory, match_directory
            )
            match_by_similarity_count[similarity_count].append(match)

        if not match_by_similarity_count:
            continue

        max_similarity_count = max(match_by_similarity_count)
        best_matches = match_by_similarity_count[max_similarity_count]
        # NOTE(review): the "+ 1" presumably accounts for the matching file
        # name and the "- 1" for the codebase root segment; confirm.
        path_score = f"{max_similarity_count + 1}/{len(path_parts) - 1}"
        for match in best_matches:
            pipes.make_relationship(
                from_resource=match,
                to_resource=resource,
                relationship=CodebaseRelation.Relationship.COMPILED_TO,
                match_type="java_to_class",
                extra_data={
                    "path_score": path_score,
                },
            )


def path_match(project):
"""Match using path similarities."""
project_files = project.codebaseresources.files().has_no_relation()
from_resources = project_files.from_codebase()
project_files = project.codebaseresources.files()
from_resources = project_files.from_codebase().has_no_relation()
to_resources = project_files.to_codebase()

for resource in from_resources:
Expand Down
8 changes: 6 additions & 2 deletions scanpipe/templates/scanpipe/codebase_relation.html
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,10 @@
{% endif %}
<td>
{{ relation.match_type }}
{% if relation.extra_data.path_score %}
{{ relation.extra_data.path_score }}
{% endif %}
{% if relation.match_type == "path" %}
(score: {{ relation.extra_data.path_score }})
<a href="/project/{{ project.uuid }}/resources/diff/?pk_a={{ resource.path }}&pk_b={{ relation.to_resource.path }}" target="_blank">diff</a>
{% endif %}
</td>
Expand Down Expand Up @@ -97,8 +99,10 @@
{% endif %}
<td>
{{ relation.match_type }}
{% if relation.extra_data.path_score %}
{{ relation.extra_data.path_score }}
{% endif %}
{% if relation.match_type == "path" %}
(score: {{ relation.extra_data.path_score }})
<a href="/project/{{ project.uuid }}/resources/diff/?pk_a={{ resource.path }}&pk_b={{ relation.from_resource.path }}" target="_blank">diff</a>
{% endif %}
</td>
Expand Down

0 comments on commit 73b99be

Please sign in to comment.