From 73b99be376e4fc791d352d8c0a9c85fdd25cd685 Mon Sep 17 00:00:00 2001 From: Thomas Druez Date: Tue, 11 Apr 2023 15:56:27 +0400 Subject: [PATCH] Refactor java to class matching #659 Signed-off-by: Thomas Druez --- scanpipe/pipes/d2d.py | 91 +++++++++++++++---- .../templates/scanpipe/codebase_relation.html | 8 +- 2 files changed, 78 insertions(+), 21 deletions(-) diff --git a/scanpipe/pipes/d2d.py b/scanpipe/pipes/d2d.py index 05d92bece..4be111c6f 100644 --- a/scanpipe/pipes/d2d.py +++ b/scanpipe/pipes/d2d.py @@ -20,6 +20,7 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/nexB/scancode.io for support and download. +from collections import defaultdict from pathlib import Path from scanpipe import pipes @@ -31,8 +32,7 @@ def checksum_match(project, checksum_field): """Match using checksum.""" - project_files = project.codebaseresources.files().has_no_relation().not_empty() - + project_files = project.codebaseresources.files().not_empty() from_resources = project_files.from_codebase().has_value(checksum_field) to_resources = project_files.to_codebase().has_value(checksum_field) @@ -48,54 +48,107 @@ def checksum_match(project, checksum_field): ) +def count_similar_segments_reverse(path1, path2): + """ + Count the number of similar path segments between two paths, + starting from the rightmost segment. + """ + segments1 = path1.split("/") + segments2 = path2.split("/") + count = 0 + + while segments1 and segments2 and segments1[-1] == segments2[-1]: + count += 1 + segments1.pop() + segments2.pop() + + return count + + def java_to_class_match(project): """Match a .java source to its compiled .class""" from_extension = ".java" to_extension = ".class" - project_files = project.codebaseresources.files().has_no_relation() - from_resources = project_files.from_codebase() + project_files = project.codebaseresources.files() + from_resources = project_files.from_codebase().has_no_relation() to_resources = project_files.to_codebase() - for resource in from_resources.filter(extension=from_extension): - parts = resource.path[len(FROM) : -len(from_extension)] - matches = to_resources.filter(path=f"{TO}{parts}{to_extension}") - for match in matches: + for resource in from_resources.filter(name__endswith=from_extension): + to_name = resource.name.replace(from_extension, to_extension) + name_matches = to_resources.filter(name=to_name) + path_parts = Path(resource.path.lstrip("/")).parts + + match_by_similarity_count = defaultdict(list) + for match in name_matches: + path1 = "/".join(resource.path.split("/")[:-1]) + path2 = "/".join(match.path.split("/")[:-1]) + + similarity_count = count_similar_segments_reverse(path1, path2) + match_by_similarity_count[similarity_count].append(match) + + if not match_by_similarity_count: + continue + + max_similarity_count = max(match_by_similarity_count.keys()) + best_matches = match_by_similarity_count[max_similarity_count] + for match in best_matches: pipes.make_relationship( from_resource=resource, to_resource=match, relationship=CodebaseRelation.Relationship.COMPILED_TO, match_type="java_to_class", + extra_data={ + "path_score": f"{max_similarity_count + 1}/{len(path_parts) - 1}", + }, ) +# TODO: Remove duplication with java_to_class_match def java_to_inner_class_match(project): """Match a .java source to compiled $.class""" from_extension = ".java" to_extension = ".class" - project_files = project.codebaseresources.files().has_no_relation() + project_files = project.codebaseresources.files() from_resources = project_files.from_codebase() - to_resources = project_files.to_codebase() + to_resources = project_files.to_codebase().has_no_relation() - inner_classes = to_resources.filter(name__contains="$", extension=to_extension) - for to_resource in inner_classes: - parts = to_resource.path[len(TO) : -len(to_extension)] - source_java = "/".join(parts.split("/")[:-1] + to_resource.name.split("$")[:1]) - matches = from_resources.filter(path=f"{FROM}{source_java}{from_extension}") - for match in matches: + inner_classes = to_resources.filter(name__contains="$", name__endswith=to_extension) + for resource in inner_classes: + from_name = resource.name.split("$")[0] + from_extension + name_matches = from_resources.filter(name=from_name) + path_parts = Path(resource.path.lstrip("/")).parts + + match_by_similarity_count = defaultdict(list) + for match in name_matches: + path1 = "/".join(resource.path.split("/")[:-1]) + path2 = "/".join(match.path.split("/")[:-1]) + + similarity_count = count_similar_segments_reverse(path1, path2) + match_by_similarity_count[similarity_count].append(match) + + if not match_by_similarity_count: + continue + + max_similarity_count = max(match_by_similarity_count.keys()) + best_matches = match_by_similarity_count[max_similarity_count] + for match in best_matches: pipes.make_relationship( from_resource=match, - to_resource=to_resource, + to_resource=resource, relationship=CodebaseRelation.Relationship.COMPILED_TO, match_type="java_to_class", + extra_data={ + "path_score": f"{max_similarity_count + 1}/{len(path_parts) - 1}", + }, ) def path_match(project): """Match using path similarities.""" - project_files = project.codebaseresources.files().has_no_relation() - from_resources = project_files.from_codebase() + project_files = project.codebaseresources.files() + from_resources = project_files.from_codebase().has_no_relation() to_resources = project_files.to_codebase() for resource in from_resources: diff --git a/scanpipe/templates/scanpipe/codebase_relation.html b/scanpipe/templates/scanpipe/codebase_relation.html index 31a1ec0bd..1951f18bc 100644 --- a/scanpipe/templates/scanpipe/codebase_relation.html +++ b/scanpipe/templates/scanpipe/codebase_relation.html @@ -41,8 +41,10 @@ {% endif %} {{ relation.match_type }} + {% if relation.extra_data.path_score %} + {{ relation.extra_data.path_score }} + {% endif %} {% if relation.match_type == "path" %} - (score: {{ relation.extra_data.path_score }}) diff {% endif %} @@ -97,8 +99,10 @@ {% endif %} {{ relation.match_type }} + {% if relation.extra_data.path_score %} + {{ relation.extra_data.path_score }} + {% endif %} {% if relation.match_type == "path" %} - (score: {{ relation.extra_data.path_score }}) diff {% endif %}