Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added framework to test HTML reports #2283

Merged
merged 21 commits into from
Feb 26, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/misc-migtests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,12 @@ jobs:
docker restart ${{ job.services.postgres.id }}
sleep 10

- name: Install python3 and psycopg2
- name: Install python3, psycopg2 and bs4
run: |
sudo apt install -y python3
sudo apt install -y libpq-dev
sudo apt install python3-psycopg2
pip3 install bs4 # Install BeautifulSoup4 module for HTML report validation

#TODO Remove the install PG 17 command once we do that in installer script
- name: Run installer script to setup voyager
Expand Down
233 changes: 233 additions & 0 deletions migtests/scripts/compare-html-reports.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
#!/usr/bin/env python3

import argparse
import re
from bs4 import BeautifulSoup
import difflib
import sys
from collections import Counter

def normalize_text(text):
"""Normalize and clean up text content."""
text = text.replace('\xa0', ' ') # Replace non-breaking spaces with regular spaces
text = text.replace('\u200B', '') # Remove zero-width spaces
text = re.sub(r'\s+', ' ', text) # Collapse any whitespace sequences
return text.strip() # Remove leading/trailing spaces

def extract_and_normalize_texts(elements):
"""Extract and normalize inner text from HTML elements."""
return [normalize_text(el.get_text(separator=" ")) for el in elements]

def extract_divs(soup):
"""Extract and normalize divs, optionally sorting them based on certain conditions."""
all_divs = soup.find_all("div")
filtered_divs = []
should_sort = False

for div in all_divs:
# Sorting the content within "Sharding Recommendations" since the tables can be in different order
prev_h2 = div.find_previous("h2")
if prev_h2 and "Sharding Recommendations" in prev_h2.get_text():
should_sort = True
else:
should_sort = False # Reset should_sort for each div

# Skipping the parent wrapper div since it causes issues to be reported twice
if "wrapper" not in div.get("class", []): # Exclude wrapper divs
div_text = normalize_text(div.get_text(separator=" ")).strip()
if should_sort:
div_text = " ".join(sorted(div_text.split())) # Sort content within div
filtered_divs.append(div_text)

return sorted(filtered_divs) if any(s for s in filtered_divs) else filtered_divs


def extract_paragraphs(soup):
"""Extract and filter paragraphs, skipping specific ones based on conditions."""
paragraphs = soup.find_all("p")
skip_first_paragraph = False
filtered_paragraphs = []

for p in paragraphs:
# Skip the first <p> after the <h2> Migration Complexity Explanation since nesting causes issues to be reported twice
prev_element = p.find_previous("h2")
if prev_element and "Migration Complexity Explanation" in prev_element.get_text():
if not skip_first_paragraph:
skip_first_paragraph = True
continue
# Skip paragraph that starts with "Database Version:" as it vary according to the environment
if p.find("strong") and "Database Version:" in p.get_text():
continue
filtered_paragraphs.append(p)

return extract_and_normalize_texts(filtered_paragraphs)

def normalize_table_names(values):
"""Extract and normalize the table names from <td> and <th> and sort them."""
table_names = []
for v in values:
#Get the text and split it into lines
text_lines = v.get_text(separator="\n").split("\n")

#Filter out empty lines and normalize each line
normalized_names = [normalize_text(line).strip() for line in text_lines if line.strip()]

table_names.extend(normalized_names)
return sorted(table_names)

def extract_table_data(tables):
"""Sort tables by rows and their contents, including headers."""
sorted_tables = []
for table in tables:
rows = table.find_all("tr")
table_data = []
table_headers = []
for row in rows:
columns = row.find_all("td")
headers = row.find_all("th")
if len(columns) > 1:
table_data.append(normalize_table_names(columns))
if len(headers) > 1:
table_headers.append(normalize_table_names(headers))
if table_data:
sorted_tables.append(table_data)
if table_headers:
sorted_tables.append(table_headers)
return sorted_tables

def extract_html_data(html_content):
"""Main function to extract structured data from HTML content."""
soup = BeautifulSoup(html_content, 'html.parser')

data = {
"title": normalize_text(soup.title.string) if soup.title and soup.title.string else "No Title",
"headings": extract_and_normalize_texts(soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])),
"paragraphs": extract_paragraphs(soup),
"tables": extract_table_data(soup.find_all("table")),
"links": {
k: v for k, v in sorted(
{normalize_text(a.get("href") or ""): normalize_text(a.text) for a in soup.find_all("a")}.items()
)
},
"spans": extract_and_normalize_texts(soup.find_all("span")),
"divs": extract_divs(soup)
}

return data

def generate_diff_list(list1, list2, section_name, file1_path, file2_path):
"""Generate a structured diff for ordered lists (headings, paragraphs, spans, divs) showing only differences."""
if section_name == "tables":
# If list1 and list2 are already lists of table names, directly sort them
list1 = [sorted(table) if isinstance(table, list) else table for table in list1]
list2 = [sorted(table) if isinstance(table, list) else table for table in list2]
diff = list(difflib.unified_diff(
[f"{i+1}. {item}" for i, item in enumerate(list1)],
[f"{i+1}. {item}" for i, item in enumerate(list2)],
fromfile=f"{file1_path}",
tofile=f"{file2_path}",
lineterm="",
n=0 # Ensures ONLY differences are shown (no context lines)
))
return "\n".join(diff) or None # Return None if no differences

def dict_to_list(dict_data):
"""Convert dictionary to list of formatted strings."""
return [f"{k} -> {v}" for k, v in dict_data.items()]

def compare_html_tags(html_data1, html_data2):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comparing unique tags is one.

How about one enhancement - also match the counts for each tags(number of times each tag appeared)
I guess it should be small change, lib might be providing that.

"""Compare the unique tags and their counts in the two HTML reports."""

def get_tag_counts(html_content):
"""Extracts all tag names and their counts from the given HTML content."""
soup = BeautifulSoup(html_content, 'html.parser')
return Counter(tag.name for tag in soup.find_all())

tags_count1 = get_tag_counts(html_data1)
tags_count2 = get_tag_counts(html_data2)

tags1 = set(tags_count1.keys())
tags2 = set(tags_count2.keys())

missing_tags_in_file1 = tags2 - tags1 # Tags in file2 but missing in file1
missing_tags_in_file2 = tags1 - tags2 # Tags in file1 but missing in file2

differences = {}

if missing_tags_in_file1:
differences["extra_tags_in_the_actual_report"] = "\n".join(missing_tags_in_file1)
if missing_tags_in_file2:
differences["missing_tags_in_the_actual_report"] = "\n".join(missing_tags_in_file2)

# Check for tag count mismatches
tag_count_mismatches = {
tag: f"Expected: {tags_count1[tag]}, Found: {tags_count2[tag]}"
for tag in (tags1 & tags2) # Tags present in both files
if tags_count1[tag] != tags_count2[tag]
}

if tag_count_mismatches:
differences["tag_count_mismatches"] = "\n".join(
f"{tag}: {count}" for tag, count in tag_count_mismatches.items()
)

return differences

def compare_html_data(html_data1, html_data2, file1, file2):

differences = {}

for section in html_data1.keys():
if html_data1[section] != html_data2[section]:
if isinstance(html_data1[section], list): # For headings, paragraphs, spans, divs
diff = generate_diff_list(html_data1[section], html_data2[section], section, file1, file2)
elif isinstance(html_data1[section], dict): # For links dictionary
diff = generate_diff_list(
dict_to_list(html_data1[section]),
dict_to_list(html_data2[section]),
section, file1, file2
)
else: # Title (single string)
diff = generate_diff_list([html_data1[section]], [html_data2[section]], section, file1, file2)

if diff: # Only store sections that have differences
differences[section] = diff

return differences

def read_and_compare_html_files(file1, file2):
"""Read and extract structured data from HTML files."""
with open(file1, "r", encoding="utf-8") as f1, open(file2, "r", encoding="utf-8") as f2:
html_data1 = f1.read()
html_data2 = f2.read()

tags_differences = compare_html_tags(html_data1, html_data2)
data_differences = compare_html_data(extract_html_data(html_data1), extract_html_data(html_data2), file1, file2)

# Print differences here
if not tags_differences and not data_differences:
print("The reports are matching.")
else:
print("The reports are not matching:")

if tags_differences:
print("\n=== TAG DIFFERENCES ===")
for section, diff_text in tags_differences.items():
print(f"\n=== {section.replace('_', ' ').upper()} ===")
print(diff_text)

if data_differences:
for section, diff_text in data_differences.items():
print(f"\n=== {section.upper()} DIFFERENCES ===")
print(diff_text)

sys.exit(1)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Compare two HTML reports.")
parser.add_argument("report1", help="Path to the first HTML report")
parser.add_argument("report2", help="Path to the second HTML report")

args = parser.parse_args()
read_and_compare_html_files(args.report1, args.report2)
16 changes: 13 additions & 3 deletions migtests/scripts/run-validate-assessment-report.sh
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,22 @@ main() {
# Checking if the assessment reports were created
if [ -f "${EXPORT_DIR}/assessment/reports/migration_assessment_report.html" ] && [ -f "${EXPORT_DIR}/assessment/reports/migration_assessment_report.json" ]; then
echo "Assessment reports created successfully."

echo "Checking for Failures"
validate_failure_reasoning "${EXPORT_DIR}/assessment/reports/migration_assessment_report.json"

echo "Comparing Report contents"
expected_file="${TEST_DIR}/expectedAssessmentReport.json"
actual_file="${EXPORT_DIR}/assessment/reports/migration_assessment_report.json"
compare_json_reports ${expected_file} ${actual_file}

echo "Comparing JSON report"
expected_json_file="${TEST_DIR}/expectedAssessmentReport.json"
actual_json_file="${EXPORT_DIR}/assessment/reports/migration_assessment_report.json"
compare_json_reports ${expected_json_file} ${actual_json_file}

echo "Comparing HTML report"
expected_html_file="${TEST_DIR}/expectedAssessmentReport.html"
actual_html_file="${EXPORT_DIR}/assessment/reports/migration_assessment_report.html"
${SCRIPTS}/compare-html-reports.py ${expected_html_file} ${actual_html_file}

else
echo "Error: Assessment reports were not created successfully."
cat_log_file "yb-voyager-assess-migration.log"
Expand Down
Loading