Skip to content

Commit

Permalink
Added test framework for HTML reports (#2283)
Browse files Browse the repository at this point in the history
  • Loading branch information
shubham-yb authored Feb 26, 2025
1 parent ec1cb4f commit 1cd7f89
Show file tree
Hide file tree
Showing 6 changed files with 20,108 additions and 4 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/misc-migtests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,12 @@ jobs:
docker restart ${{ job.services.postgres.id }}
sleep 10
- name: Install python3 and psycopg2
- name: Install python3, psycopg2 and bs4
run: |
sudo apt install -y python3
sudo apt install -y libpq-dev
sudo apt install python3-psycopg2
pip3 install bs4 # Install BeautifulSoup4 module for HTML report validation
#TODO Remove the install PG 17 command once we do that in installer script
- name: Run installer script to setup voyager
Expand Down
233 changes: 233 additions & 0 deletions migtests/scripts/compare-html-reports.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
#!/usr/bin/env python3

import argparse
import re
from bs4 import BeautifulSoup
import difflib
import sys
from collections import Counter

def normalize_text(text):
"""Normalize and clean up text content."""
text = text.replace('\xa0', ' ') # Replace non-breaking spaces with regular spaces
text = text.replace('\u200B', '') # Remove zero-width spaces
text = re.sub(r'\s+', ' ', text) # Collapse any whitespace sequences
return text.strip() # Remove leading/trailing spaces

def extract_and_normalize_texts(elements):
"""Extract and normalize inner text from HTML elements."""
return [normalize_text(el.get_text(separator=" ")) for el in elements]

def extract_divs(soup):
"""Extract and normalize divs, optionally sorting them based on certain conditions."""
all_divs = soup.find_all("div")
filtered_divs = []
should_sort = False

for div in all_divs:
# Sorting the content within "Sharding Recommendations" since the tables can be in different order
prev_h2 = div.find_previous("h2")
if prev_h2 and "Sharding Recommendations" in prev_h2.get_text():
should_sort = True
else:
should_sort = False # Reset should_sort for each div

# Skipping the parent wrapper div since it causes issues to be reported twice
if "wrapper" not in div.get("class", []): # Exclude wrapper divs
div_text = normalize_text(div.get_text(separator=" ")).strip()
if should_sort:
div_text = " ".join(sorted(div_text.split())) # Sort content within div
filtered_divs.append(div_text)

return sorted(filtered_divs) if any(s for s in filtered_divs) else filtered_divs


def extract_paragraphs(soup):
"""Extract and filter paragraphs, skipping specific ones based on conditions."""
paragraphs = soup.find_all("p")
skip_first_paragraph = False
filtered_paragraphs = []

for p in paragraphs:
# Skip the first <p> after the <h2> Migration Complexity Explanation since nesting causes issues to be reported twice
prev_element = p.find_previous("h2")
if prev_element and "Migration Complexity Explanation" in prev_element.get_text():
if not skip_first_paragraph:
skip_first_paragraph = True
continue
# Skip paragraph that starts with "Database Version:" as it vary according to the environment
if p.find("strong") and "Database Version:" in p.get_text():
continue
filtered_paragraphs.append(p)

return extract_and_normalize_texts(filtered_paragraphs)

def normalize_table_names(values):
"""Extract and normalize the table names from <td> and <th> and sort them."""
table_names = []
for v in values:
#Get the text and split it into lines
text_lines = v.get_text(separator="\n").split("\n")

#Filter out empty lines and normalize each line
normalized_names = [normalize_text(line).strip() for line in text_lines if line.strip()]

table_names.extend(normalized_names)
return sorted(table_names)

def extract_table_data(tables):
"""Sort tables by rows and their contents, including headers."""
sorted_tables = []
for table in tables:
rows = table.find_all("tr")
table_data = []
table_headers = []
for row in rows:
columns = row.find_all("td")
headers = row.find_all("th")
if len(columns) > 1:
table_data.append(normalize_table_names(columns))
if len(headers) > 1:
table_headers.append(normalize_table_names(headers))
if table_data:
sorted_tables.append(table_data)
if table_headers:
sorted_tables.append(table_headers)
return sorted_tables

def extract_html_data(html_content):
"""Main function to extract structured data from HTML content."""
soup = BeautifulSoup(html_content, 'html.parser')

data = {
"title": normalize_text(soup.title.string) if soup.title and soup.title.string else "No Title",
"headings": extract_and_normalize_texts(soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])),
"paragraphs": extract_paragraphs(soup),
"tables": extract_table_data(soup.find_all("table")),
"links": {
k: v for k, v in sorted(
{normalize_text(a.get("href") or ""): normalize_text(a.text) for a in soup.find_all("a")}.items()
)
},
"spans": extract_and_normalize_texts(soup.find_all("span")),
"divs": extract_divs(soup)
}

return data

def generate_diff_list(list1, list2, section_name, file1_path, file2_path):
"""Generate a structured diff for ordered lists (headings, paragraphs, spans, divs) showing only differences."""
if section_name == "tables":
# If list1 and list2 are already lists of table names, directly sort them
list1 = [sorted(table) if isinstance(table, list) else table for table in list1]
list2 = [sorted(table) if isinstance(table, list) else table for table in list2]
diff = list(difflib.unified_diff(
[f"{i+1}. {item}" for i, item in enumerate(list1)],
[f"{i+1}. {item}" for i, item in enumerate(list2)],
fromfile=f"{file1_path}",
tofile=f"{file2_path}",
lineterm="",
n=0 # Ensures ONLY differences are shown (no context lines)
))
return "\n".join(diff) or None # Return None if no differences

def dict_to_list(dict_data):
"""Convert dictionary to list of formatted strings."""
return [f"{k} -> {v}" for k, v in dict_data.items()]

def compare_html_tags(html_data1, html_data2):
"""Compare the unique tags and their counts in the two HTML reports."""

def get_tag_counts(html_content):
"""Extracts all tag names and their counts from the given HTML content."""
soup = BeautifulSoup(html_content, 'html.parser')
return Counter(tag.name for tag in soup.find_all())

tags_count1 = get_tag_counts(html_data1)
tags_count2 = get_tag_counts(html_data2)

tags1 = set(tags_count1.keys())
tags2 = set(tags_count2.keys())

missing_tags_in_file1 = tags2 - tags1 # Tags in file2 but missing in file1
missing_tags_in_file2 = tags1 - tags2 # Tags in file1 but missing in file2

differences = {}

if missing_tags_in_file1:
differences["extra_tags_in_the_actual_report"] = "\n".join(missing_tags_in_file1)
if missing_tags_in_file2:
differences["missing_tags_in_the_actual_report"] = "\n".join(missing_tags_in_file2)

# Check for tag count mismatches
tag_count_mismatches = {
tag: f"Expected: {tags_count1[tag]}, Found: {tags_count2[tag]}"
for tag in (tags1 & tags2) # Tags present in both files
if tags_count1[tag] != tags_count2[tag]
}

if tag_count_mismatches:
differences["tag_count_mismatches"] = "\n".join(
f"{tag}: {count}" for tag, count in tag_count_mismatches.items()
)

return differences

def compare_html_data(html_data1, html_data2, file1, file2):

differences = {}

for section in html_data1.keys():
if html_data1[section] != html_data2[section]:
if isinstance(html_data1[section], list): # For headings, paragraphs, spans, divs
diff = generate_diff_list(html_data1[section], html_data2[section], section, file1, file2)
elif isinstance(html_data1[section], dict): # For links dictionary
diff = generate_diff_list(
dict_to_list(html_data1[section]),
dict_to_list(html_data2[section]),
section, file1, file2
)
else: # Title (single string)
diff = generate_diff_list([html_data1[section]], [html_data2[section]], section, file1, file2)

if diff: # Only store sections that have differences
differences[section] = diff

return differences

def read_and_compare_html_files(file1, file2):
"""Read and extract structured data from HTML files."""
with open(file1, "r", encoding="utf-8") as f1, open(file2, "r", encoding="utf-8") as f2:
html_data1 = f1.read()
html_data2 = f2.read()

tags_differences = compare_html_tags(html_data1, html_data2)
data_differences = compare_html_data(extract_html_data(html_data1), extract_html_data(html_data2), file1, file2)

# Print differences here
if not tags_differences and not data_differences:
print("The reports are matching.")
else:
print("The reports are not matching:")

if tags_differences:
print("\n=== TAG DIFFERENCES ===")
for section, diff_text in tags_differences.items():
print(f"\n=== {section.replace('_', ' ').upper()} ===")
print(diff_text)

if data_differences:
for section, diff_text in data_differences.items():
print(f"\n=== {section.upper()} DIFFERENCES ===")
print(diff_text)

sys.exit(1)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Compare two HTML reports.")
parser.add_argument("report1", help="Path to the first HTML report")
parser.add_argument("report2", help="Path to the second HTML report")

args = parser.parse_args()
read_and_compare_html_files(args.report1, args.report2)
16 changes: 13 additions & 3 deletions migtests/scripts/run-validate-assessment-report.sh
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,22 @@ main() {
# Checking if the assessment reports were created
if [ -f "${EXPORT_DIR}/assessment/reports/migration_assessment_report.html" ] && [ -f "${EXPORT_DIR}/assessment/reports/migration_assessment_report.json" ]; then
echo "Assessment reports created successfully."

echo "Checking for Failures"
validate_failure_reasoning "${EXPORT_DIR}/assessment/reports/migration_assessment_report.json"

echo "Comparing Report contents"
expected_file="${TEST_DIR}/expectedAssessmentReport.json"
actual_file="${EXPORT_DIR}/assessment/reports/migration_assessment_report.json"
compare_json_reports ${expected_file} ${actual_file}

echo "Comparing JSON report"
expected_json_file="${TEST_DIR}/expectedAssessmentReport.json"
actual_json_file="${EXPORT_DIR}/assessment/reports/migration_assessment_report.json"
compare_json_reports ${expected_json_file} ${actual_json_file}

echo "Comparing HTML report"
expected_html_file="${TEST_DIR}/expectedAssessmentReport.html"
actual_html_file="${EXPORT_DIR}/assessment/reports/migration_assessment_report.html"
${SCRIPTS}/compare-html-reports.py ${expected_html_file} ${actual_html_file}

else
echo "Error: Assessment reports were not created successfully."
cat_log_file "yb-voyager-assess-migration.log"
Expand Down
Loading

0 comments on commit 1cd7f89

Please sign in to comment.