-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBFS.js
139 lines (113 loc) · 5.03 KB
/
BFS.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
const { chromium } = require('playwright');
const cheerio = require('cheerio');
const fs = require('fs');
const path = require('path');
const crypto = require('crypto'); // For hashing filenames
const url = require('url');
// Seed URL: the first (and initially only) entry placed in the crawl queue.
const websiteUrl = 'https://www.tu-sofia.bg/';
// Root directory for everything written to disk (downloaded files and extracted page text).
const outputDir = './output';
// Hostname of the seed URL; the crawler only visits and queues URLs whose hostname equals this exactly.
const baseDomain = new URL(websiteUrl).hostname; // Extract the base domain
// Case-insensitive alternation of hostnames that must never be queued. The pattern is
// unanchored, so it matches anywhere inside the hostname it is tested against.
const ignoredDomainsRegex = /facebook\.com|linkedin\.com|youtube\.com|focus-news\.net|novini\.bg|sliveninfo\.bg|utroruse\.com|trafficnews\.bg|pressoffice\.tu-sofia\.bg|career\.tu-sofia\.bg|digilib\.nalis\.bg|proceedings\.tu-sofia\.bg|sopkoni\.tu-sofia\.bg|elara\.tu-sofia\.bg|design\.tu-sofia\.bg|otsk-nk\.tu-sofia\.bg|rcvt\.tu-sofia\.bg|e-university\.tu-sofia\.bg|ef-conference\.tu-sofia\.bg|infotech-bg\.com|bultrans\.org|metrology-bg\.org|konkursi-as\.tu-sofia\.bg|google\.com/i;
/**
 * Breadth-first crawler entry point.
 *
 * Starting from `websiteUrl`, every page on `baseDomain` is opened in a
 * Playwright Chromium page. The page's body text is written to
 * `<outputDir>/<hostname>/<md5(path + query)>/index.txt` and its same-host
 * links are queued. Any request whose URL *path* ends in a known file
 * extension is intercepted, fetched separately, saved to
 * `<outputDir>/<hostname>/<md5(url)><ext>`, and then aborted so the browser
 * never renders it.
 *
 * The browser is always closed, even when the crawl fails, and a fatal error
 * is reported with a non-zero exit code instead of an unhandled rejection.
 */
(async () => {
  // Extensions that mark a request as a file to download rather than a page.
  const fileExtensions = new Set([
    '.pdf', '.avi', '.mp4', '.jpg', '.png', '.zip', '.rar', '.doc', '.docx', '.xls', '.xlsx',
  ]);
  const maxRetries = 3;

  /**
   * Lower-cased extension of the URL's path component.
   * Using the path (not the raw URL) keeps `file.pdf?x=1` recognised and stops
   * `page.php?f=a.pdf` from being mistaken for a PDF.
   * @param {string} rawUrl - Absolute URL.
   * @returns {string} Extension including the dot, or '' when there is none.
   */
  const getUrlExtension = (rawUrl) => {
    try {
      return path.extname(new URL(rawUrl).pathname).toLowerCase();
    } catch (error) {
      return '';
    }
  };

  /**
   * Parses a (possibly relative) URL and drops its fragment, so that
   * `/a#x` and `/a#y` are recognised as the same page.
   * @param {string} rawUrl - URL or href to parse.
   * @param {string} [base] - Base URL used to resolve relative input.
   * @returns {URL} Parsed URL without fragment.
   * @throws {TypeError} When the input cannot be parsed as a URL.
   */
  const parseWithoutFragment = (rawUrl, base) => {
    const parsed = new URL(rawUrl, base);
    parsed.hash = '';
    return parsed;
  };

  const browser = await chromium.launch();
  try {
    const page = await browser.newPage();

    // Intercept network requests to handle file downloads
    await page.route('**/*', async (route) => {
      const request = route.request();
      const fileUrl = request.url();
      const extension = getUrlExtension(fileUrl);

      // Continue navigation for HTML pages
      if (!fileExtensions.has(extension)) {
        return route.continue();
      }

      console.log(`Downloading file from ${fileUrl}`);
      let buffer;
      for (let attempt = 1; attempt <= maxRetries; attempt += 1) {
        try {
          // Fetch the file content
          const response = await page.request.fetch(request);
          if (!response.ok()) {
            // An error page must not be stored under the file's name.
            console.log(`Skipping ${fileUrl} - HTTP ${response.status()}`);
            break;
          }
          buffer = await response.body();
          break; // Exit loop if successful
        } catch (error) {
          console.log(`Failed to download ${fileUrl}. Retry ${attempt}/${maxRetries}`);
          if (attempt === maxRetries) {
            console.log(`Skipping ${fileUrl} after ${maxRetries} retries.`);
          }
        }
      }

      if (buffer) {
        try {
          // Hash the file URL to avoid long filenames
          const hash = crypto.createHash('md5').update(fileUrl).digest('hex');
          const directory = path.join(outputDir, new URL(fileUrl).hostname);
          // Ensure directory exists
          fs.mkdirSync(directory, { recursive: true });
          fs.writeFileSync(path.join(directory, `${hash}${extension}`), buffer);
        } catch (error) {
          // A disk error must not leave the intercepted request unresolved.
          console.log(`Failed to save ${fileUrl}: ${error.message}`);
        }
      }

      // The file is never rendered by the browser, whether or not it was saved.
      return route.abort();
    });

    // URLs are marked as seen when they are enqueued, so the queue never
    // holds duplicates and every URL is visited at most once.
    const seedUrl = parseWithoutFragment(websiteUrl).href;
    const seenUrls = new Set([seedUrl]);
    const queue = [seedUrl];
    let head = 0; // Index of the next URL to visit; avoids O(n) Array#shift.

    while (head < queue.length) {
      const currentPageUrl = queue[head];
      head += 1;
      console.log(`Crawling ${currentPageUrl}`);

      try {
        const currentUrlObj = new URL(currentPageUrl);
        // Check if the URL belongs to the base domain
        if (currentUrlObj.hostname !== baseDomain) {
          console.log(`Skipping ${currentPageUrl} - Outside of base domain`);
          continue; // Skip URLs outside of the base domain
        }

        await page.goto(currentPageUrl, { timeout: 60000 });
        // Wait for the page to be fully loaded
        await page.waitForLoadState('networkidle');

        // Extract the content safely
        let html;
        try {
          html = await page.content();
        } catch (error) {
          console.log(`Error retrieving content for ${currentPageUrl}: ${error.message}`);
          continue; // Skip to the next URL
        }

        const $ = cheerio.load(html);

        // Extract text content
        const textContent = $('body').text().trim();
        // Hash path + query so pages differing only by query string do not
        // overwrite each other (identical to the path-only hash when there is no query).
        const hash = crypto
          .createHash('md5')
          .update(`${currentUrlObj.pathname}${currentUrlObj.search}`)
          .digest('hex');
        const textFilePath = path.join(outputDir, currentUrlObj.hostname, hash, 'index.txt');
        fs.mkdirSync(path.dirname(textFilePath), { recursive: true });
        fs.writeFileSync(textFilePath, textContent);

        // Relative links resolve against the document's final URL, which
        // differs from the requested URL after a redirect.
        const documentUrl = page.url();

        // Find new links to crawl and add them to the queue for BFS
        $('a').each((index, element) => {
          const href = $(element).attr('href');
          if (!href) {
            return;
          }

          let link;
          try {
            link = parseWithoutFragment(href, documentUrl);
          } catch (error) {
            // A single malformed href must not discard the page's other links.
            return;
          }

          const isHttp = link.protocol === 'http:' || link.protocol === 'https:';
          // Skip ignored domains and URLs outside the base domain
          if (!isHttp || link.hostname !== baseDomain || ignoredDomainsRegex.test(link.hostname)) {
            return;
          }

          if (!seenUrls.has(link.href)) {
            seenUrls.add(link.href);
            queue.push(link.href);
          }
        });
      } catch (error) {
        console.log(`Error loading or processing ${currentPageUrl}: ${error.message}`);
      }
    }
  } finally {
    await browser.close();
  }
})().catch((error) => {
  console.error('Crawler failed:', error);
  process.exitCode = 1;
});