-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrap.tsx
168 lines (137 loc) · 7.91 KB
/
scrap.tsx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import puppeteer from 'puppeteer';
import fs from 'fs';
/** Describes one news article to scrape and how its section is rendered in the output file. */
interface NewsSource {
  /** Ordinal used in the progress logs ("first", "second", ...). */
  ordinal: string;
  /** Article URL to navigate to. */
  url: string;
  /** CSS selector matching the article's paragraph elements. */
  selector: string;
  /** Number of leading paragraphs copied into the output. */
  paragraphCount: number;
  /** Builds the text appended to the output file from the extracted paragraphs. */
  render: (paragraphs: readonly string[]) => string;
  /** Arguments passed to `console.log` once the section has been written. */
  successLog: readonly string[];
}

/** File that receives the scraped sections. */
const OUTPUT_FILE = 'posts.txt';

/** Text appended to the output file (and logged) when a site cannot be scraped. */
const SCRAPE_ERROR_MESSAGE =
  '\n An Unknown error occured! the posts are not written or partially written, please retry running the code it may be fixed';

/** Resource types that are aborted instead of downloaded; only the page text is needed. */
const BLOCKED_RESOURCE_TYPES = new Set<string>(['image', 'stylesheet', 'font']);

/**
 * Chromium command-line switches passed to `puppeteer.launch`.
 *
 * NOTE(review): '--disable-web-security', '--ignore-certificate-errors' and
 * '--no-sandbox' weaken browser security. They are kept because the original
 * script used them; confirm they are really required for the target sites.
 */
const CHROMIUM_ARGS: readonly string[] = [
  '--disable-web-security', // Disable web security to bypass CORS
  '--disable-gpu', // Disable GPU acceleration
  '--no-sandbox', // Disable sandbox mode
  '--disable-setuid-sandbox', // Disable setuid sandbox
  '--disable-dev-shm-usage', // Disable /dev/shm usage
  '--disable-accelerated-2d-canvas', // Disable 2D canvas acceleration
  '--disable-gl-drawing-for-tests', // Disable GL drawing for tests
  '--disable-accelerated-video-decode', // Disable accelerated video decode
  '--disable-accelerated-mjpeg-decode', // Disable accelerated MJPEG decode
  '--disable-accelerated-webgl', // Disable accelerated WebGL
  '--disable-accelerated-overflow-scroll', // Disable accelerated overflow scroll
  '--disable-threaded-animation', // Disable threaded animation
  '--disable-threaded-scrolling', // Disable threaded scrolling
  '--disable-breakpad', // Disable crash reporting
  '--ignore-certificate-errors', // Ignore certificate errors
  '--ignore-certificate-errors-spki-list', // Ignore certificate errors SPKI list
  '--disable-infobars', // Disable infobars
  '--disable-popup-blocking', // Disable popup blocking
  '--disable-translate', // Disable translation
  '--disable-extensions', // Disable extensions
  '--disable-default-apps', // Disable default apps
  '--disable-prompt-on-repost', // Disable prompt on repost
  '--disable-background-networking', // Disable background networking
  '--disable-background-timer-throttling', // Disable background timer throttling
  '--disable-backgrounding-occluded-windows', // Disable backgrounding occluded windows
  '--disable-renderer-backgrounding', // Disable renderer backgrounding
  '--disable-webgl', // Disable WebGL
  '--disable-webgl2', // Disable WebGL 2.0
  '--disable-client-side-phishing-detection', // Disable client-side phishing detection
  '--disable-es3-apis', // Disable ES3 APIs
  '--disable-es3-gl-context', // Disable ES3 GL context
  '--mute-audio', // Mute audio
];

/** The articles to scrape, in the order their sections appear in the output file. */
const NEWS_SOURCES: readonly NewsSource[] = [
  {
    ordinal: 'first',
    url: 'https://www.ethiopianreporter.com/128616/',
    selector: '.tdb-block-inner.td-fix-index p',
    paragraphCount: 3,
    render: (paragraphs) =>
      `# ስለ ፡ ደስታ ፡ ሮቦት ፡ ሚዲያዎች ፡ ምን ፡ አሉ? \n # ሪፖርተር (ethiopianreporter.com) ከራሱ ፡ ዌብሳይት ፡ እንደሚከተለው ፡ ዘግቦታል ፦ \n ${paragraphs.join('\n')} \n => To read from their own website, please use this link: https://www.ethiopianreporter.com/128616/`,
    successLog: ['reporter posts added successfully! continuing to the next one...'],
  },
  {
    ordinal: 'second',
    url: 'https://www.fanabc.com/archives/242013',
    selector: '.entry-content.clearfix.single-post-content p',
    paragraphCount: 5,
    render: (paragraphs) =>
      `\n\n\n# ፋና ብሮድካስቲንግ ኮርኮሬት (fanabc.com) ከራሱ ፡ ዌብሳይት ፡ እንደሚከተለው ፡ ዘግቦታል ፦ \n${paragraphs.join('\n')}\n => To read from their own website, please use this link: https://www.fanabc.com/archives/242013`,
    successLog: ['fanabc posts added successfully! continuing to the next one...'],
  },
  {
    ordinal: 'third',
    // The original navigated to the fanabc.com article a second time here even
    // though the section is attributed to, and links to, press.et.
    // NOTE(review): the '.entry-content p' selector is assumed to match the
    // press.et article markup — confirm against the live page.
    url: 'https://press.et/?p=125525',
    selector: '.entry-content p',
    paragraphCount: 3,
    render: (paragraphs) =>
      `\n\n\n# ኢትዮጵያን ፕረስ (press.et) ከራሱ ፡ ዌብሳይት ፡ እንደሚከተለው ፡ ዘግቦታል ፦ ${paragraphs.join('\n')}\n => To read from their own website, please use this link: https://press.et/?p=125525 \n\n`,
    successLog: [
      'press posts added successfully!',
      'All relevant posts about the Robot fetched from different websites on the internet, open the text file now to read the posts.',
    ],
  },
];

/**
 * Scrapes the leading paragraphs of each article in `NEWS_SOURCES` with a
 * headless browser and writes them, one section per site, to `posts.txt`.
 *
 * The output file is emptied before scraping starts so that text from an
 * earlier run never mixes with the new one. A failure on one site (navigation
 * error, too few matching paragraphs, ...) is logged, recorded in the file
 * with a generic error message, and does not stop the remaining sites.
 *
 * @returns A promise that resolves once the file is written and the browser
 *   is closed. It rejects if the browser cannot be launched or set up, or if
 *   writing the output file fails.
 */
async function main(): Promise<void> {
  const browser = await puppeteer.launch({
    headless: true,
    defaultViewport: null, // Override default viewport
    args: [...CHROMIUM_ARGS],
  });

  try {
    const page = await browser.newPage();
    console.log('going there..');

    // Skip images, stylesheets and fonts: only the text content is extracted.
    await page.setRequestInterception(true);
    page.on('request', (request) => {
      const handled = BLOCKED_RESOURCE_TYPES.has(request.resourceType())
        ? request.abort()
        : request.continue();
      // abort()/continue() return promises; report a rejection instead of
      // leaving it unhandled.
      handled.catch((error: unknown) => {
        console.error('request interception failed:', error);
      });
    });

    // Start from an empty file; every section below is appended.
    fs.writeFileSync(OUTPUT_FILE, '');

    for (const source of NEWS_SOURCES) {
      console.log(`going to the ${source.ordinal} website ..`);
      try {
        await page.goto(source.url);
        console.log(`Fetching the data from the ${source.ordinal} website...`);

        // The callback runs inside the page, so it must not touch variables
        // of this Node process; inputs are passed as arguments and the result
        // is returned as plain strings.
        const paragraphs = await page.evaluate(
          (selector: string, limit: number) =>
            Array.from(document.querySelectorAll(selector))
              .slice(0, limit)
              .map((node) => node.textContent ?? ''),
          source.selector,
          source.paragraphCount,
        );

        if (paragraphs.length < source.paragraphCount) {
          throw new Error(
            `expected ${source.paragraphCount} paragraphs matching "${source.selector}" but found ${paragraphs.length}`,
          );
        }

        fs.appendFileSync(OUTPUT_FILE, source.render(paragraphs));
        console.log(...source.successLog);
      } catch (error: unknown) {
        console.error(`Failed to scrape ${source.url}:`, error);
        fs.appendFileSync(OUTPUT_FILE, SCRAPE_ERROR_MESSAGE);
        console.log(SCRAPE_ERROR_MESSAGE);
      }
    }

    fs.appendFileSync(OUTPUT_FILE, '\nContributor:\nJemal Muhammed\n');
    console.log('Done');
  } finally {
    // Always release the Chromium process, even when scraping failed.
    await browser.close();
  }
}
// Script entry point: run the scraper and report any failure instead of
// leaving the returned promise's rejection unhandled.
main().catch((error: unknown) => {
  console.error('Scraper failed:', error);
  process.exitCode = 1;
});