index.ts
import type { Element } from 'domhandler';
import { selectAll } from 'css-select';
import { getAttributeValue, getText } from 'domutils';
import { chain, allowRegex, ignoreDoubles } from 'crawler-ts/src';
import { createCrawler, allowHtml, allowProtocols } from 'crawler-ts-htmlparser2/src';

async function main() {
  // Matches Hacker News listing pages and captures the page number from "?p=:page"
  const hackerNewsPageRegex = /\/news\.ycombinator\.com\/news\?p=(\d+)/;

  const allowUrlRegex = allowRegex<URL>((url) => url.href);

  // Find the "?p=:page" piece in the URL and use it as a key to detect duplicates
  const ignorePageDoubles = ignoreDoubles<URL>((url) => {
    const match = url.href.match(hackerNewsPageRegex);
    const pageId = match?.[1];
    return pageId ?? '';
  });
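  // For example, "https://news.ycombinator.com/news?p=2" produces the key "2",
  // while URLs without a "?p=" query string produce an empty key.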
  // Only parse responses served as text/html
  const shouldParse = allowHtml();

  // Only queue links that
  // - use the http or https protocol
  // - point to a Hacker News news page
  // - have not been visited before
  const shouldQueue = chain(
    allowProtocols(['http', 'https']),
    // Allow news pages
    allowUrlRegex([hackerNewsPageRegex]),
    // Ignore already visited pages
    ignorePageDoubles(),
  );
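  // `chain` combines the filters above into a single predicate: a link is only
  // queued when every filter in the chain allows it.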
  const crawler = createCrawler({
    shouldParse,
    shouldQueue,
    // Yield every page that was visited and parsed
    shouldYield: () => true,
  });

  const root = new URL('https://news.ycombinator.com/news');
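  // The crawler is an async generator: starting from `root` it yields each
  // visited page as a `{ location, parsed }` pair, where `parsed` is the
  // document produced by htmlparser2.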
  for await (const { location, parsed } of crawler(root)) {
    // Do something with the crawled result, e.g. extract the story links
    const titleElements = selectAll('a.storylink', parsed);
    const titles = titleElements.map((e) => ({
      value: getText(e),
      href: getAttributeValue(e as Element, 'href'),
    }));

    // Log all titles with their links
    titles.forEach((title) => console.log(title.href, title.value));
  }
}

main().catch((error) => console.error(error));
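// To run this example locally (assuming ts-node is installed):
//   npx ts-node index.ts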