-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathyomiscraper.js
132 lines (122 loc) · 3.67 KB
/
yomiscraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
var fs = require('fs');
var jsdom = require("jsdom");
var request = require('request');
var url = require('url');
// Shared crawl state, read and written by getLinks() and the process handlers below.

// Every nifty URL found so far; de-duplicated and written to output.txt on exit.
var collectedLinks = [];
// Number of the most recently requested results page; pre-incremented (++i) before each getLinks() call.
var i = 0;
// Highest results page known so far; raised when a crawled page links to a larger "page=" number,
// and compared with i to decide when the crawl is complete.
var maxPage = 2;
/**
 * Fetches one page of the gameofserch.com "nifty" search results, records the
 * nifty links found on it in the shared `collectedLinks` array, and then
 * continues with the next page until the shared counter `i` reaches `maxPage`.
 *
 * Side effects: appends to `collectedLinks`, may raise `maxPage`, advances `i`
 * when it schedules the next page, and calls `process.exit(1)` on any
 * response or error it does not know how to handle.
 *
 * @param {number} startingNumber - Number of the results page to fetch.
 */
function getLinks(startingNumber)
{
    const RETRY_DELAY_MS = 10000;
    const MIN_REQUEST_DELAY_MS = 275;
    const MAX_REQUEST_DELAY_MS = 350;
    // A "page=" link above this is treated as a sign the site returned something unexpected.
    const HIGHEST_PLAUSIBLE_PAGE = 6;
    const PAGE_PARAM = /page=([0-9]+)/;

    // Moves this crawl chain on to the next page, or ends it once every known page was requested.
    function crawlNextPage()
    {
        // Several chains advance the shared counter concurrently, so `i` can
        // step past `maxPage`; `>=` (rather than `==`) guarantees each chain stops.
        if (i >= maxPage)
        {
            console.log("Crawl complete");
        }
        else getLinks(++i);
    }

    // Requests URL after a short random delay and hands a 200 response to callback(finalUrl, response).
    function crawlURL(URL, callback)
    {
        const retryLater = () => setTimeout(() => crawlURL(URL, callback), RETRY_DELAY_MS);

        setTimeout(() => {
            // Must stay a `function` (not an arrow): the URL that was fetched is
            // read from the callback's `this` via `this.uri.href`.
            request(URL, function (err, response) {
                if (err != null)
                {
                    if (err.code === "ETIMEDOUT")
                    {
                        console.log(this.uri.href, "timed out. Retrying in 10 seconds...");
                        retryLater();
                    }
                    else if (err.code === "ENOTFOUND")
                    {
                        console.log(this.uri.href, "not found");
                        crawlNextPage();
                    }
                    else
                    {
                        console.log("Unknown error:\n", err, "\nStopping at", startingNumber);
                        process.exit(1);
                    }
                    return;
                }

                const href = this.uri.href;
                const status = response.statusCode;
                if (href.includes(".nifty.com"))
                {
                    // The request itself ended on a nifty page: record it, nothing to parse.
                    collectedLinks.push(href);
                }
                else if (status === 200)
                {
                    callback(href, response);
                }
                else if (status >= 500)
                {
                    // Every 5xx answer (including plain 500) is treated as transient and retried.
                    console.log(href, "returned", status, response.statusMessage, "Retrying in 10 seconds...");
                    retryLater();
                }
                else if (status === 404 || status === 403)
                {
                    console.log(href, "returned", status, response.statusMessage);
                    crawlNextPage();
                }
                else
                {
                    console.log("Unknown error with", href, "\n", status, response.statusMessage, "\nStopping at", startingNumber);
                    process.exit(1);
                }
            });
        }, Math.random() * (MAX_REQUEST_DELAY_MS - MIN_REQUEST_DELAY_MS) + MIN_REQUEST_DELAY_MS);
    }

    const pageUrl = "http://gameofserch.com/y.cgi?page=" + startingNumber + "&mode=search&sort=time_new&word=nifty&engine=pre&search_kt=014_001-b_all&search_day=&use_str=&method=and";

    crawlURL(pageUrl, function processData(finalUrl, response)
    {
        jsdom.env({html: response.body, url: finalUrl, done: function (err, window) {
            if (!window)
            {
                // Without a window there is no document to scan; stop explicitly
                // instead of failing later with a TypeError.
                console.log("Unknown error with", finalUrl, "\n", err, "\nStopping at", startingNumber);
                process.exit(1);
                return;
            }

            const links = window.document.links;
            for (let l = 0; l < links.length; l++)
            {
                const href = links[l].href;
                // Outbound links carry their real target in a "url" query parameter.
                const urlQuery = url.parse(href, true).query["url"];
                if (urlQuery && urlQuery.includes("nifty"))
                {
                    collectedLinks.push(urlQuery);
                    continue;
                }

                // Pagination links reveal how many result pages exist.
                const pageMatch = PAGE_PARAM.exec(href);
                if (pageMatch)
                {
                    const pageNumber = parseInt(pageMatch[1], 10);
                    if (pageNumber > HIGHEST_PLAUSIBLE_PAGE)
                    {
                        console.log("wierdness at", finalUrl);
                        process.exit(1);
                    }
                    maxPage = Math.max(maxPage, pageNumber);
                }
            }

            // Release the DOM built for this page before moving on.
            window.close();
            console.log("Crawled", finalUrl);
            crawlNextPage();
        }});
    });
}
// Start four crawl chains, one immediately and the rest staggered by a random
// 275-350 ms pause, each beginning at the next page number.
(function myLoop (remaining) {
    getLinks(++i);
    const pause = Math.random() * (350 - 275) + 275;
    setTimeout(() => {
        remaining -= 1;
        if (remaining !== 0) {
            myLoop(remaining);
        }
    }, pause);
})(4);
// On every exit (normal end, abort, or error) de-duplicate the collected links
// and write them to output.txt, one per line. Only synchronous work is done
// here; the process terminates on its own once this handler returns, so no
// further process.exit() call is needed.
process.on('exit', function () {
    // Keep the last occurrence of each link, in its original relative order,
    // using a Set so the pass is linear in the number of links.
    const seen = new Set();
    const uniqueLinks = [];
    for (let idx = collectedLinks.length - 1; idx >= 0; idx--) {
        const link = collectedLinks[idx];
        if (!seen.has(link)) {
            seen.add(link);
            uniqueLinks.push(link);
        }
    }
    collectedLinks = uniqueLinks.reverse();

    console.log(collectedLinks.length,"Nifty links found");
    console.log("Writing to output");
    fs.writeFileSync("output.txt", collectedLinks.join("\r\n"));
    console.log("Done");
});
// Ctrl+C: report the abort and exit with status 0.
process.on('SIGINT', () => {
    console.log("Crawling aborted");
    process.exit(0);
});
// Last-resort handler: print any error nothing else caught, then exit with status 1.
process.on('uncaughtException', (err) => {
    console.log(err);
    process.exit(1);
});