-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhtml2json.js
executable file
·131 lines (129 loc) · 4.22 KB
/
html2json.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/node
// Web framework and JSON body parsing for the HTTP API.
const express = require('express');
const bodyParser = require('body-parser');
// `request` is rebound to a copy whose default headers carry a desktop Chrome User-Agent.
let request=require("request");
request = request.defaults({headers:{"User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"}});
const jsdom = require("jsdom");
const fs = require("fs");
const zlib = require('zlib');
// The jQuery source is read synchronously at load time and later passed to jsdom as a script to inject.
const jquery = fs.readFileSync("./lib/jquery.min.js", "utf-8");
const purl = require('./lib/purl.js');
const app = express();
// JSON body parser used by POST /request; configured with a 100MB body limit.
// NOTE(review): `extended` is also passed here — confirm bodyParser.json() actually honours that option.
const jsonParser = bodyParser.json({ limit:'100MB' ,extended: false });
// Character-set conversion for the raw upstream bodies.
const iconv=require("iconv-lite");
// Options shared by every res.sendFile() call: served paths are resolved against ./public.
const fileOptions = {
  root: "./public",
};
// Landing page: answer GET / with index.html, resolved through fileOptions.
app.get('/', (req, res) => {
  res.sendFile('index.html', fileOptions);
});
// Serve one file from the `ext/` folder; the file name is taken from the `:file` route parameter.
app.get('/ext/:file', (req, res) => {
  const { file } = req.params;
  res.sendFile(`ext/${file}`, fileOptions);
});
/**
 * POST /request?url=<target>&selector=<selector>
 *
 * Forwards the parsed JSON body of the incoming request to `url` with an HTTP
 * POST and passes the upstream reply to proccessResponse(), which writes the
 * JSON answer. Every failure path ends the response with
 * {"status":"error","message":...} so the client is never left waiting.
 */
app.post('/request', jsonParser, function (req, res) {
  res.set('Content-Type', 'application/json');

  // Single exit for all failures; skipped if an answer has already gone out.
  const fail = (message) => {
    if (res.headersSent) {
      return;
    }
    res.end(JSON.stringify({ status: "error", message: message }));
  };

  try {
    const query = purl("http://localhost" + req._parsedUrl.path).data.param.query;
    const target = query["url"];
    if (target == null || target === "") {
      fail("Missing required query parameter: url");
      return;
    }

    // One options object: target, raw Buffer body (encoding:null) and the JSON payload.
    // Passing the payload as a second object argument would put the options
    // object into the `uri` slot and lose the target URL.
    const options = {
      uri: decodeURIComponent(target),
      encoding: null,
      json: req.body,
    };

    request.post(options, (e, r, b) => {
      if (e) {
        const isDnsFailure =
          (e.code === 'EAI_AGAIN' || e.code === 'ENOTFOUND') && e.syscall === 'getaddrinfo';
        // Any other transport error is reported too instead of leaving the request open.
        fail(isDnsFailure ? e.hostname + " could not be resolved!" : e.toString());
        return;
      }
      try {
        // A throw here happens inside the async callback, out of reach of the outer try.
        proccessResponse(res, b, r.headers, query["selector"]);
      } catch (er) {
        fail(er.toString());
      }
    });
  } catch (er) {
    fail(er.toString());
  }
});
/**
 * GET /request?url=<target>&selector=<selector>
 *
 * Fetches `url` with an HTTP GET and passes the upstream reply to
 * proccessResponse(), which writes the JSON answer. Every failure path ends the
 * response with {"status":"error","message":...} so the client is never left
 * waiting.
 */
app.get('/request', function (req, res) {
  res.set('Content-Type', 'application/json');

  // Single exit for all failures; skipped if an answer has already gone out.
  const fail = (message) => {
    if (res.headersSent) {
      return;
    }
    res.end(JSON.stringify({ status: "error", message: message }));
  };

  try {
    const query = purl("http://localhost" + req._parsedUrl.path).data.param.query;
    const target = query["url"];
    if (target == null || target === "") {
      fail("Missing required query parameter: url");
      return;
    }

    // encoding:null keeps the body as a Buffer so it can be gunzipped/decoded later.
    const options = {
      uri: decodeURIComponent(target),
      encoding: null,
    };

    request.get(options, (e, r, b) => {
      if (e) {
        const isDnsFailure =
          (e.code === 'EAI_AGAIN' || e.code === 'ENOTFOUND') && e.syscall === 'getaddrinfo';
        // Any other transport error is reported too instead of leaving the request open.
        fail(isDnsFailure ? e.hostname + " could not be resolved!" : e.toString());
        return;
      }
      try {
        // A throw here happens inside the async callback, out of reach of the outer try.
        proccessResponse(res, b, r.headers, query["selector"]);
      } catch (er) {
        fail(er.toString());
      }
    });
  } catch (er) {
    fail(er.toString());
  }
});
/**
 * Decodes an upstream HTTP body, loads it into jsdom together with jQuery and
 * finishes `res` with a JSON description of the selected elements.
 *
 * The answer is {"status":"success","response":...} or
 * {"status":"error","message":...}; this function does not throw for a bad
 * body, a missing header or a failed DOM build.
 *
 * @param {object} res        Response object; finished with a JSON string via res.end().
 * @param {Buffer} html       Raw upstream body.
 * @param {object} headers    Upstream response headers, read through lower-case keys.
 * @param {string} [selector] jQuery selector; when empty or missing, `head` and
 *                            `body` are returned under response.head / response.body.
 */
function proccessResponse(res, html, headers, selector) {
  const response = {};
  const upstreamHeaders = headers || {};

  // Reports a failure in the same {status, message} shape used for selector errors.
  const fail = (er) => {
    response.status = "error";
    response.message = String(er);
    res.end(JSON.stringify(response));
  };

  let markup;
  try {
    let raw = html;
    if (upstreamHeaders["content-encoding"] === 'gzip') {
      raw = zlib.gunzipSync(raw);
    }
    // Honour whatever charset the Content-Type declares (with or without a space
    // after ';'), as long as iconv knows it; otherwise fall back to UTF-8.
    const contentType = String(upstreamHeaders['content-type'] || '');
    const declared = /charset\s*=\s*"?([^\s;"]+)/i.exec(contentType);
    const charset = declared !== null && iconv.encodingExists(declared[1]) ? declared[1] : "utf-8";
    markup = iconv.decode(raw, charset);
  } catch (er) {
    fail(er);
    return;
  }

  jsdom.env({
    html: markup,
    src: [jquery],
    done: (err, window) => {
      // Only a missing window / jQuery is fatal; err alone does not abort.
      if (!window || typeof window.$ !== 'function') {
        fail(err || "jsdom did not provide a window");
        return;
      }
      const $ = window.$;

      // Converts one DOM element into {attr, tagName, children?}.
      function parseElement(node) {
        const element = { attr: {} };
        for (let i = 0; i < node.attributes.length; i++) {
          const attribute = node.attributes[i];
          element.attr[attribute.name] = attribute.value;
        }
        element.tagName = node.tagName;
        if (node.childNodes.length > 0) {
          element.children = [];
          for (let i = 0; i < node.childNodes.length; i++) {
            const child = node.childNodes[i];
            let entry = {};
            if (child.nodeType === 3) {
              entry.type = "text";
              entry.value = child.wholeText;
            } else if (child.nodeType === 1) {
              // Tag after the recursive call so the marker is not overwritten.
              entry = parseElement(child);
              entry.type = "block";
            }
            // Other node types (e.g. comments) stay as an empty placeholder object.
            element.children.push(entry);
          }
        }
        return element;
      }

      // Returns the parsed form of every element matched by the selector.
      function html2json(sel) {
        const matches = [];
        $(sel).each((index, el) => {
          matches.push(parseElement(el));
        });
        return matches;
      }

      try {
        if (selector != null && selector.length > 0) {
          response.response = html2json(selector);
        } else {
          response.response = {};
          response.response.head = html2json("head");
          response.response.body = html2json("body");
        }
        response.status = "success";
      } catch (er) {
        response.status = "error";
        response.message = er.toString();
      } finally {
        // Release the jsdom window; otherwise every request keeps its DOM alive.
        if (typeof window.close === 'function') {
          window.close();
        }
      }
      res.end(JSON.stringify(response));
    },
  });
}
// Start the HTTP server on localhost:1608 and log the address it actually bound to.
const server = app.listen(1608, "localhost", () => {
  const { address: host, port } = server.address();
  console.log("HTML2JSON listening at http://%s:%s", host, port);
});