Skip to content
This repository has been archived by the owner on Jan 4, 2023. It is now read-only.

Commit

Permalink
Major rewrite to update robots_txt for new metrics. (#236)
Browse files Browse the repository at this point in the history
* Major rewrite to update for new metrics.

Discussed in comment here: HTTPArchive/almanac.httparchive.org#2351 (comment).  The linter is going to complain and I will fix.  I stink at JS.

* Debugged errors and formatting.

* Addressed comment issue and tested more thoroughly

* Updated comment cleaning and capture
  • Loading branch information
jroakes authored Oct 14, 2021
1 parent d7c8884 commit 084c813
Showing 1 changed file with 132 additions and 20 deletions.
152 changes: 132 additions & 20 deletions custom_metrics/robots_txt.js
Original file line number Diff line number Diff line change
@@ -1,42 +1,154 @@
//[robots_txt]

function fetchWithTimeout(url) {
// Extracts status, size, overall record count, and record counts respective to user-agents.

/*
Example Output:
{
"redirected": false,
"status": 200,
"size": 2279,
"size_kib": 2.2255859375,
"over_google_limit": false,
"comment_count": 19,
"record_counts": {
"by_type": {
"sitemap": 0,
"user_agent": 1,
"allow": 32,
"disallow": 36,
"crawl_delay": 1,
"noindex": 0,
"other": 0
},
"by_useragent": {
"*": {
"allow": 32,
"disallow": 36,
"crawl_delay": 1,
"noindex": 0,
"other": 0
}
}
}
}
*/

/**
 * Fetch `url`, aborting the request if it has not settled within 5 seconds.
 * @param {string} url - URL to fetch (here always '/robots.txt').
 * @returns {Promise<Response>} the fetch() promise; rejects with an
 *   AbortError if the 5 s timeout fires first.
 */
const fetchWithTimeout = (url) => {
  const controller = new AbortController();
  // Timer is deliberately not cleared: if the body is still streaming past
  // 5 s total, the abort cancels it too (same behavior callers rely on).
  setTimeout(() => {
    controller.abort();
  }, 5000);
  return fetch(url, {signal: controller.signal});
}

// Canonical robots.txt record types (as they appear in the file, lowercase)
// mapped to the snake_case keys used in the output JSON's
// record_counts.by_type object. Frozen: this is a shared, read-only lookup
// table and nothing should ever mutate it.
const RECORD_COUNT_TYPES = Object.freeze({
  'sitemap': 'sitemap',
  'user-agent': 'user_agent',
  'allow': 'allow',
  'disallow': 'disallow',
  'crawl-delay': 'crawl_delay',
  'noindex': 'noindex',
  'other': 'other'
});

// Subset of record types that are attributed to a specific user-agent group
// (everything except 'sitemap' and 'user-agent' itself), mapped to the
// snake_case keys used in record_counts.by_useragent. Frozen read-only table.
const BY_USERAGENT_TYPES = Object.freeze({
  'allow': 'allow',
  'disallow': 'disallow',
  'crawl-delay': 'crawl_delay',
  'noindex': 'noindex',
  'other': 'other'
});

/**
 * Parse robots.txt text into an array of record objects.
 *
 * Each entry is { record_type, record_value }: record_type is one of the
 * RECORD_COUNT_TYPES keys matched in the line (lowercased), or 'other' when
 * no known type followed by a colon is found; record_value is the trimmed
 * text after the first ':' (or the whole cleaned line for 'other').
 *
 * @param {string} text - raw robots.txt body.
 * @returns {Array<{record_type: string, record_value: string}>}
 */
const parseRecords = (text) => {
  // Strip comments (whole-line and trailing), lowercase everything, then
  // split into non-empty lines.
  const cleanLines = (r) => r.replace(/(\s+|^\s*)#.*$/gm, '').trim().toLowerCase();
  const splitOnLines = (r) => r.split(/[\r\n]+/g).filter((e) => e.length > 0);
  const lines = splitOnLines(cleanLines(text));

  // Match any known record type only when it is followed by a colon
  // (lookahead), e.g. "disallow:" but not a bare "disallow" path segment.
  const rec_types = Object.keys(RECORD_COUNT_TYPES).join('|');
  const regex = new RegExp(`(${rec_types})(?=\\s*:)`, 'gi');

  // `lines` is already a real array, so map it directly
  // (the original `[].map.call(lines, ...)` detour is unnecessary).
  return lines.map((line) => {
    const rec_match = line.match(regex);
    if (rec_match) {
      return {
        record_type: rec_match[0].trim(),
        // Value is everything after the FIRST colon on the line.
        record_value: line.slice(line.indexOf(':') + 1).trim()
      };
    }
    // Unrecognized directive: keep the whole cleaned line as the value.
    return {
      record_type: 'other',
      record_value: line
    };
  });
}

return fetchWithTimeout('/robots.txt')
.then(r => {
let result = {};
result.redirected = !!r.redirected;
result.status = r.status;
return r.text().then(t => {

// Overall Metrics
result.size = t.length;
result.comment_lines = t.match(/^\s*#\s*(.*)$/gm)?.length ?? 0;
result.allow_lines = t.match(/^\s*allow\s*:\s*(.*?)\s*$/gmi)?.length ?? 0;
result.disallow_lines = t.match(/^\s*disallow\s*:\s*(.*?)\s*$/gmi)?.length ?? 0;
result.size_kib = t.length / 1024;
result.over_google_limit = result.size_kib > 500;
result.comment_count = t.match(/(\s+|^\s*)#.*$/gm)?.length ?? 0;
result.record_counts = {};

let userAgentMatches = t.matchAll(/^\s*user-agent\s*:\s*(.*?)\s*$/gmi);
if (userAgentMatches) {
result.user_agents = [];
// Parse Records to clean objects
const records = parseRecords(t);

for (const match of userAgentMatches) {
let c = match[1];
result.user_agents.push(c);
}
// Record counts by type of record
result.record_counts.by_type = {};
for (let rec_type of Object.keys(RECORD_COUNT_TYPES)) {
result.record_counts.by_type[RECORD_COUNT_TYPES[rec_type]] = records.filter((e)=>e['record_type'] == rec_type).length;
}

let sitemapMatches = t.matchAll(/^\s*sitemap\s*:\s*(.*?)\s*$/gmi);
if (sitemapMatches) {
result.sitemaps = [];

for (const match of sitemapMatches) {
let c = match[1];
result.sitemaps.push(c);
}

// Record counts by user-agent
counts_by_useragent = {};
var applies_to_useragent = [];
var last = null;

for (let record of records) {

if (record.record_type == 'user-agent') {

// If empty build
if (!(record.record_value in counts_by_useragent)) {
counts_by_useragent[record.record_value] = Object.values(BY_USERAGENT_TYPES).reduce((a,v)=>({
...a,
[v]: 0
}), {});
}

// If prior record UA, append to list, else create list of 1.
if (last == 'user-agent') {
applies_to_useragent.push(record.record_value);
} else {
applies_to_useragent = [record.record_value];
}

} else if (record.record_type in BY_USERAGENT_TYPES) {
for (ua of applies_to_useragent) {
counts_by_useragent[ua][BY_USERAGENT_TYPES[record.record_type]] += 1;
}

}

last = record.record_type;

}

result.record_counts.by_useragent = counts_by_useragent;

return JSON.stringify(result);
});
})
Expand Down

0 comments on commit 084c813

Please sign in to comment.