Skip to content

Commit

Permalink
fix(crawler/fl): add fix for the latest markup shenanigans
Browse files (browse the repository at this point in the history)
  • Loading branch information
kkkrist committed Mar 2, 2021
1 parent 0ba8d83 commit fc33afb
Showing 1 changed file with 9 additions and 5 deletions.
14 changes: 9 additions & 5 deletions packages/crawler/lib/fl.js
Original file line number | Diff line number | Diff line change
Expand Up @@ -5,7 +5,10 @@ const JSDOM = require('jsdom').JSDOM
const fetchOptions = require('./fetch-options.json')

// Matches a string made up solely of three digit groups separated by literal
// dots (anchored at both ends); each group is captured separately.
// Presumably day.month.year — confirm against the code that consumes it.
const rDate = /^([0-9]+)\.([0-9]+)\.([0-9]+)$/
// NOTE(review): `rDeaths` is declared twice below with identical patterns —
// once on a single line and once split across lines. This looks like the
// before/after of a formatting-only change in this diff view; the real file
// can contain only one of the two declarations.
const rDeaths = [/verstorben:?\s+([0-9.]+)/i, /([0-9.]+)[\D]+Verst(?:or|ro)?ben/i]
// Two alternative case-insensitive patterns; each captures a run of digits
// and dots (presumably a count with "." as thousands separator — confirm).
const rDeaths = [
// Label first: "verstorben", optional colon, whitespace, then the number.
/verstorben:?\s+([0-9.]+)/i,
// Number first: the number, one or more non-digit characters, then "Verst",
// an optional "or" or "ro", and "ben" — so the transposed spelling
// "Verstroben" matches as well as "Verstorben".
/([0-9.]+)[\D]+Verst(?:or|ro)?ben/i
]
// Two alternative patterns capturing a run of digits and dots: the first is
// case-insensitive and expects the label "Positive gesamt" (optional colon)
// before the number; the second is case-sensitive (no `i` flag) and expects
// the number, then non-digit characters, then "Infizierte".
const rInfected = [/Positive gesamt:?\s+([0-9.]+)/i, /([0-9.]+)[\D]+Infizierte/]
const rQuarantined = [
/Quarantänefälle:?\s+([0-9.]+)/i,
Expand Down Expand Up @@ -76,7 +79,7 @@ const getRecord = el => {

const reducer = (acc, el) => {
if (
el.nodeName !== 'P' &&
!['P', 'UL'].includes(el.nodeName) &&
el.childElementCount > 0 &&
![...el.children].every(({ tagName }) => tagName === 'BR')
) {
Expand Down Expand Up @@ -105,9 +108,10 @@ module.exports = () =>
.then(res => res.text())
.then(text => {
const dom = new JSDOM(text.replace(/<\/?(span|strong).*?>/gi, ''))
const containerMarkup = dom.window.document
.querySelector('.einleitung + div > div > .toggler_container')
.innerHTML.replace(/<\/?(div).*?>/gi, '')
return [
...dom.window.document.querySelector(
'.einleitung + div > div > .toggler_container'
).childNodes
...new JSDOM(containerMarkup).window.document.body.childNodes
].reduce(reducer, [])
})

0 comments on commit fc33afb

Please sign in to comment.