Skip to content

Commit

Permalink
Merge pull request #897 from basedosdados/staging/fix_cno
Browse files Browse the repository at this point in the history
[BugFix] br_rf_cno
  • Loading branch information
folhesgabriel authored Nov 22, 2024
2 parents d73e56d + 1eca79e commit c0802b5
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 15 deletions.
4 changes: 2 additions & 2 deletions pipelines/datasets/br_rf_cno/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@

class constants(Enum):

URL = "http://dadosabertos.rfb.gov.br/CNO/cno.zip"
URL_FTP = "http://dadosabertos.rfb.gov.br/CNO"
URL = "https://arquivos.receitafederal.gov.br/dados/cno/cno.zip"
URL_FTP = "https://arquivos.receitafederal.gov.br/dados/cno/"

TABLES_RENAME = {
'cno.csv': 'microdados',
Expand Down
58 changes: 45 additions & 13 deletions pipelines/datasets/br_rf_cno/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,39 +19,71 @@
from pipelines.utils.utils import log


#NOTE: O crawler falhará se o nome do arquivo mudar.
@task
def check_need_for_update(url: str) -> str:
"""
Checks the need for an update by extracting the most recent update date from the CNO FTP.
Checks the need for an update by extracting the most recent update date for 'cno.zip' from the CNO FTP.
Args:
url (str): The URL of the CNO FTP site.
Returns:
str: The date of the last update in the original source in 'YYYY-MM-DD' format.
str: The date of the last update in 'YYYY-MM-DD' format.
Raises:
requests.HTTPError: If there is an HTTP error when making the request.
ValueError: If the last update date is not found in the URL.
ValueError: If the file 'cno.zip' is not found in the URL.
"""
log('---- Extracting most recent update date from CNO FTP')
response = requests.get(url)


response = requests.get(url)
if response.status_code != 200:
raise requests.HTTPError(f"HTTP error occurred: Status code {response.status_code}")


soup = BeautifulSoup(response.content, 'html.parser')
element = soup.select_one('table tr:nth-of-type(4) td:nth-of-type(3)')
rows = soup.find_all('tr')


max_file_date = None

# A lógica é simples: processa cada 'table data' (td) de cada linha 'tr'
for row in rows:
cells = row.find_all('td')


if len(cells) < 4:
continue


link = cells[1].find('a')
if not link:
continue

name = link.get_text(strip=True)
if name != "cno.zip":
continue


date = cells[2].get_text(strip=True)
max_file_date = datetime.strptime(date, "%Y-%m-%d %H:%M").strftime("%Y-%m-%d")
break

if not max_file_date:
raise ValueError("File 'cno.zip' not found on the FTP site. Check the api endpoint: https://arquivos.receitafederal.gov.br/dados/cno/ to see folder structure or file name has changed")

log(f"---- Most recent update date for 'cno.zip': {max_file_date}")

return max_file_date






if element:
date_text = element.get_text(strip=True)
date_obj = datetime.strptime(date_text, "%Y-%m-%d %H:%M")
formatted_date = date_obj.strftime("%Y-%m-%d")
log(f'---- The last update in original source occurred in: {formatted_date}')
return formatted_date

else:
raise ValueError(f"The Last update data was not found in --- URL {url}. The website HTML code might have changed")


@task
Expand Down

0 comments on commit c0802b5

Please sign in to comment.