Skip to content

Commit

Permalink
Merge pull request #75 from qld-gov-au/QOLDEV-1015-ckan-2.11
Browse files: browse the repository at this point in the history
[QOLDEV-1015] update Frictionless
  • Loading branch information
ThrawnCA authored Jan 14, 2025
2 parents 34fc858 + a1e1ba0 commit 9c3a575
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 49 deletions.
54 changes: 6 additions & 48 deletions ckanext/qa/sniff_format.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# encoding: utf-8

from collections import defaultdict
import csv
import logging
import os
import re
import magic
import six
import subprocess
import tempfile
import xlrd
import zipfile

Expand Down Expand Up @@ -222,62 +222,20 @@ def is_psv(buf, **kwargs):
return _is_spreadsheet(buf, 'PSV', '|')


def _messytables_extract_row_lengths(buf, format_, delimiter=None):
    """Count the cells in each non-empty sampled row, using messytables.

    :param buf: the content to inspect (passed through six.ensure_binary)
    :param format_: format name, used only in the log message
    :param delimiter: optional cell delimiter; when falsy, CSVTableSet is
        constructed without one
    :returns: a list with one cell count per non-empty sampled row, or
        None if messytables raises ReadError
    """
    import messytables
    stream = six.BytesIO(six.ensure_binary(buf))
    # Only forward the delimiter when the caller actually supplied one
    extra_kwargs = {'delimiter': delimiter} if delimiter else {}
    table_set = messytables.CSVTableSet(stream, **extra_kwargs)
    try:
        first_table = table_set.tables[0]
        # Walk the table's sample rather than the whole table, because
        # otherwise it will barf if there is an unclosed string at the end.
        # Empty rows are left out of the result.
        return [len(cells) for cells in first_table.sample if cells]
    except messytables.ReadError as e:
        log.info('Not %s - unable to parse as a table: %s', format_, e)
        return None


def _frictionless_extract_row_lengths(buf, format_, delimiter=None):
    """Count the cells in each row, using frictionless.extract.

    The buffer is written to a temporary file, which frictionless then
    reads by name as CSV. The temporary file is removed afterwards,
    whether or not extraction succeeded.

    :param buf: the content to inspect (passed through six.ensure_binary)
    :param format_: format name, used only in the log message
    :param delimiter: optional cell delimiter; when truthy it is passed
        to frictionless via a Dialect descriptor
    :returns: a list with one cell count per extracted row, or None if
        frictionless raises FrictionlessException
    """
    import frictionless
    resource_kwargs = {"format": "csv"}
    row_lengths = []
    if delimiter:
        dialect = frictionless.Dialect(descriptor={"delimiter": delimiter})
        resource_kwargs['dialect'] = dialect
    try:
        # TODO: Use 'delete_on_close' once we can guarantee Python 3.12+
        with tempfile.NamedTemporaryFile(delete=False) as tmpfile:
            try:
                tmpfile.write(six.ensure_binary(buf))
                # Close before frictionless opens the file by name
                tmpfile.close()
                rows = frictionless.extract(tmpfile.name, **resource_kwargs)
                log.debug("Found [%s] row(s) in buffer", len(rows))
            finally:
                # delete=False means we are responsible for cleanup
                os.remove(tmpfile.name)
        for row in rows:
            row_lengths.append(len(row))
        return row_lengths
    except frictionless.exception.FrictionlessException as e:
        log.info('Not %s - unable to parse as a table: %s', format_, e)
        return None


def _extract_row_lengths(buf, format_, delimiter=None):
    """Count the cells in each row, parsing the buffer with the csv module.

    :param buf: the content to inspect, as text or bytes
    :param format_: format name, used only in the log message
    :param delimiter: optional cell delimiter; defaults to ',' when falsy
    :returns: a list with one cell count per parsed row, or None if the
        csv module raises csv.Error
    """
    if six.PY3 and isinstance(buf, bytes):
        # io.StringIO rejects bytes on Python 3. The encoding is not known
        # here, so decode as UTF-8 and replace undecodable bytes rather
        # than fail; only the cell counts are used.
        # NOTE(review): confirm whether callers ever pass bytes.
        buf = buf.decode('utf-8', errors='replace')
    try:
        reader = csv.reader(six.StringIO(buf), delimiter=delimiter or ',')
        return [len(row) for row in reader]
    except csv.Error as e:
        log.info('Not %s - unable to parse as a table: %s', format_, e)
        return None


def _is_spreadsheet(buf, format_, delimiter=None):
if toolkit.check_ckan_version('2.10'):
row_lengths = _frictionless_extract_row_lengths(buf, format_, delimiter)
else:
row_lengths = _messytables_extract_row_lengths(buf, format_, delimiter)
row_lengths = _extract_row_lengths(buf, format_, delimiter)
if not row_lengths:
return False

Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
xlrd==1.2.0
frictionless==4.40.11
progressbar==2.5
six>=1.12.0 #in ckancore

Expand Down

0 comments on commit 9c3a575

Please sign in to comment.