Skip file conversion if file type is not supported #456

Merged · 3 commits · Oct 1, 2020
haystack/preprocessor/utils.py: 87 changes (60 additions, 27 deletions)
@@ -4,14 +4,17 @@
import tempfile
import zipfile
from pathlib import Path
from typing import Callable, List, Optional, Tuple
from typing import Callable, Dict, List, Optional, Tuple, Union
import json

from farm.data_handler.utils import http_get

from haystack.file_converter.base import BaseConverter
from haystack.file_converter.docx import DocxToTextConverter
from haystack.file_converter.pdf import PDFToTextConverter
from haystack.file_converter.tika import TikaConverter
from haystack import Document, Label
from haystack.file_converter.txt import TextConverter

logger = logging.getLogger(__name__)

@@ -56,7 +59,7 @@ def eval_data_from_file(filename: str) -> Tuple[List[Document], List[Label]]:
offset_start_in_doc=answer["answer_start"],
no_answer=qa["is_impossible"],
origin="gold_label",
)
)
labels.append(label)
else:
label = Label(
@@ -73,9 +76,10 @@ def eval_data_from_file(filename: str) -> Tuple[List[Document], List[Label]]:
    return docs, labels


def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False) -> List[dict]:
def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False) -> \
        List[dict]:
    """
    Convert all files(.txt, .pdf) in the sub-directories of the given path to Python dicts that can be written to a
    Convert all files(.txt, .pdf, .docx) in the sub-directories of the given path to Python dicts that can be written to a
    Document Store.

    :param dir_path: path for the documents to be written to the DocumentStore
@@ -86,32 +90,46 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None,
    """

    file_paths = [p for p in Path(dir_path).glob("**/*")]
    if ".pdf" in [p.suffix.lower() for p in file_paths]:
        pdf_converter = PDFToTextConverter() # type: Optional[PDFToTextConverter]
    else:
        pdf_converter = None
    allowed_suffixes = [".pdf", ".txt", ".docx"]
    suffix2converter: Dict[str, BaseConverter] = {}

    documents = []
    suffix2paths: Dict[str, List[Path]] = {}
    for path in file_paths:
        if path.suffix.lower() == ".txt":
            with open(path) as doc:
                text = doc.read()
        elif path.suffix.lower() == ".pdf" and pdf_converter:
            document = pdf_converter.convert(path)
        file_suffix = path.suffix.lower()
        if file_suffix in allowed_suffixes:
            if file_suffix not in suffix2paths:
                suffix2paths[file_suffix] = []
            suffix2paths[file_suffix].append(path)
        elif not path.is_dir():
            logger.warning('Skipped file {0} as type {1} is not supported here. '
                           'See haystack.file_converter for support of more file types'.format(path, file_suffix))

    # No need to initialize converter if file type not present
    for file_suffix in suffix2paths.keys():
        if file_suffix == ".pdf":
            suffix2converter[file_suffix] = PDFToTextConverter()
        if file_suffix == ".txt":
            suffix2converter[file_suffix] = TextConverter()
        if file_suffix == ".docx":
            suffix2converter[file_suffix] = DocxToTextConverter()

    documents = []
    for suffix, paths in suffix2paths.items():
        for path in paths:
            logger.info('Converting {}'.format(path))
            document = suffix2converter[suffix].convert(file_path=path, meta=None)
            text = document["text"]
        else:
            raise Exception(f"Indexing of {path.suffix} files is not currently supported.")

        if clean_func:
            text = clean_func(text)
            if clean_func:
                text = clean_func(text)

        if split_paragraphs:
            for para in text.split("\n\n"):
                if not para.strip(): # skip empty paragraphs
                    continue
                documents.append({"text": para, "meta": {"name": path.name}})
        else:
            documents.append({"text": text, "meta": {"name": path.name}})
            if split_paragraphs:
                for para in text.split("\n\n"):
                    if not para.strip(): # skip empty paragraphs
                        continue
                    documents.append({"text": para, "meta": {"name": path.name}})
            else:
                documents.append({"text": text, "meta": {"name": path.name}})

    return documents
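
With this change, files whose suffix is not in allowed_suffixes are skipped with a warning instead of aborting the whole run with an exception, and .docx files are handled via DocxToTextConverter. A minimal usage sketch; the "data" directory and its mixed contents are hypothetical:

from haystack.preprocessor.utils import convert_files_to_dicts
from haystack.preprocessor.cleaning import clean_wiki_text

# "data" is a hypothetical folder mixing .txt, .pdf, .docx and, say, .md files.
# Supported types are converted; the .md file is logged and skipped.
dicts = convert_files_to_dicts(dir_path="data", clean_func=clean_wiki_text, split_paragraphs=True)
print(f"{len(dicts)} dicts ready to be written to a DocumentStore")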

@@ -127,17 +145,30 @@ def tika_convert_files_to_dicts(
    Convert all files(.txt, .pdf) in the sub-directories of the given path to Python dicts that can be written to a
    Document Store.

    :param merge_lowercase: allow conversion of merged paragraph to lowercase
    :param merge_short: allow merging of short paragraphs
    :param dir_path: path for the documents to be written to the DocumentStore
    :param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
    :param split_paragraphs: split text in paragraphs.

    :return: None
    """
    converter = TikaConverter(remove_header_footer=True)
    file_paths = [p for p in Path(dir_path).glob("**/*")]
    paths = [p for p in Path(dir_path).glob("**/*")]
    allowed_suffixes = [".pdf", ".txt"]
    file_paths: List[Path] = []

    for path in paths:
        file_suffix = path.suffix.lower()
        if file_suffix in allowed_suffixes:
            file_paths.append(path)
        elif not path.is_dir():
            logger.warning('Skipped file {0} as type {1} is not supported here. '
                           'See haystack.file_converter for support of more file types'.format(path, file_suffix))

    documents = []
    for path in file_paths:
        logger.info('Converting {}'.format(path))
        document = converter.convert(path)
        meta = document["meta"] or {}
        meta["name"] = path.name
@@ -222,6 +253,8 @@ def fetch_archive_from_http(url: str, output_dir: str, proxies: Optional[dict] =
            elif url[-7:] == ".tar.gz":
                tar_archive = tarfile.open(temp_file.name)
                tar_archive.extractall(output_dir)
            else:
                logger.warning('Skipped url {0} as file type is not supported here. '
                               'See haystack documentation for support of more file types'.format(url))
        # temp_file gets deleted here
        return True
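
fetch_archive_from_http now logs a warning for URLs that are neither .zip nor .tar.gz, where it previously fell through silently. A usage sketch with a hypothetical URL:

from haystack.preprocessor.utils import fetch_archive_from_http

# .zip and .tar.gz archives are downloaded and extracted into output_dir;
# any other suffix is skipped with a warning (the temporary download is discarded either way).
fetch_archive_from_http(url="https://example.com/corpus.tar.gz", output_dir="data/corpus")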

test/test_utils.py: 13 changes (13 additions, 0 deletions)
@@ -0,0 +1,13 @@
from haystack.preprocessor import utils
from haystack.preprocessor.cleaning import clean_wiki_text


def test_convert_files_to_dicts(xpdf_fixture):
    documents = utils.convert_files_to_dicts(dir_path="samples", clean_func=clean_wiki_text, split_paragraphs=True)
    assert documents and len(documents) > 0


def test_tika_convert_files_to_dicts(tika_fixture):
    documents = utils.tika_convert_files_to_dicts(dir_path="samples", clean_func=clean_wiki_text, split_paragraphs=True)
    assert documents and len(documents) > 0
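
Both tests exercise only the happy path over the samples directory. A possible follow-up test for the new skip-and-warn behaviour, not part of this PR, could look like the sketch below; it reuses the utils import above and assumes pytest's tmp_path and caplog fixtures:

import logging


def test_skip_unsupported_file_type(tmp_path, caplog):
    # a hypothetical unsupported file; only .txt, .pdf and .docx are converted
    (tmp_path / "notes.md").write_text("# not a supported file type")
    with caplog.at_level(logging.WARNING):
        documents = utils.convert_files_to_dicts(dir_path=str(tmp_path))
    assert documents == []
    assert "not supported" in caplog.text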