Skip to content

Commit

Permalink
Broader XRechnung support added
Browse files Browse the repository at this point in the history
smalot\pdfparser, which is used by
horstoeko\zugferd\ZugferdDocumentPdfReader::readAndGuessFromFile('...'), fails to extract the XML from some PDF documents. A workaround was added as proposed in [smalot/pdfparser issue #740](https://github.com/smalot/pdfparser/issues/740#issuecomment-2576466886).
  • Loading branch information
SourcePot committed Jan 22, 2025
1 parent 856073c commit f8584c4
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 34 deletions.
1 change: 0 additions & 1 deletion src/php/Foundation/Filespace.php
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,6 @@ public function file2entry(string $file,array $entry,bool $noUpdateButCreateIfMi
// analyse pdf if any parser is selected
if (!empty($entry['parserMethod'])){
$entry=$this->oc['SourcePot\Datapool\Tools\PdfTools']->attachments2arrSmalot($file,$entry);
$entry=$this->oc['SourcePot\Datapool\Tools\ZUGFeRD']->file2entry($file,$entry);
}
// update entry
$entry=$this->oc['SourcePot\Datapool\Foundation\Database']->updateEntry($entry,$isSystemCall,$noUpdateButCreateIfMissing);
Expand Down
4 changes: 2 additions & 2 deletions src/php/Tools/FileContent.php
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ private function addUnycom(array $entry,string $text):array
{
$entry['UNYCOM']=array();
$pList=$fList=array();
preg_match_all(\SourcePot\Datapool\Tools\MiscTools::UNYCOM_REGEX,$text,$matches,PREG_OFFSET_CAPTURE);
preg_match_all(\SourcePot\Datapool\Tools\MiscTools::UNYCOM_REGEX,$value=str_replace('-',' ',$text),$matches,PREG_OFFSET_CAPTURE);
foreach($matches[0] as $match){
$prefix=substr($text,$match[1]-10,10);
$prefixComps=preg_split('/[^A-Za-z0-9 ]+/',$prefix);
Expand All @@ -73,7 +73,7 @@ private function addUnycom(array $entry,string $text):array
} else {
$prefix='';
}
$case=substr($text,intval($match[1]),17);
$case=substr($text,intval($match[1]),19);
$unycomArr=$this->oc['SourcePot\Datapool\Tools\MiscTools']->convert2unycom($case,$prefix);
$pList[]=$unycomArr['Reference'];
$fList[]=$unycomArr['Family'];
Expand Down
5 changes: 3 additions & 2 deletions src/php/Tools/MiscTools.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
final class MiscTools{

//public const UNYCOM_REGEX='/([0-9]{4})([XPEFMR]{1,2})([0-9]{5})([A-Z ]{0,4})([0-9 ]{0,3})/u';
public const UNYCOM_REGEX='/([0-9]{4})(\s{0,1}[XPEFMR]{1,2})([0-9]{5})([A-Z ]{0,5})([0-9]{0,2}\s{0,1})/u';
public const UNYCOM_REGEX='/([0-9]{4})([ ]{0,1}[XPEFMR]{1,2})([0-9]{5})([A-Z ]{0,5})([0-9]{0,2})\s/u';

public $emojis=array();
private $emojiFile='';
Expand Down Expand Up @@ -1032,9 +1032,10 @@ public function convert2unycomByKey($value,string $key='Country'):string
public function convert2unycom($value,$prefix=''):array
{
$value=strval($value);
$value=str_replace('-',' ',$value);
$keyTemplate=array('Match','Year','Type','Number');
$regions=array('WO'=>'PCT','WE'=>'Euro-PCT','EP'=>'European patent','EU'=>'Unitary Patent','AP'=>'ARIPO patent','EA'=>'Eurasian patent','OA'=>'OAPI patent');
preg_match(\SourcePot\Datapool\Tools\MiscTools::UNYCOM_REGEX,$value,$matches);
preg_match(self::UNYCOM_REGEX,$value,$matches);
if (empty($matches[0])){
return array('Match'=>'','Year'=>'9999','Type'=>'Q','Number'=>'99999','Region'=>'XX','Country'=>'XX','Part'=>'99','isValid'=>FALSE);
}
Expand Down
48 changes: 20 additions & 28 deletions src/php/Tools/PdfTools.php
Original file line number Diff line number Diff line change
Expand Up @@ -149,43 +149,35 @@ public function attachments2arrSmalot($file,array $entry=array()):array
{
$pathinfo=pathinfo($file);
$context=array('class'=>__CLASS__,'function'=>__FUNCTION__,'file'=>$pathinfo['basename'],'fileName'=>$pathinfo['filename'],'fileExtension'=>$pathinfo['extension'],'attachments'=>0,'embedded'=>0);

$pdfParser= new \Smalot\PdfParser\Parser();
$pdfContent=$pdfContent = file_get_contents($file);
try {
$context['attachmentsFailed']=array();
$pdfParsed = $pdfParser->parseContent($pdfContent);
$filespecIndex=0;
$filespecs = $pdfParsed->getObjectsByType('Filespec');
foreach ($filespecs as $filespec){
$context['Filespec'][$filespecIndex]=$filespec->getDetails();
$context['attachmentsFailed'][$filespecIndex]=$context['Filespec'][$filespecIndex]['F'];
$filespecIndex++;
}
$embededIndex=0;
$embeddedFiles = $pdfParsed->getObjectsByType('EmbeddedFile');
foreach ($embeddedFiles as $embeddedFile) {
$newEntry=$entry;
$newEntry['fileName']=preg_replace('/[^a-zäüößA-ZÄÜÖ0-9\.]+/','_',$context['Filespec'][$embededIndex]['F']);
$newEntry['fileContent']=$embeddedFile->getContent();
if (!empty($newEntry['fileContent'])){
if (strpos($newEntry['fileContent'],'rsm:CrossIndustryInvoice')!==FALSE){
$context['attachmentFailed']=$context['attachmentsFailed'][$embededIndex];
$this->oc['logger']->log('info','Method "{class} → {function}()" found embedded XRechnung-file "{attachmentFailed}" in "{file}". No additional entry was created, instead the file content will be added to the entry Content-key.',$context);
unset($context['attachmentsFailed'][$embededIndex]);
$embededIndex++;
continue;
$fileDetails=$filespec->getDetails();
if($filespec->getHeader()->has('EF') && $filespec->getHeader()->get('EF')->has('F')) {
$context['embeddedFileName']=$fileDetails['F'];
$embeddedFileContent=$filespec->getHeader()->get('EF')->get('F')->getContent();
if (!empty($embeddedFileContent)){
if (stripos($embeddedFileContent,'rsm:CrossIndustryInvoice')!==FALSE){
// XRechnung
$entry=$this->oc['SourcePot\Datapool\Tools\ZUGFeRD']->xmlString2entry($embeddedFileContent,$entry);
$this->oc['logger']->log('info','Method "{class} → {function}()" found embedded XRechnung-file "{embeddedFileName}" in "{file}". No additional entry was created, instead the file content will be added to the entry Content-key.',$context);
} else {
// misc embedded file
$newEntry=$entry;
$newEntry['fileName']=preg_replace('/[^a-zäüößA-ZÄÜÖ0-9\.]+/','_',$context['embeddedFileName']);
$newEntry['fileContent']=$embeddedFileContent;
$newEntry['Name']=$pathinfo['basename'].' ['.$newEntry['fileName'].']';
$newEntry=$this->oc['SourcePot\Datapool\Tools\MiscTools']->addEntryId($newEntry,array('Source','Group','Folder','Name'),'0','',FALSE);
$this->oc['SourcePot\Datapool\Foundation\Filespace']->fileContent2entry($newEntry);
}
} else {
$this->oc['logger']->log('notice','Method "{class} → {function}()" found empty embedded file "{embeddedFileName}" in "{file}". No additional entry was created.',$context);
}
$newEntry['Name']=$pathinfo['basename'].' ['.$newEntry['fileName'].']';
$newEntry=$this->oc['SourcePot\Datapool\Tools\MiscTools']->addEntryId($newEntry,array('Source','Group','Folder','Name'),'0','',FALSE);
$this->oc['SourcePot\Datapool\Foundation\Filespace']->fileContent2entry($newEntry);
}
unset($context['attachmentsFailed'][$embededIndex]);
$embededIndex++;
}
if ($embededIndex<$filespecIndex){
$context['attachmentsFailed']=implode(' | ',$context['attachmentsFailed']);
$this->oc['logger']->log('error','Method "{class} &rarr; {function}()" failed to extract embedded files "{attachmentsFailed}" from "{file}"',$context);
}
} catch (\Exception $e) {
$context['error']=$e->getMessage();
Expand Down
2 changes: 1 addition & 1 deletion src/php/Tools/ZUGFeRD.php
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public function file2entry(string $file, array $entry):array
{
$pathinfo=pathinfo($file);
if (!is_file($file)){

// nothing to do
} else if ($pathinfo['extension']==='pdf'){
$entry=$this->pdf2entry($file,$entry,$pathinfo);
} else if ($pathinfo['extension']==='xml'){
Expand Down

0 comments on commit f8584c4

Please sign in to comment.