From 026f849769e0d888b6ea9000856be5eacd35b280 Mon Sep 17 00:00:00 2001 From: Jake Stockwin Date: Tue, 23 Jun 2020 13:45:51 +0100 Subject: [PATCH] [loaders] Include text from figures when all_texts=True Closes #98 --- docs/source/example_files/figure.pdf | Bin 0 -> 8956 bytes .../examples/extracting_text_from_figures.rst | 45 ++++++++++++++++++ docs/source/examples/index.rst | 2 + py_pdf_parser/loaders.py | 13 ++++- tests/data/image.pdf | Bin 0 -> 8956 bytes .../test_extracting_text_from_figures.py | 24 ++++++++++ tests/test_loaders.py | 18 +++++++ 7 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 docs/source/example_files/figure.pdf create mode 100644 docs/source/examples/extracting_text_from_figures.rst create mode 100644 tests/data/image.pdf create mode 100644 tests/test_doc_examples/test_extracting_text_from_figures.py diff --git a/docs/source/example_files/figure.pdf b/docs/source/example_files/figure.pdf new file mode 100644 index 0000000000000000000000000000000000000000..bd5a95ec54574bda93ed0128ba367fd327d78d04 GIT binary patch literal 8956 zcma)hWl){XmUTEdL4t;$2X{L-L4rf@g9f*QI|qkg!5xA-1b26L2zqe$1cC*3Chx8J zYTo(oojdcVYj-`}{nTDvd)Hd4X_O_TSU6a@(P(zhj&IR8*n#Xodt)m!K|wYtOB*P} ziA~DJ2nvyanAn>_*yJI0=1>a`AU8KJ&woyUoP2!z!oq0IP$!6yEt-4A93O_(SM}B( z-lzN*<02dAoR26ML+sr-DE`HvgQms|6%ik48|!mqDj_-FAV0+#15#K{3R6TP%1hFR zxk0dZl4E)G+-2!Ab?YuwI@2y4a#9s@`T>1DU7}6dZEjw6lg`->tIqrHH#|0+Cm!6^ zFFJn^i=k4`f~EcNo*i60X;I&Lq4PKsFf)`91ykXRS)2yoW3`Sbk?@B$hgKICx4MnY za4-?zQc}oYZ+N|ohHu@qKK#C0$_s+*(XQB<$F56BMC*M)fho2GztIYD*v_ND5++IM zM(jOCiRtQc5?wm;v?k}7pn+g%))0+@NlrxF_w|HvsN({wOK9o zmg?T&9)YA#3?%AgV>5a@;jU9xwR!QDWXWM^7(6nv3UYXjq9{x^~}si_%y~RYQY)`b%t-V`);-q z;Hy4roC`R{0Xrm0M^u^tpQc6%**RMf%yXl>Lv@uEcc&#>6NtVJNHNJ)s8NlJeP{Tb zWYeb|wqtJjS}0_Ov>+zV?i`Q(z;#BfO{I!+*ZG_fPqx7#pSc9EClXXYv3+m*Ee(5J z;Yvb=3Oi(ms_JfQ9?r7;z3zhyixVql8@Fhb!ZqdFIb*K_rh3f_YD=0Qo6^;IpOqFs zY+k`@ifO?tuL{1QTH32FSTO$lIICB|&FFGKq49BL7+-^NQhK?nl!#(de1>_pH~_kR 
z5i2E{>2#oNAt5`bg3QmPWPZ?0#xPu_VlBoX=jYx^1=Jb5Olnf_@v?+!;Pz5wv0hQ^r+#~f*bzUoq5VFkgL~O-h zS`$TkFVD2rlbhSy!W{k9?d%rw%<_H8#&bpBD#5%5_auWP)n3uc?VRWrtuUV&*aB`BVm zQi>8g$c9E+*E~7zY&-z0XZTYN?uq9Xi~Nxrx6>kca{MgS&l>Q&9jG*ZCd+wwx4CoG zbQXiCYa1MsmfJ-Byo&Y0x;ct)3o~tI-rN9f^<^zR_L=?c2@|T@3cP{cAwQjc7HFs2 zi~+Dhdb^f4vBn)#x=$<4*vhtgIc_uk+#K)Q%5onbZQH%LGrBM)I&g<3QQheEN3jONF&j#Po`HFYHqkH7R)qi`oto_=Tg-kyxfEBI^#r;#aEF2rEyGl1I+z=uZ zjgNM-7*Xs>m<;EQpi?9=A_>24ZwyHwmCMoDYvIq$)a(c$@JEQ`Bsgc@oFsT>8Ge{> zlW2U(m@lTm*H6Oh`j`(Z<=$Ob%jMNrLF)@xlPI@QejR{oEu$vCBDd5#SZehcdKYVm zM#juC`g_Fs$I)ROgvU{o_l-m6qAg)uy*!AGhg%GAI%x{t5q@U4HiMGs=|->w zvrY!YEUrGbhpo57F~1E^78liedC3B!T*?oA2(POF6yUC%$ptA4`*!m0klh>?q{$v(`M8+(Yii4+Vb62&Fcn61Cl42dberq{>AIt!-OPsl_cw)aJTm`>^gYy8`Wtr~rt{BerqX>_n;W{gL8ynIC zoD|qaKScZe-m&ra7KtP-J9FeAOt;-M!5`-7CZsb>1zym?PvSWwz=38k1p5#UkO(si zck=ONU_Hh1gsF!+RAUr#_IoA5XF3!#OGB!5{-)&Acj$r)0BgBSKK65sP&17lw;fwKS}K6lHO;iQY(u+-F?Gi;?^6W{{td5A zUaFbxnp;5vB}6Ltxi`FLxfV&|zE9B9L8YI{B2p(z?Zgc7SSj2yX6N+zaG`W?mFg2W z=$ES!GgVh+W>W!}KGUl^gq!dwuj(bIPF`8n$vsUM~!%nhpAVsS?))b;^`(V5I+<6KOf;fcta70at+XbPV{ z-;FGMTMHD4*LH2pSQJ{!sljMuW_}^8!ggU*(vR#q8Rm`9r()FU zDOM=PU)AZc{xI$6IWTqwD-owwtgjQ|)>g-|SuDkF&l2l8f>QB|xW}}0#6E|Erh?LE zO=|WHdsu$8%$J2uRx)PmsW%MnvN7$)Fi-P4HM1!4$;3}%`qTWrcNcGWthf96`K!@a z4$b7UN$y=D47=^ZFmSl=KrB(Jom^9qQ~{r=?&qeBgcRMXD&AS*UkV!EH`U^~Jfq+u z4FQ}Iet;Twz&0?8A3AdIEcg=h)bK3uBIR4wHA${2_Av?kU{Ol6POz7!Xw_udjw(B{ zq`2x|vt$xKb6}AsWX5cVEGWE_>Jb%n!Mo>je7Eg(q2JQA-B8O-OmKpAqwq#uZ6xIE3i(VEgw zm`t@&NSQ~aplW!1M!yd|q%;cD8%J@!Ft(P7D%7Z)n?JO=Y6vDWsVJG(B~~e$)wU|H zsPU;GxXi8w^ik58<3xzHqgpYk+gojP-Hlvd(>)Zc>mENyc+0am9Gw5EGI)j^sA{@~ z!tyk|sTH@PxN%O)cSc+cl1S-I0I6?fX0%*5uqAe4j|Y)LhyxrmYr^H3v9&7-r}Coc zBV(qcQx4Thy$lUs=?O?*orDQ3(7Ilk1(OQCNsZG=(MoLQsKNgg5tfV-g5yH(xHxod zaPv@w>F0C%>h{7w`{F2Us&-GoOMHyX)7YN=>q56J;Wx;?pCLdl)G-`{62~sDB5Rpe-L)WP&g%b&mI-EoNTqF zsDn1w>FXpOy!-7@u?lDt_N_Kc?0LaH(*!CXjv3BirRPM zj)rQ~q|Fg>*1YyjJuHzsC0!38iR7Hb9D2w7Whu|8gmg`CW&01`n+mar($Z1(d33+2 z3`T1A?^xwpT;?-bnQ;3yL+i;bx#p)n|gKyJX{ZFW9G(hrLtKf^JC3O~a7 
z>gVe-?c#4>Yfd2tEHD~aECXp^wBq?8{c|n!YhuKM#R{UM8cD~8?6+gA$|6^FGd*&F zbe77itP06_{ov)xCXzs5QrW@JoI}v(ClZ)5)es3|Ll-2uGZS*O5eGTrE0L9=26ozda7uuo+L)udSc~R6v#%k|$a~k&Oa!KqnPRG?Uv4Rc` zTX_87j=2R@lSt&EJqyW(9ZdN+b>E$!D@lpzJ9T3_AnC%nb>qs=L-7P&>jj;NDH(|w zIjNs(O9$5V%x&78l`AZ!tfs9dAGOz94(1WR{+L-%33e+f8vV3a>ZIv$nBh_>;Fd{O zxNEKAghLuq854p#`3lKR$#s#@$#s!(CQR-)M#ByRG0-FP*YYI#XW4=c8wMfW`m_G_ zu1ncZ_}mv|Y$WF!9Jdxal1n3cLk6|;!or?V9jEgTd?%s&Hu`*>MTXutgG8hP);^|c z^Z+s0bwj5jRy;@VFH{Uk7Ew)>#!|ef)h?@cYkc&H&k|Be3uO4p!e*gs9`-o_J7&f%1ls1BT|7t$@ z8-iuvrOx83q9vgt2#CZeq^o@{J%k7-wV6+`@#)r0>5grWIQv%0yS|^gL>Ce`8h21V z?Tgj=!W0ze*2|(gaN^}|fuhs2Oj-UMbLE46<}=##^7gKju&S#;;z>!BXkCOEbbk~@ zYAWH@QG^An-4;akBErwAb#PqD*sB{OE7oY4ih6<1tlvO*klC2pNF*8-VLcQ)$l7!! zog{m5?C+lpwN5!}dCl3`{QG%(&Hw%Pfz}9vANSeZ^?EaC5IfWV)XsmRzoa(@$Nxct z13{esO?B(ViQ4tDzmdEV4zhoPoJRD`f~VR@MXjOZL-o+`Ym%`qh)t*2aIHVLeGd4^ zJj`Z}F(1T7B94VuNIE@g7?Kxn1vaz|y|$fM3k{Z1`p&t=9a2jVT*+0RS~MG*yAQU8 z4tm!#yXF&~RdDx*7X%RPRQc=PW#)6xfONOJl!{^Ds-F6KTcw*MGm(A73=l^B+4z*~#8S6#~^^`{S6fsX^SKdTf&JP-#`@AA1Ci>(3tWpY!dYK;FMX zVd4MTJ^_Mwc>iXYEFZ@!$n}3f4?exa=Wj&bq2`1sR=%c0A)8(A?*q54^}0fi1D?H$ z4C*4>f3v9BYi^a>$o?= zUW)N08D3IhLChwkl|6ztc`4WTTap+L29HakvScDXTU3PYFK%a(n4J8(D=P>uj_6U>O>l_zKLl(-H-QWj#cX^W{u^X1WHa1&ET7c?tN53N1u$z)! 
zs9c{KZO@wy;Z7Lc!+&E)PUX7%i)-cn1Lpr`M*%r_xc-i(LR1ZAFFU$~Tj%!YoStah zkdL8oaPT?>HlaZTjcC7LUN?%VBuZQhZ4BJ{iMkTfEYn1CMh1OVuUGio;Y8mmo@|Vn zmO$K;oa*3d(p{5gc$vFUT+D`t5tQHJjLFY&2wlM)f$39&&pm%i^jkQcYg#eZO%E;S z`^Y8jK8dXrsiu2Q_=Z=xOpt;J2J$@$uk<&z)s`E*I<|E=Y&yx4-;@7~MEU+NiE?s) z{+_7bw2E~g`>}HFVw>!O&cfd90q`ES*|ED2UC5JU>_}URQtka@zj7 zB_li0qh$BPqkWkNBE01{90E@vZOmQF3NQQ;Yh!j>URQ{Tzr@;BoMe#e7uN&Tic+jtfyCcB}5AG{Z#$gqoyK@be}pX&~8AM`iV^Z)e=aaoDKs`;;O!7l#qYMFzJ=kIqCrQxN9t4Wj{JCrk69BFACJP}#( zs|@Y-r?$%#iW(XoIf_;V^BY4H88ILh!!Q+4CO%z)ESwqm2Bix1Gz3^JTxx5p=UJ>_ zeFLp%J!xz!auJeSdk=jyp>u68S8|zLPrVq=@bbO+9P{vW;)WP1^92PVK#~L7e&|a` z7eS3lj7I)~ z=iPobs?Z#ZENNF}zq)T#XTEh-1#*I<3Nr>2#cwG;c*nlQ#`zQ=PZfip-!(;$Qm*i= zFa7{sJdMJ z@vPZJhvlr3EmERP5#t=gSDv8o!W$JSp$eEPW9EEHiNTh%WKrQkiA@D-=608WwgS@? z6a@J$Q|R)QBC&8iJS@CWto|JmBPC1N#1w4ya1O7a$dYl+aCmk1)SkWcv0fn27cFM` zq0AR>@FaAh5puNru0w#5+5KUn`1o`14Z2#mhBOTf36iFLdNaF|shufRB6YhTaZ6<# zhcRqs%7pW!J?iKKWgG%EwyZc;sOWG=r2R$QUDjq;><&Oznfi9{SF18pJM&D$GcD#@ zEZJ@9j-TF_s*5xx7l)JqgtOK{I zTRy!L<^%FDb13)k4v3F#U>G_p9P4@r-{r|pG-*`tv2;A8y?%Ut+R||_L8LI5OwXX* zr-a{d>&252I)eGU=8-q%S~&*<+KGVqMQdU78bd4`VRJvt)oD)SGl-7IWh|n4T9&HH zH+M;{pPcAhK+DIjaYpow7^9N>-NvId)&B5ld?o<0BCF$tgiqr8a^m5_57MOh2(Z}= z7r4`R6A>oEhq{I%D#MFwv~Op*dVobaPXx?ac(uPaN$$d^uJZ?yW~c~^WeIa8m$1UVHx45rK;N6m=i7s=zgYAcp$qDFgHO zCLudk{C5hjDe#(HFR^m3LtPH<{g2oY0_X;(D$+(zYhA$-$h-i|9Ar z?8~|f=%UU>by&J?@Mn`^xv1g_HkfU?`UXE0C!*OD)NlyR55bJ2Qmbs#rq=BVuB4Q& z&z&#}8E3h<5)_U`6_jMa2pciMAU?1Aqhz5w^P2|TV=t;6Mcw3AHZ-R9HR6#vj_6zq zyg1AL6wdI^$FsBJoqay7X0>fiO^`*GcbWeqLEb3+|6UF(n0O2>BLv+4qZj z)3?A&Zn}VZRM81bR$!?v?po;_o+jVA~z{ zu~cU&-8CV67p>X3dec{otRaUn!jdUhQ;tMvK~!Yj7e2*nm=4X5MQ~_N7?(2e`_7sM zmUxeN_R7mLS6B;hvreGA+}qDIh~70UU+~jY$WDQ`#`|o$)@1Qjm`ROm!lK;t5xhrw zu6n{?wq)w~r_8meV^%R9=cnJjv_)8#X#w>=^nOMteAnH4>nn{cP(d9jcKS+cO>XuF zxlLq0Z~tkY^GMsB|D2<{vP~~0%N=~U&j8DFs;YugLj){e1htlZ_5B@L1waL2k5{T+c5rlMd&WfjH(QioYq@BhTwk681+k8a9 zo2*kEk9#JcsMLjI-qV8B8AAA5Dp42FIP$kd28XbTNY-fu3nqEr56we;NQj21%2MCw8sKZ}xbrZ&oi93X`vq*e~txGRbb-*kr 
zO9G|SWGHLTFBP{IR=%vqT=Uo8vz~SB)zHQ8%R~Cg8;dpNscb1=1s8<)rKP;M`TNnD zB3*!BM#6NiN{P!d57ohh6#CWx1@Oq#P-Jeig5+HT&(<5Eivw=(AnSI_)cDdF3OrSys>x`aT98rW)Wc0Nf!rO27aRB&yLGXiZ;lZwsHqG>jN50;S%i6*~S z&4=4E4@0E=M4jMat+$&DXd>G|ld|){sY^tkZEf7-qdWROIH3_PE^UZR&$Nafkd~6x zd2klf$eE1ya(f*&0>DwmY6%6Uo7LM6Dm~L+klPza)tu=`2P(z;^XG~`%gR6bYhnI@IIeb@!2D)m$QXsx^okS`OnN?zTTdR#@p*j zXqIP`Q8sSv>)J4(Lx(oh9jc0l?l?y~+ilz-D5LnxDZc@S`Xf_F0KWlfMf>3pfK+JA z0rV(-;@=3_Lrv;UbEuICUgcv8pCMpT8B(l=_l;wG%Ek16zv12Qx_+ z-rlu!+zMUn*Z7c=*J9vPZ}1U1ts>jI)q_F4ai(eg^Ip&nMQtSyBu>2rZcw|Y1;G?N z8$~K~Km<@cak-h`qY=)bMb$#TYQ(S@8!vG()akgZG^c~;W|`e-O9TmBa(l(Ax}~d!eoM5OQ1XfVbU{RdR zHc%%2`os73uy|A+A$1N`qfz>-82rjnVh&o|o*k~IFcf_Uj=|&3Z)JseW5hr)PTb%xTD#`cds=n_V|IPQRzqxonzfC(ErV|uMQql9dMbOVxX*B^wA(NLT(=<3|F3Yh zzz{Pwu)RI>AI1ilP1erL9{5+^_!CO#2%2#6m_c|sxs6SExcK>ujZFDXK4Ngae5y21^_e<@l&j)L_u><@{50)=pex?-i~%*{R>FT+BB!lrK?m%#cN+; n?m2pV>qkBek38|uh&V%yoS^Pb5HmDB9v%)f8X8F@DYX9p3#+Wi literal 0 HcmV?d00001 diff --git a/docs/source/examples/extracting_text_from_figures.rst b/docs/source/examples/extracting_text_from_figures.rst new file mode 100644 index 00000000..7a7e138c --- /dev/null +++ b/docs/source/examples/extracting_text_from_figures.rst @@ -0,0 +1,45 @@ +.. _extracting-text-from-figures: + +Extracting Text From Figures +---------------------------- +PDFs are structured documents, and can contain Figures. By default, PDFMiner.six and +hence py-pdf-parser does not extract text from figures. + +You can :download:`download an example here `. In the +example, there is figure which contains a red square, and some text. Below the figure +there is some more text. + +By default, the text in the figure will not be included: + +.. code-block:: python + + from py_pdf_parser import load_file + document = load_file("figure.pdf") + print([element.text() for element in document.elements]) + +which results in: + +:: + + ["Here is some text outside of an image"] + +To include the text inside the figure, we must pass the ``all_texts`` layout parameter. 
+This is documented in the PDFMiner.six documentation, `here +`_. + +The layout parameters can be passed to both :meth:`~py_pdf_parser.loaders.load` and +:meth:`~py_pdf_parser.loaders.load_file` as a dictionary to the ``la_params`` argument. + +In our case: + +.. code-block:: python + + from py_pdf_parser import load_file + document = load_file("figure.pdf", la_params={"all_texts": True}) + print([element.text() for element in document.elements]) + +which results in: + +:: + + ["This is some text in an image", "Here is some text outside of an image"] diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst index 31e0fce3..f69b2e7f 100644 --- a/docs/source/examples/index.rst +++ b/docs/source/examples/index.rst @@ -7,6 +7,7 @@ Below you can find links to the following examples: - The :ref:`order-summary` example explains how to use font mappings, sections, and how to extract simple tables. - The :ref:`more-tables` example explains tables in more detail, showing how to extract more complex tables. - The :ref:`element-ordering` example shows how to specify different orderings for the elements on a page. +- The :ref:`extracting-text-from-figures` example shows how to extract text from figures. .. 
toctree:: @@ -14,4 +15,5 @@ Below you can find links to the following examples: order_summary more_tables element_ordering + extracting_text_from_figures diff --git a/py_pdf_parser/loaders.py b/py_pdf_parser/loaders.py index 3e3e26b5..d0b93f5b 100644 --- a/py_pdf_parser/loaders.py +++ b/py_pdf_parser/loaders.py @@ -3,7 +3,7 @@ import logging from pdfminer.high_level import extract_pages -from pdfminer.layout import LTTextContainer, LAParams +from pdfminer.layout import LTTextContainer, LAParams, LTFigure from .components import PDFDocument @@ -74,6 +74,17 @@ def load( pages: Dict[int, Page] = {} for page in extract_pages(pdf_file, laparams=LAParams(**la_params)): elements = [element for element in page if isinstance(element, LTTextContainer)] + + # If all_texts=True then we may get some text from inside figures + if la_params.get("all_texts"): + figures = (element for element in page if isinstance(element, LTFigure)) + for figure in figures: + elements += [ + element + for element in figure + if isinstance(element, LTTextContainer) + ] + if not elements: logger.warning( f"No elements detected on page {page.pageid}, skipping this page." 
diff --git a/tests/data/image.pdf b/tests/data/image.pdf new file mode 100644 index 0000000000000000000000000000000000000000..bd5a95ec54574bda93ed0128ba367fd327d78d04 GIT binary patch literal 8956 zcma)hWl){XmUTEdL4t;$2X{L-L4rf@g9f*QI|qkg!5xA-1b26L2zqe$1cC*3Chx8J zYTo(oojdcVYj-`}{nTDvd)Hd4X_O_TSU6a@(P(zhj&IR8*n#Xodt)m!K|wYtOB*P} ziA~DJ2nvyanAn>_*yJI0=1>a`AU8KJ&woyUoP2!z!oq0IP$!6yEt-4A93O_(SM}B( z-lzN*<02dAoR26ML+sr-DE`HvgQms|6%ik48|!mqDj_-FAV0+#15#K{3R6TP%1hFR zxk0dZl4E)G+-2!Ab?YuwI@2y4a#9s@`T>1DU7}6dZEjw6lg`->tIqrHH#|0+Cm!6^ zFFJn^i=k4`f~EcNo*i60X;I&Lq4PKsFf)`91ykXRS)2yoW3`Sbk?@B$hgKICx4MnY za4-?zQc}oYZ+N|ohHu@qKK#C0$_s+*(XQB<$F56BMC*M)fho2GztIYD*v_ND5++IM zM(jOCiRtQc5?wm;v?k}7pn+g%))0+@NlrxF_w|HvsN({wOK9o zmg?T&9)YA#3?%AgV>5a@;jU9xwR!QDWXWM^7(6nv3UYXjq9{x^~}si_%y~RYQY)`b%t-V`);-q z;Hy4roC`R{0Xrm0M^u^tpQc6%**RMf%yXl>Lv@uEcc&#>6NtVJNHNJ)s8NlJeP{Tb zWYeb|wqtJjS}0_Ov>+zV?i`Q(z;#BfO{I!+*ZG_fPqx7#pSc9EClXXYv3+m*Ee(5J z;Yvb=3Oi(ms_JfQ9?r7;z3zhyixVql8@Fhb!ZqdFIb*K_rh3f_YD=0Qo6^;IpOqFs zY+k`@ifO?tuL{1QTH32FSTO$lIICB|&FFGKq49BL7+-^NQhK?nl!#(de1>_pH~_kR z5i2E{>2#oNAt5`bg3QmPWPZ?0#xPu_VlBoX=jYx^1=Jb5Olnf_@v?+!;Pz5wv0hQ^r+#~f*bzUoq5VFkgL~O-h zS`$TkFVD2rlbhSy!W{k9?d%rw%<_H8#&bpBD#5%5_auWP)n3uc?VRWrtuUV&*aB`BVm zQi>8g$c9E+*E~7zY&-z0XZTYN?uq9Xi~Nxrx6>kca{MgS&l>Q&9jG*ZCd+wwx4CoG zbQXiCYa1MsmfJ-Byo&Y0x;ct)3o~tI-rN9f^<^zR_L=?c2@|T@3cP{cAwQjc7HFs2 zi~+Dhdb^f4vBn)#x=$<4*vhtgIc_uk+#K)Q%5onbZQH%LGrBM)I&g<3QQheEN3jONF&j#Po`HFYHqkH7R)qi`oto_=Tg-kyxfEBI^#r;#aEF2rEyGl1I+z=uZ zjgNM-7*Xs>m<;EQpi?9=A_>24ZwyHwmCMoDYvIq$)a(c$@JEQ`Bsgc@oFsT>8Ge{> zlW2U(m@lTm*H6Oh`j`(Z<=$Ob%jMNrLF)@xlPI@QejR{oEu$vCBDd5#SZehcdKYVm zM#juC`g_Fs$I)ROgvU{o_l-m6qAg)uy*!AGhg%GAI%x{t5q@U4HiMGs=|->w zvrY!YEUrGbhpo57F~1E^78liedC3B!T*?oA2(POF6yUC%$ptA4`*!m0klh>?q{$v(`M8+(Yii4+Vb62&Fcn61Cl42dberq{>AIt!-OPsl_cw)aJTm`>^gYy8`Wtr~rt{BerqX>_n;W{gL8ynIC zoD|qaKScZe-m&ra7KtP-J9FeAOt;-M!5`-7CZsb>1zym?PvSWwz=38k1p5#UkO(si zck=ONU_Hh1gsF!+RAUr#_IoA5XF3!#OGB!5{-)&Acj$r)0BgBSKK65sP&17lw;fwKS}K6lHO;iQY(u+-F?Gi;?^6W{{td5A 
zUaFbxnp;5vB}6Ltxi`FLxfV&|zE9B9L8YI{B2p(z?Zgc7SSj2yX6N+zaG`W?mFg2W z=$ES!GgVh+W>W!}KGUl^gq!dwuj(bIPF`8n$vsUM~!%nhpAVsS?))b;^`(V5I+<6KOf;fcta70at+XbPV{ z-;FGMTMHD4*LH2pSQJ{!sljMuW_}^8!ggU*(vR#q8Rm`9r()FU zDOM=PU)AZc{xI$6IWTqwD-owwtgjQ|)>g-|SuDkF&l2l8f>QB|xW}}0#6E|Erh?LE zO=|WHdsu$8%$J2uRx)PmsW%MnvN7$)Fi-P4HM1!4$;3}%`qTWrcNcGWthf96`K!@a z4$b7UN$y=D47=^ZFmSl=KrB(Jom^9qQ~{r=?&qeBgcRMXD&AS*UkV!EH`U^~Jfq+u z4FQ}Iet;Twz&0?8A3AdIEcg=h)bK3uBIR4wHA${2_Av?kU{Ol6POz7!Xw_udjw(B{ zq`2x|vt$xKb6}AsWX5cVEGWE_>Jb%n!Mo>je7Eg(q2JQA-B8O-OmKpAqwq#uZ6xIE3i(VEgw zm`t@&NSQ~aplW!1M!yd|q%;cD8%J@!Ft(P7D%7Z)n?JO=Y6vDWsVJG(B~~e$)wU|H zsPU;GxXi8w^ik58<3xzHqgpYk+gojP-Hlvd(>)Zc>mENyc+0am9Gw5EGI)j^sA{@~ z!tyk|sTH@PxN%O)cSc+cl1S-I0I6?fX0%*5uqAe4j|Y)LhyxrmYr^H3v9&7-r}Coc zBV(qcQx4Thy$lUs=?O?*orDQ3(7Ilk1(OQCNsZG=(MoLQsKNgg5tfV-g5yH(xHxod zaPv@w>F0C%>h{7w`{F2Us&-GoOMHyX)7YN=>q56J;Wx;?pCLdl)G-`{62~sDB5Rpe-L)WP&g%b&mI-EoNTqF zsDn1w>FXpOy!-7@u?lDt_N_Kc?0LaH(*!CXjv3BirRPM zj)rQ~q|Fg>*1YyjJuHzsC0!38iR7Hb9D2w7Whu|8gmg`CW&01`n+mar($Z1(d33+2 z3`T1A?^xwpT;?-bnQ;3yL+i;bx#p)n|gKyJX{ZFW9G(hrLtKf^JC3O~a7 z>gVe-?c#4>Yfd2tEHD~aECXp^wBq?8{c|n!YhuKM#R{UM8cD~8?6+gA$|6^FGd*&F zbe77itP06_{ov)xCXzs5QrW@JoI}v(ClZ)5)es3|Ll-2uGZS*O5eGTrE0L9=26ozda7uuo+L)udSc~R6v#%k|$a~k&Oa!KqnPRG?Uv4Rc` zTX_87j=2R@lSt&EJqyW(9ZdN+b>E$!D@lpzJ9T3_AnC%nb>qs=L-7P&>jj;NDH(|w zIjNs(O9$5V%x&78l`AZ!tfs9dAGOz94(1WR{+L-%33e+f8vV3a>ZIv$nBh_>;Fd{O zxNEKAghLuq854p#`3lKR$#s#@$#s!(CQR-)M#ByRG0-FP*YYI#XW4=c8wMfW`m_G_ zu1ncZ_}mv|Y$WF!9Jdxal1n3cLk6|;!or?V9jEgTd?%s&Hu`*>MTXutgG8hP);^|c z^Z+s0bwj5jRy;@VFH{Uk7Ew)>#!|ef)h?@cYkc&H&k|Be3uO4p!e*gs9`-o_J7&f%1ls1BT|7t$@ z8-iuvrOx83q9vgt2#CZeq^o@{J%k7-wV6+`@#)r0>5grWIQv%0yS|^gL>Ce`8h21V z?Tgj=!W0ze*2|(gaN^}|fuhs2Oj-UMbLE46<}=##^7gKju&S#;;z>!BXkCOEbbk~@ zYAWH@QG^An-4;akBErwAb#PqD*sB{OE7oY4ih6<1tlvO*klC2pNF*8-VLcQ)$l7!! 
zog{m5?C+lpwN5!}dCl3`{QG%(&Hw%Pfz}9vANSeZ^?EaC5IfWV)XsmRzoa(@$Nxct z13{esO?B(ViQ4tDzmdEV4zhoPoJRD`f~VR@MXjOZL-o+`Ym%`qh)t*2aIHVLeGd4^ zJj`Z}F(1T7B94VuNIE@g7?Kxn1vaz|y|$fM3k{Z1`p&t=9a2jVT*+0RS~MG*yAQU8 z4tm!#yXF&~RdDx*7X%RPRQc=PW#)6xfONOJl!{^Ds-F6KTcw*MGm(A73=l^B+4z*~#8S6#~^^`{S6fsX^SKdTf&JP-#`@AA1Ci>(3tWpY!dYK;FMX zVd4MTJ^_Mwc>iXYEFZ@!$n}3f4?exa=Wj&bq2`1sR=%c0A)8(A?*q54^}0fi1D?H$ z4C*4>f3v9BYi^a>$o?= zUW)N08D3IhLChwkl|6ztc`4WTTap+L29HakvScDXTU3PYFK%a(n4J8(D=P>uj_6U>O>l_zKLl(-H-QWj#cX^W{u^X1WHa1&ET7c?tN53N1u$z)! zs9c{KZO@wy;Z7Lc!+&E)PUX7%i)-cn1Lpr`M*%r_xc-i(LR1ZAFFU$~Tj%!YoStah zkdL8oaPT?>HlaZTjcC7LUN?%VBuZQhZ4BJ{iMkTfEYn1CMh1OVuUGio;Y8mmo@|Vn zmO$K;oa*3d(p{5gc$vFUT+D`t5tQHJjLFY&2wlM)f$39&&pm%i^jkQcYg#eZO%E;S z`^Y8jK8dXrsiu2Q_=Z=xOpt;J2J$@$uk<&z)s`E*I<|E=Y&yx4-;@7~MEU+NiE?s) z{+_7bw2E~g`>}HFVw>!O&cfd90q`ES*|ED2UC5JU>_}URQtka@zj7 zB_li0qh$BPqkWkNBE01{90E@vZOmQF3NQQ;Yh!j>URQ{Tzr@;BoMe#e7uN&Tic+jtfyCcB}5AG{Z#$gqoyK@be}pX&~8AM`iV^Z)e=aaoDKs`;;O!7l#qYMFzJ=kIqCrQxN9t4Wj{JCrk69BFACJP}#( zs|@Y-r?$%#iW(XoIf_;V^BY4H88ILh!!Q+4CO%z)ESwqm2Bix1Gz3^JTxx5p=UJ>_ zeFLp%J!xz!auJeSdk=jyp>u68S8|zLPrVq=@bbO+9P{vW;)WP1^92PVK#~L7e&|a` z7eS3lj7I)~ z=iPobs?Z#ZENNF}zq)T#XTEh-1#*I<3Nr>2#cwG;c*nlQ#`zQ=PZfip-!(;$Qm*i= zFa7{sJdMJ z@vPZJhvlr3EmERP5#t=gSDv8o!W$JSp$eEPW9EEHiNTh%WKrQkiA@D-=608WwgS@? 
z6a@J$Q|R)QBC&8iJS@CWto|JmBPC1N#1w4ya1O7a$dYl+aCmk1)SkWcv0fn27cFM` zq0AR>@FaAh5puNru0w#5+5KUn`1o`14Z2#mhBOTf36iFLdNaF|shufRB6YhTaZ6<# zhcRqs%7pW!J?iKKWgG%EwyZc;sOWG=r2R$QUDjq;><&Oznfi9{SF18pJM&D$GcD#@ zEZJ@9j-TF_s*5xx7l)JqgtOK{I zTRy!L<^%FDb13)k4v3F#U>G_p9P4@r-{r|pG-*`tv2;A8y?%Ut+R||_L8LI5OwXX* zr-a{d>&252I)eGU=8-q%S~&*<+KGVqMQdU78bd4`VRJvt)oD)SGl-7IWh|n4T9&HH zH+M;{pPcAhK+DIjaYpow7^9N>-NvId)&B5ld?o<0BCF$tgiqr8a^m5_57MOh2(Z}= z7r4`R6A>oEhq{I%D#MFwv~Op*dVobaPXx?ac(uPaN$$d^uJZ?yW~c~^WeIa8m$1UVHx45rK;N6m=i7s=zgYAcp$qDFgHO zCLudk{C5hjDe#(HFR^m3LtPH<{g2oY0_X;(D$+(zYhA$-$h-i|9Ar z?8~|f=%UU>by&J?@Mn`^xv1g_HkfU?`UXE0C!*OD)NlyR55bJ2Qmbs#rq=BVuB4Q& z&z&#}8E3h<5)_U`6_jMa2pciMAU?1Aqhz5w^P2|TV=t;6Mcw3AHZ-R9HR6#vj_6zq zyg1AL6wdI^$FsBJoqay7X0>fiO^`*GcbWeqLEb3+|6UF(n0O2>BLv+4qZj z)3?A&Zn}VZRM81bR$!?v?po;_o+jVA~z{ zu~cU&-8CV67p>X3dec{otRaUn!jdUhQ;tMvK~!Yj7e2*nm=4X5MQ~_N7?(2e`_7sM zmUxeN_R7mLS6B;hvreGA+}qDIh~70UU+~jY$WDQ`#`|o$)@1Qjm`ROm!lK;t5xhrw zu6n{?wq)w~r_8meV^%R9=cnJjv_)8#X#w>=^nOMteAnH4>nn{cP(d9jcKS+cO>XuF zxlLq0Z~tkY^GMsB|D2<{vP~~0%N=~U&j8DFs;YugLj){e1htlZ_5B@L1waL2k5{T+c5rlMd&WfjH(QioYq@BhTwk681+k8a9 zo2*kEk9#JcsMLjI-qV8B8AAA5Dp42FIP$kd28XbTNY-fu3nqEr56we;NQj21%2MCw8sKZ}xbrZ&oi93X`vq*e~txGRbb-*kr zO9G|SWGHLTFBP{IR=%vqT=Uo8vz~SB)zHQ8%R~Cg8;dpNscb1=1s8<)rKP;M`TNnD zB3*!BM#6NiN{P!d57ohh6#CWx1@Oq#P-Jeig5+HT&(<5Eivw=(AnSI_)cDdF3OrSys>x`aT98rW)Wc0Nf!rO27aRB&yLGXiZ;lZwsHqG>jN50;S%i6*~S z&4=4E4@0E=M4jMat+$&DXd>G|ld|){sY^tkZEf7-qdWROIH3_PE^UZR&$Nafkd~6x zd2klf$eE1ya(f*&0>DwmY6%6Uo7LM6Dm~L+klPza)tu=`2P(z;^XG~`%gR6bYhnI@IIeb@!2D)m$QXsx^okS`OnN?zTTdR#@p*j zXqIP`Q8sSv>)J4(Lx(oh9jc0l?l?y~+ilz-D5LnxDZc@S`Xf_F0KWlfMf>3pfK+JA z0rV(-;@=3_Lrv;UbEuICUgcv8pCMpT8B(l=_l;wG%Ek16zv12Qx_+ z-rlu!+zMUn*Z7c=*J9vPZ}1U1ts>jI)q_F4ai(eg^Ip&nMQtSyBu>2rZcw|Y1;G?N z8$~K~Km<@cak-h`qY=)bMb$#TYQ(S@8!vG()akgZG^c~;W|`e-O9TmBa(l(Ax}~d!eoM5OQ1XfVbU{RdR zHc%%2`os73uy|A+A$1N`qfz>-82rjnVh&o|o*k~IFcf_Uj=|&3Z)JseW5hr)PTb%xTD#`cds=n_V|IPQRzqxonzfC(ErV|uMQql9dMbOVxX*B^wA(NLT(=<3|F3Yh 
zzz{Pwu)RI>AI1ilP1erL9{5+^_!CO#2%2#6m_c|sxs6SExcK>ujZFDXK4Ngae5y21^_e<@l&j)L_u><@{50)=pex?-i~%*{R>FT+BB!lrK?m%#cN+; n?m2pV>qkBek38|uh&V%yoS^Pb5HmDB9v%)f8X8F@DYX9p3#+Wi literal 0 HcmV?d00001 diff --git a/tests/test_doc_examples/test_extracting_text_from_figures.py b/tests/test_doc_examples/test_extracting_text_from_figures.py new file mode 100644 index 00000000..ba5883c0 --- /dev/null +++ b/tests/test_doc_examples/test_extracting_text_from_figures.py @@ -0,0 +1,24 @@ +import os + +from py_pdf_parser.loaders import load_file +from tests.base import BaseTestCase + + +class TestExtractingTextFromFigures(BaseTestCase): + def test_output_is_correct(self): + file_path = os.path.join( + os.path.dirname(__file__), "../../docs/source/example_files/figure.pdf" + ) + + # Without all_texts + document = load_file(file_path) + self.assertListEqual( + [element.text() for element in document.elements], + ["Here is some text outside of an image"] + ) + + document = load_file(file_path, la_params={"all_texts": True}) + self.assertListEqual( + [element.text() for element in document.elements], + ["This is some text in an image", "Here is some text outside of an image"] + ) diff --git a/tests/test_loaders.py b/tests/test_loaders.py index 3ef6e90f..81637183 100644 --- a/tests/test_loaders.py +++ b/tests/test_loaders.py @@ -17,3 +17,21 @@ def test_load(self): with open(file_path, "rb") as in_file: document = load(in_file) self.assertIsInstance(document, PDFDocument) + + def test_load_with_text_in_image(self): + file_path = os.path.join(os.path.dirname(__file__), "data", "image.pdf") + with open(file_path, "rb") as in_file: + document = load(in_file) + self.assertIsInstance(document, PDFDocument) + self.assertEqual(len(document.elements), 1) + + with open(file_path, "rb") as in_file: + document = load(in_file, la_params={"all_texts": True}) + self.assertIsInstance(document, PDFDocument) + self.assertEqual(len(document.elements), 2) + + def test_load_file_with_text_in_image(self): + 
file_path = os.path.join(os.path.dirname(__file__), "data", "image.pdf") + document = load_file(file_path, la_params={"all_texts": True}) + self.assertIsInstance(document, PDFDocument) + self.assertEqual(len(document.elements), 2)