o
    Age                  	   @   s  d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	m
Z
mZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZ d dlmZm Z  d d	l!m"Z" d d
l#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5 ddl&m6Z6m7Z7m8Z8 ddl9m:Z:m;Z; ddl<m=Z= e>dZ?e@g dZAerddlBmCZC ddlDmEZE ddddddZFdeGdeHfd d!ZId"eed#f deeeeeJeKf d#f  eeH f fd$d%ZLd"edeeeeeJeKf d#f  eeH f fd&d'ZMd(eeHef deeHef fd)d*ZNG d+d, d,eZOd@d-e(d.e)de(fd/d0ZPd-e(d1e)de(fd2d3ZQG d4d5 d5e-ZRG d6d7 d7eRZSd8e(d9e(ddfd:d;ZTG d<d= d=eSZUG d>d? d?eSZVdS )A    N)	lru_cache)
TYPE_CHECKINGAnyCallableDict	GeneratorListOptionalPatternTupleUnion)	normalize)warn)PDFPageAggregator)LTCharLTComponentLTContainerLTCurveLTItemLTPageLTTextContainer)PDFPageInterpreter	PDFStackT)PDFPage)	PSLiteral   )utils)T_bboxT_numT_obj
T_obj_list)	Container)PDFStructTreeStructTreeMissing)T_table_settingsTableTableFinderTableSettings)decode_textresolve_allresolve_and_decode)MalformedPDFExceptionPdfminerException)TextMapz^LT)advheight	linewidthptssizesrcsizewidthx0x1y0y1bitsmatrixuprightfontnametext	imagemask
colorspaceevenoddfillnon_stroking_colorstrokestroking_colorstreamnamemcidtag)	PageImage)PDFzSimSun,RegularzSimHei,RegularzSimKai,RegularzSimFang,RegularzSimLi,Regular)s   s   s   _GB2312s   _GB2312s   r<   returnc                 C   sh   d| v r|  dd }| d | | |d  }}nd| }}t|t|dd }t|dd | S )N   +r          )indexCP936_FONTNAMESgetstr)r<   split_atprefixsuffix
suffix_new rX   V/var/www/html/development/chatbot/venv/lib/python3.10/site-packages/pdfplumber/page.pyfix_fontname_bytes\   s   
rZ   color.c                 C   s4   t | d tr| d d pd t| d jfS | d fS )NrO   )
isinstancer   r(   rF   )r[   rX   rX   rY   separate_patterng   s   r]   c                 C   sJ   | d u rdS t | tr| }t|S t | trt| }t|S | f}t|S )N)NN)r\   tuplelistr]   )r[   	tuplefiedrX   rX   rY   normalize_colorp   s   

ra   kwargsc                 C   s   dd |   D S )Nc                 S   s(   i | ]\}}|t |trt|n|qS rX   )r\   r_   r^   ).0keyvaluerX   rX   rY   
<dictcomp>   s    z'tuplify_list_kwargs.<locals>.<dictcomp>)items)rb   rX   rX   rY   tuplify_list_kwargs~   s   rh   c                       s   e Zd ZU dZdZee ed< dZee	 ed< dde
dee ddfdd	Zdd
dZdddZdef fddZd fddZd fddZ  ZS )"PDFPageAggregatorWithMarkedContentzZExtract layout from a specific page, adding marked-content IDs to
    objects where found.Ncur_mcidcur_tagrH   propsrK   c                 C   s6   t |j| _t|trd|v r|d | _dS d| _dS )z5Handle beginning of tag, setting current MCID if any.MCIDN)r(   rF   rk   r\   dictrj   )selfrH   rl   rX   rX   rY   	begin_tag   s   
z,PDFPageAggregatorWithMarkedContent.begin_tagc                 C   s   d| _ d| _dS )z/Handle beginning of tag, clearing current MCID.N)rk   rj   ro   rX   rX   rY   end_tag   s   
z*PDFPageAggregatorWithMarkedContent.end_tagc                 C   s,   | j jr| j jd }| j|_| j|_dS dS )z^Add current MCID to what we hope to be the most recent object created
        by pdfminer.six.rO   N)cur_item_objsrj   rG   rk   rH   )ro   cur_objrX   rX   rY   tag_cur_item   s
   	z/PDFPageAggregatorWithMarkedContent.tag_cur_itemc                    s   t  j|i |}|   |S )z;Hook for rendering characters, adding the `mcid` attribute.)superrender_charrv   )ro   argsrb   r.   	__class__rX   rY   rx      s   z.PDFPageAggregatorWithMarkedContent.render_charc                       t  j|i | |   dS )z7Hook for rendering images, adding the `mcid` attribute.N)rw   render_imagerv   ro   ry   rb   rz   rX   rY   r}         z/PDFPageAggregatorWithMarkedContent.render_imagec                    r|   )zAHook for rendering lines and curves, adding the `mcid` attribute.N)rw   
paint_pathrv   r~   rz   rX   rY   r      r   z-PDFPageAggregatorWithMarkedContent.paint_pathNrK   N)__name__
__module____qualname____doc__rj   r	   int__annotations__rk   rS   r   r   rp   rr   rv   floatrx   r}   r   __classcell__rX   rX   rz   rY   ri      s   
 

ri   box_rawrotationc                 C   sp   t dd | D std|  t| d | d f\}}t| d | d f\}}|dv r2||||fS ||||fS )	Nc                 s   s    | ]	}t |tjV  qd S r   )r\   numbersNumberrc   xrX   rX   rY   	<genexpr>   s    z!_normalize_box.<locals>.<genexpr>z0Bounding box contains non-number coordinate(s): r   rN   r      )Z   i  )allr+   sorted)r   r   r5   r6   r7   r8   rX   rX   rY   _normalize_box   s   r   	mb_heightc                 C   s    | \}}}}||| ||| fS r   rX   )r   r   r5   r7   r6   r8   rX   rX   rY   _invert_box   s   r   c                   @   s  e Zd ZU ejdg Zee ed< dZe	ed< dZ
	dgddd	ed
edefddZdhddZedefddZedefddZedeeeef  fddZedefddZedefddZedefddZedeeef fddZdeeef deeef fd d!Zd"edefd#d$Z d%ee! de"eddf fd&d'Z#deeef fd(d)Z$	did*e%e& de'fd+d,Z(	did*e%e& dee) fd-d.Z*	did*e%e& de%e) fd/d0Z+	did*e%e& deeee%e    fd1d2Z,	did*e%e& de%eee%e    fd3d4Z-d5ede.fd6d7Z/					djd8e0ee1e f d9e	d:e	d;ed<e	d=e	d5edeeeef  fd>d?Z2d5edefd@dAZ3d5edefdBdCZ4d5edefdDdEZ5	dkdFe	d<e	d5edefdGdHZ6	dldJe7dKe	dLe	ddMfdNdOZ8	dldJe7dKe	dLe	ddMfdPdQZ9	dldJe7dKe	dLe	ddMfdRdSZ:dTe;ege	f ddUfdVdWZ<d5eddUfdXdYZ=				I	IdmdZe%e0ee>f  d[e%e0ee>f  d\e%e0ee>f  d]e	d^e	dd_fd`daZ?didbe%ee  deeef fdcddZ@defdedfZAdS )nPage_layoutcached_propertiesTis_originalNr   pdfrJ   page_objpage_numberinitial_doctopc           	         s   || _ | | _ | _|| _|| _ddtdtdtf fdd}|dd}|d | _t|d	| j}|d
 |d  }t	||| _
d jv rOt	t|d| j|| _n| j
| _| j
| _t | j| _d S )Nrd   defaultrK   c                    s    t  j| }|d u r|S |S r   )r)   attrsrR   )rd   r   re   r   rX   rY   get_attr   s   zPage.__init__.<locals>.get_attrRotater   ih  MediaBoxr   r   CropBoxr   )r   	root_pager   r   r   rS   r   r   r   r   mediaboxr   cropboxbboxr   _get_textmapget_textmap)	ro   r   r   r   r   r   	_rotationmb_rawr   rX   r   rY   __init__   s$   


zPage.__init__rK   c                 C   s   |    | j  d S r   )flush_cacher   cache_clearrq   rX   rX   rY   close   s   z
Page.closec                 C      | j d | j d  S )NrN   r   r   rq   rX   rX   rY   r4        z
Page.widthc                 C   r   )Nr   r   r   rq   rX   rX   rY   r/     r   zPage.heightc                 C   s0   zdd t | j| D W S  ty   g  Y S w )z-Return the structure tree for a page, if any.c                 S   s   g | ]}|  qS rX   )to_dict)rc   elemrX   rX   rY   
<listcomp>  s    z'Page.structure_tree.<locals>.<listcomp>)r"   r   r#   rq   rX   rX   rY   structure_tree  s
   zPage.structure_treec              
   C   sx   t | dr| jS t| jj| j| jjd}t| jj|}z|| j	 W n t
y3 } zt|d }~ww | | _| jS )Nr   )pagenolaparams)hasattrr   ri   r   rsrcmgrr   r   r   process_pager   	Exceptionr,   
get_result)ro   deviceinterpretererX   rX   rY   layout  s    

zPage.layoutc                    sx   dt ttf dtdt ttf ffdd dtdtf fdd}tjjp(g }tt||}t	t
r:|S |S )	NptrrK   c                    sF   |d }t |D ]}| \}}||d kr jn j}||| f} q| S )Nr   rN   )ranger4   r/   )r   r   turnsir   ycomprq   rX   rY   rotate_point&  s   z!Page.annots.<locals>.rotate_pointannotc                    sb  | d \}}}} ||fj } ||fj }jj}ttg ||R |\}}	}
}| di }|d| d| dd}| D ]>\}}|d urz	|d||< W qE ty   z	|d||< W n ty   j	j
rr td	| d
| d Y nw Y qEw qEjd||| |
||	 j|	 |	||
| ||	 d}|| d| v r| d< | |d< |S )NRectAURITContents)urititlecontentszutf-8zutf-16zCould not decode z of annotation. z will be missing.r   )r   object_typer5   r7   r6   r8   doctoptopbottomr4   r/   Pdata)r   r   r/   r   r   rR   rg   decodeUnicodeDecodeErrorr   raise_unicode_errorsr   r   r   update)r   _a_b_c_dpt0pt1rhr5   r   r6   r   aextraskvparsedr   ro   rX   rY   parse.  s\    
zPage.annots.<locals>.parse)r   r   r   r   r)   r   annotsr_   mapr\   CroppedPage_crop_fn)ro   r   rawr   rX   r   rY   r   $  s   *1

zPage.annotsc                 C   s   dd | j D S )Nc                 S   s   g | ]
}|d  dur|qS )r   NrX   )rc   r   rX   rX   rY   r   h  s    z#Page.hyperlinks.<locals>.<listcomp>)r   rq   rX   rX   rY   
hyperlinksf  s   zPage.hyperlinksc                 C   s    t | dr| jS |  | _| jS )N_objects)r   r   parse_objectsrq   rX   rX   rY   objectsj  s   

zPage.objectsr   c                 C   s*   | j d |d  | j d | j |d  fS )Nr   r   )r   r/   )ro   r   rX   rX   rY   point2coordq  s   *zPage.point2coordobjc                    s(  t td|jj }dtttf dt	tttf  fdd}t
td t||j }||d<  j|d< dD ]}t||rGtt||j||< q6d	D ]\}}||v r^t|| \||< ||< qJt|ttfr{| } jjd urwt jj|n||d
< t|tr|j}	t|	j\|d< |d< t|	j\|d< |d< t|d trt |d |d< n#t|t!frt"t j#|d |d<  fdd|j$D |d< |j%|d<  j&d d \}
}d|v r j'|d  | |d<  j'|d  | |d<  j(|d  |d< d|v r|
dkr|d |
 |d< |d |
 |d< |S )N itemrK   c                 S   s$   | \}}|t v rt|}||fS d S r   )	ALL_ATTRSr)   )r   r   r   resrX   rX   rY   process_attrx  s
   z)Page.process_object.<locals>.process_attrr   r   )ncsscs))rD   stroking_pattern)rB   non_stroking_patternr=   rD   r  rB   r  r<   r1   c                    s$   g | ]^}}|gt  j|R qS rX   )r   r   )rc   cmdr1   rq   rX   rY   r     s   $ z'Page.process_object.<locals>.<listcomp>pathdashrN   r7   r8   r   r   r   r5   r   r6   ))resublt_patr{   r   lowerr   rS   r   r	   rn   filterr   __dict__rg   r   r   r*   getattrrF   ra   r\   r   r   get_textr   unicode_normnormalize_unicodegraphicstatescolorncolorbytesrZ   r   r_   r   original_pathdashing_styler   r/   r   )ro   r   kindr   attrcs
color_attrpattern_attrr=   gsmb_x0mb_toprX   rq   rY   process_objectu  sV   &



zPage.process_objectlayout_objectsc                 c   sR    |D ]#}t |tr | jjd ur| |V  | |jE d H  q| |V  qd S r   )r\   r   r   r   r  iter_layout_objectsrt   )ro   r  r   rX   rX   rY   r     s   
zPage.iter_layout_objectsc                 C   sR   i }|  | jjD ]}|d }|dv rq	||d u rg ||< || | q	|S )Nr   )anno)r   r   rt   rR   append)ro   r   r   r  rX   rX   rY   r     s   zPage.parse_objectstable_settingsc                 C   s   t |}t| |S r   )r'   resolver&   ro   r#  tsetrX   rX   rY   debug_tablefinder  s   

zPage.debug_tablefinderc                 C   s   t |}t| |jS r   )r'   r$  r&   tablesr%  rX   rX   rY   find_tables  s   
zPage.find_tablesc                 C   sX   t |}| |}t|dkrd S dtdttttf fdd}tt	||dd }|S )Nr   r   rK   c                 S   s   t | j | jd | jd fS )Nr   r   )lencellsr   r   rX   rX   rY   sorter  s   zPage.find_table.<locals>.sorter)rd   )
r'   r$  r)  r*  r%   r   r   r   r_   r   )ro   r#  r&  r(  r-  largestrX   rX   rY   
find_table  s   

zPage.find_tablec                    s&   t | |  } fdd|D S )Nc                    s"   g | ]}|j d i  jpi qS )rX   )extracttext_settings)rc   tabler&  rX   rY   r     s   " z'Page.extract_tables.<locals>.<listcomp>)r'   r$  r)  )ro   r#  r(  rX   r3  rY   extract_tables  s   

zPage.extract_tablesc                 C   s6   t |}| |}|d u rd S |jdi |jpi S NrX   )r'   r$  r/  r0  r1  )ro   r#  r&  r2  rX   rX   rY   extract_table  s
   

zPage.extract_tablerb   c                 K   s\   t | jd}d|vr|d| ji d|vr|d| ji i ||}tj| jfi |S )N)layout_bboxlayout_width_charslayout_widthlayout_height_charslayout_height)rn   r   r   r4   r/   r   chars_to_textmapchars)ro   rb   defaultsfull_kwargsrX   rX   rY   r     s   zPage._get_textmappatternregexcase
main_groupreturn_charsreturn_groupsc           	      K   s*   | j di t|}|j||||||dS )N)rA  rB  rC  rD  rE  rX   )r   rh   search)	ro   r@  rA  rB  rC  rD  rE  rb   textmaprX   rX   rY   rF    s   
zPage.searchc                 K   s   | j di t|jS r5  )r   rh   	as_stringro   rb   rX   rX   rY   extract_text,  s   zPage.extract_textc                 K      t j| jfi |S r   )r   extract_text_simpler=  rI  rX   rX   rY   rL  /     zPage.extract_text_simplec                 K   rK  r   )r   extract_wordsr=  rI  rX   rX   rY   rN  2  rM  zPage.extract_wordsstripc                 K   s   | j di t|j||dS )N)rO  rD  rX   )r   rh   extract_text_lines)ro   rO  rD  rb   rX   rX   rY   rP  5  s   zPage.extract_text_linesFr   relativestrictr   c                 C   s   t | |||dS )N)rQ  rR  )r   ro   r   rQ  rR  rX   rX   rY   crop<  s   z	Page.cropc                 C      t | |||tjdS zS
        Same as .crop, except only includes objects fully within the bbox
        )rQ  rR  crop_fn)r   r   within_bboxrS  rX   rX   rY   rX  A     zPage.within_bboxc                 C   rU  rV  )r   r   outside_bboxrS  rX   rX   rY   rZ  K  rY  zPage.outside_bboxtest_functionFilteredPagec                 C   s
   t | |S r   )r\  )ro   r[  rX   rX   rY   r
  U     
zPage.filterc                 K   sB   t | dd }dd | j D |_tj| jfi ||jd< |S )u   
        Removes duplicate chars — those sharing the same text and positioning
        (within `tolerance`) as other characters in the set. Adjust extra_args
        to be more/less restrictive with the properties checked.
        c                 S   s   dS )NTrX   r,  rX   rX   rY   <lambda>^  s    z#Page.dedupe_chars.<locals>.<lambda>c                 S   s   i | ]\}}||qS rX   rX   )rc   r  objsrX   rX   rY   rf   _  s    z%Page.dedupe_chars.<locals>.<dictcomp>char)r\  r   rg   r   r   dedupe_charsr=  )ro   rb   prX   rX   rY   ra  X  s   zPage.dedupe_chars
resolutionr4   r/   	antialiasforce_mediaboxrI   c           	      C   s   ddl m}m} tdd |||fD }|dkrtd| |dur+d| | j }n|dur6d| | j }|| |p;|||dS )	z
        You can pass a maximum of 1 of the following:
        - resolution: The desired number pixels per inch. Defaults to 72.
        - width: The desired image width in pixels.
        - height: The desired image width in pixels.
        r   )DEFAULT_RESOLUTIONrI   c                 s   s    | ]}|d uV  qd S r   rX   r   rX   rX   rY   r   s  s    z Page.to_image.<locals>.<genexpr>zUOnly one of these arguments can be provided: resolution, width, height. You provided NH   )rc  rd  re  )displayrf  rI   sum
ValueErrorr4   r/   )	ro   rc  r4   r/   rd  re  rf  rI   	num_specsrX   rX   rY   to_imagec  s    zPage.to_imageobject_typesc              	   C   sl   |d u rt | j dg }n|}| j| j| j| j| j| j| j	| j
d}|D ]}t| |d ||d < q&|S )Nr   )r   r   r   r   r   r   r4   r/   s)r_   r   keysr   r   r   r   r   r   r4   r/   r  )ro   rm  _object_typesdtrX   rX   rY   r     s   
zPage.to_dictc                 C   s   d| j  dS )Nz<Page:>)r   rq   rX   rX   rY   __repr__  s   zPage.__repr__r   r   r   )TTr   TT)TT)FT)NNNFF)Br   r   r   r!   r   r   rS   r   r   boolpagesr   r   r   r   r   propertyr4   r/   r   r   r   r   r   r    r   r   r   r   r   r   r   r  r   r   r   r   r	   r$   r&   r'  r%   r)  r/  r4  r6  r-   r   r   r
   rF  rJ  rL  rN  rP  r   rT  rX  rZ  r   r
  ra  r   rl  r   rt  rX   rX   rX   rY   r      s6  
 

)A"Q






	






$!r   c                   @   s(   e Zd ZU dZeed< defddZdS )DerivedPageFr   parent_pagec                 C   sd   || _ |j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _| 	t
j t | j| _d S r   )rz  r   r   r   r   r   r   r   r   r   r!   r   r   r   r   )ro   rz  rX   rX   rY   r     s   zDerivedPage.__init__N)r   r   r   r   rv  r   r   r   rX   rX   rX   rY   ry    s   
 ry  r   parent_bboxc                 C   st   t | }|dkrtd|  dt | |}|d u r%td|  d| t |}||k r8td|  d| d S )Nr   zBounding box z has an area of zero.z. is entirely outside parent page bounding box z. is not fully within parent page bounding box )r   calculate_arearj  get_bbox_overlap)r   r{  	bbox_areaoverlapoverlap_arearX   rX   rY   test_proposed_bbox  s$   

r  c                       sb   e Zd Zejddfdededeeegef de	de	f
 fdd	Z
ed
eeef fddZ  ZS )r   FTrz  	crop_bboxrW  rQ  rR  c                    s   |r|j \}}}} \}	}
}}|	| |
| || || f |r%t |j  dtdtf fdd}t | || _tju rE|j | _ d S  | _ d S )Nr_  rK   c                    s
   |  S r   rX   )r_  r  rW  rX   rY   r     r]  z&CroppedPage.__init__.<locals>._crop_fn)r   r  r    rw   r   r   r   rZ  )ro   rz  r  rW  rQ  rR  o_x0o_top_r5   r   r6   r   r   rz   r  rY   r     s   

zCroppedPage.__init__rK   c                    2   t  dr jS  fdd jj D  _ jS )Nr   c                    s   i | ]
\}}|  |qS rX   )r   rc   r   r   rq   rX   rY   rf     s    z'CroppedPage.objects.<locals>.<dictcomp>r   r   rz  r   rg   rq   rX   rq   rY   r     s   


zCroppedPage.objects)r   r   r   r   crop_to_bboxr   r   r   r    rv  r   rx  r   rS   r   r   rX   rX   rz   rY   r     s"     r   c                       sJ   e Zd Zdedeegef f fddZede	e
ef fddZ  ZS )r\  rz  	filter_fnc                    s   |j | _ || _t | d S r   )r   r  rw   r   )ro   rz  r  rz   rX   rY   r     s   zFilteredPage.__init__rK   c                    r  )Nr   c                    s"   i | ]\}}|t t j|qS rX   )r_   r
  r  r  rq   rX   rY   rf     s    z(FilteredPage.objects.<locals>.<dictcomp>r  rq   rX   rq   rY   r     s   


zFilteredPage.objects)r   r   r   r   r   r   rv  r   rx  r   rS   r    r   r   rX   rX   rz   rY   r\    s      r\  ru  )Wr   r  	functoolsr   typingr   r   r   r   r   r   r	   r
   r   r   unicodedatar   r  warningsr   pdfminer.converterr   pdfminer.layoutr   r   r   r   r   r   r   pdfminer.pdfinterpr   r   pdfminer.pdfpager   pdfminer.psparserr   r   r   _typingr   r   r   r    	containerr!   	structurer"   r#   r2  r$   r%   r&   r'   r(   r)   r*   utils.exceptionsr+   r,   
utils.textr-   compiler  setr   rh  rI   r   rJ   rQ   r  rS   rZ   r   r   r]   ra   rh   ri   r   r   r   ry  r  r   r\  rX   rX   rX   rY   <module>   sn    0$	
!	
"
	"
"3   M(