o
    AgQ                     @   s   d dl Z d dlZd dlZd dlmZmZ d dlmZmZm	Z	m
Z
mZ d dlmZ d dlmZ G dd deeZG dd	 d	eZG d
d deZdS )    N)ABCabstractmethod)DictIteratorOptionalTupleUnion)Document)
BaseLoaderc                !   @   s2  e Zd ZdZdddddddd	d	dddd
d
ddedededeeef dededededededeeef deeef deeef dee dee dd
f ddZ	de
e fddZedefd d!Zd"edefd#d$Zd%ed&ede
e fd'd(Z	
d/d%eded)ee de
e fd*d+Zd,edeeef fd-d.Zd
S )0DedocBaseLoadera  
    Base Loader that uses `dedoc` (https://dedoc.readthedocs.io).

    Loader enables extracting text, tables and attached files from the given file:
        * `Text` can be split by pages, `dedoc` tree nodes, textual lines
            (according to the `split` parameter).
        * `Attached files` (when with_attachments=True)
            are split according to the `split` parameter.
            For attachments, langchain Document object has an additional metadata field
            `type`="attachment".
        * `Tables` (when with_tables=True) are not split - each table corresponds to one
            langchain Document object.
            For tables, Document object has additional metadata fields `type`="table"
            and `text_as_html` with table HTML representation.
    documentTF
   
auto_tabbyrus+eng:autoN)splitwith_tableswith_attachmentsrecursion_deep_attachmentspdf_with_text_layerlanguagepagesis_one_column_documentdocument_orientationneed_header_footer_analysisneed_binarizationneed_pdf_table_analysis	delimiterencoding	file_pathr   r   r   r   r   r   r   r   r   r   r   r   r   r   returnc                C   s~   dd t   D | _h d| _|| jvr!td| d| j d|| _|| _|| _| jdkr1dnd	}|| jd
< || jd< dS )a
  
        Initialize with file path and parsing parameters.

        Args:
            file_path: path to the file for processing
            split: type of document splitting into parts (each part is returned
                separately), default value "document"
                "document": document text is returned as a single langchain Document
                    object (don't split)
                "page": split document text into pages (works for PDF, DJVU, PPTX, PPT,
                    ODP)
                "node": split document text into tree nodes (title nodes, list item
                    nodes, raw text nodes)
                "line": split document text into lines
            with_tables: add tables to the result - each table is returned as a single
                langchain Document object

            Parameters used for document parsing via `dedoc`
                (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

                with_attachments: enable attached files extraction
                recursion_deep_attachments: recursion level for attached files
                    extraction, works only when with_attachments==True
                pdf_with_text_layer: type of handler for parsing PDF documents,
                    available options
                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
                language: language of the document for PDF without a textual layer and
                    images, available options ["eng", "rus", "rus+eng" (default)],
                    the list of languages can be extended, please see
                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
                pages: page slice to define the reading range for parsing PDF documents
                is_one_column_document: detect number of columns for PDF without
                    a textual layer and images, available options
                    ["true", "false", "auto" (default)]
                document_orientation: fix document orientation (90, 180, 270 degrees)
                    for PDF without a textual layer and images, available options
                    ["auto" (default), "no_change"]
                need_header_footer_analysis: remove headers and footers from the output
                    result for parsing PDF and images
                need_binarization: clean pages background (binarize) for PDF without a
                    textual layer and images
                need_pdf_table_analysis: parse tables for PDF without a textual layer
                    and images
                delimiter: column separator for CSV, TSV files
                encoding: encoding of TXT, CSV, TSV
        c                 S   s   i | ]\}}|d vr||qS )>   selfr   r    r    ).0keyvaluer#   r#   q/var/www/html/development/chatbot/venv/lib/python3.10/site-packages/langchain_community/document_loaders/dedoc.py
<dictcomp>d   s
    z,DedocBaseLoader.__init__.<locals>.<dictcomp>>   linenodepager   Got $ for `split`, but should be one of ``r*   treelinearstructure_typeneed_content_analysisN)localsitemsparsing_parametersvalid_split_values
ValueErrorr   r   r    )r"   r    r   r   r   r   r   r   r   r   r   r   r   r   r   r   r1   r#   r#   r'   __init__#   s    A


zDedocBaseLoader.__init__c                 c   s    ddl }zddlm} W n ty   tdw ||  d}d|jd _| }|j| j	i | j
d|id	}W d   n1 sDw   Y  | j|  | jd
E dH  dS )Lazily load documents.r   N)DedocManagerzE`dedoc` package not found, please install it with `pip install dedoc`)manager_configTloggerattachments_dir)r    
parametersdocument_treer   )tempfilededocr:   ImportError_make_configconfigdisabledTemporaryDirectoryparser    r5   _split_documentto_api_schemadictr   )r"   rA   r:   dedoc_managertmpdirr@   r#   r#   r'   	lazy_loadw   s(   
zDedocBaseLoader.lazy_loadc                 C   s   dS )zu
        Make configuration for DedocManager according to the file extension and
        parsing parameters.
        Nr#   r"   r#   r#   r'   rD      s   zDedocBaseLoader._make_config	paragraphc                    sB   d  fdd|d D }|r|d  d| }|S |d }|S )z1Get text (recursively) of the document tree node.
c                    s   g | ]}  |qS r#   )	_json2txt)r$   subparagraphrO   r#   r'   
<listcomp>   s    z-DedocBaseLoader._json2txt.<locals>.<listcomp>subparagraphstext)join)r"   rP   subparagraphs_textrV   r#   rO   r'   rR      s   
zDedocBaseLoader._json2txtr@   document_metadatac                 c   sZ    t |d dkr|d D ]}| j||dE dH  qdS t|d i ||d dV  dS )z4Parse recursively document tree obtained by `dedoc`.rU   r   r@   rY   NrV   metadatapage_contentr[   )len_parse_subparagraphsr	   )r"   r@   rY   rS   r#   r#   r'   r_      s   z$DedocBaseLoader._parse_subparagraphsadditional_metadatac                 c   s   |d }|ri ||}|dkr#| j |d d d}t||dV  n|dkrt|d d d }|d	 d d
 }d}|D ]*}	|	d d
 |krM||  |	7 }q;t|i |d
|idV  |	d d
 }|  |	}q;t|i |d
|idV  nA|dkr|d d d D ]}	|	d }
t|  |	i ||
dV  qn|dkr| j|d d |dE dH  ntd| d| j d| jr|d d D ]}| |\}}t|i |d d|ddV  q|d D ]}| j|| jddidE dH  qdS )z=Split document into parts according to the `split` parameter.r[   r   content	structure)rP   r\   r+   rU   r   page_id r)   r*   rZ   Nr,   r-   r.   tablestable)typetext_as_htmlattachmentsrg   
attachment)r@   r   r`   )	rR   r	   r_   r7   r6   r   
_get_tablerI   r   )r"   r@   r   r`   rY   rV   nodesrc   	page_textr*   line_metadatarf   
table_text
table_htmlrj   r#   r#   r'   rI      sz   




	zDedocBaseLoader._split_documentrf   c              
   C   s   d}|d D ]}|D ]}|d dd |d D 7 }|d7 }q
|d7 }qd	}|d D ]>}|d
7 }|D ]1}d dd |d D }t|}|d7 }|d rQ|d7 }|d|d  d|d  d| d7 }q2|d7 }q*|d7 }||fS )z.Get text and HTML representation of the table.rd   cells c                 s       | ]}|d  V  qdS rV   Nr#   r$   r)   r#   r#   r'   	<genexpr>      z-DedocBaseLoader._get_table.<locals>.<genexpr>lines	rQ   zK<table border="1" style="border-collapse: collapse; width: 100%;">
<tbody>
z<tr>
c                 s   rs   rt   r#   ru   r#   r#   r'   rv     rw   z<td	invisiblez style="display: none" z
 colspan="colspanz" rowspan="rowspanz">z</td>
z</tr>
z</tbody>
</table>)rW   htmlescape)r"   rf   ro   rowcellrp   	cell_textr#   r#   r'   rk      s4   



zDedocBaseLoader._get_tableN)__name__
__module____qualname____doc__strboolr   intr   r8   r   r	   rN   r   rK   rD   rR   r_   rI   r   rk   r#   r#   r#   r'   r      s    
	




T

Kr   c                   @   s   e Zd ZdZdefddZdS )DedocFileLoaderaw  
    DedocFileLoader document loader integration to load files using `dedoc`.

    The file loader automatically detects the file type (with the correct extension).
    The list of supported file types is gives at
    https://dedoc.readthedocs.io/en/latest/index.html#id1.
    Please see the documentation of DedocBaseLoader to get more details.

    Setup:
        Install ``dedoc`` package.

        .. code-block:: bash

            pip install -U dedoc

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocFileLoader

            loader = DedocFileLoader(
                file_path="example.pdf",
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }
    r!   c                 C   s    ddl m} || j| j| jdS )Nr   )make_manager_config)r    parsing_paramsr   )dedoc.utils.langchainr   r    r5   r   )r"   r   r#   r#   r'   rD   `  s   zDedocFileLoader._make_configN)r   r   r   r   rK   rD   r#   r#   r#   r'   r     s    Br   c                #       s   e Zd ZdZdddddddd	d
d
dddddddededededeeef dededededededeeef deeef deeef dee dee ddf" fddZ	de
e fd d!Zdefd"d#Zdeded$edeeeeeef f fd%d&Z  ZS )'DedocAPIFileLoaderaU  
    Load files using `dedoc` API.
    The file loader automatically detects the file type (even with the wrong extension).
    By default, the loader makes a call to the locally hosted `dedoc` API.
    More information about `dedoc` API can be found in `dedoc` documentation:
        https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api.html

    Please see the documentation of DedocBaseLoader to get more details.

    Setup:
        You don't need to install `dedoc` library for using this loader.
        Instead, the `dedoc` API needs to be run.
        You may use Docker container for this purpose.
        Please see `dedoc` documentation for more details:
            https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-and-run-dedoc-using-docker

        .. code-block:: bash

            docker pull dedocproject/dedoc
            docker run -p 1231:1231

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocAPIFileLoader

            loader = DedocAPIFileLoader(
                file_path="example.pdf",
                # url=...,
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }
    zhttp://0.0.0.0:1231r   TFr   r   r   r   r   N)urlr   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r!   c                   s>   t  j||||||||	|
||||||d || _d| jd< dS )a
  Initialize with file path, API url and parsing parameters.

        Args:
            file_path: path to the file for processing
            url: URL to call `dedoc` API
            split: type of document splitting into parts (each part is returned
                separately), default value "document"
                "document": document is returned as a single langchain Document object
                    (don't split)
                "page": split document into pages (works for PDF, DJVU, PPTX, PPT, ODP)
                "node": split document into tree nodes (title nodes, list item nodes,
                    raw text nodes)
                "line": split document into lines
            with_tables: add tables to the result - each table is returned as a single
                langchain Document object

            Parameters used for document parsing via `dedoc`
                (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

                with_attachments: enable attached files extraction
                recursion_deep_attachments: recursion level for attached files
                    extraction, works only when with_attachments==True
                pdf_with_text_layer: type of handler for parsing PDF documents,
                    available options
                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
                language: language of the document for PDF without a textual layer and
                    images, available options ["eng", "rus", "rus+eng" (default)],
                    the list of languages can be extended, please see
                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
                pages: page slice to define the reading range for parsing PDF documents
                is_one_column_document: detect number of columns for PDF without
                    a textual layer and images, available options
                    ["true", "false", "auto" (default)]
                document_orientation: fix document orientation (90, 180, 270 degrees)
                    for PDF without a textual layer and images, available options
                    ["auto" (default), "no_change"]
                need_header_footer_analysis: remove headers and footers from the output
                    result for parsing PDF and images
                need_binarization: clean pages background (binarize) for PDF without a
                    textual layer and images
                need_pdf_table_analysis: parse tables for PDF without a textual layer
                    and images
                delimiter: column separator for CSV, TSV files
                encoding: encoding of TXT, CSV, TSV
        )r    r   r   r   r   r   r   r   r   r   r   r   r   r   r   jsonreturn_formatN)superr8   r   r5   )r"   r    r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   	__class__r#   r'   r8     s&   AzDedocAPIFileLoader.__init__c                 c   s2    | j | j| j| jd}| j|| jdE dH  dS )r9   )r   r    r>   r?   N)
_send_filer   r    r5   rI   r   )r"   doc_treer#   r#   r'   rN   	  s
   zDedocAPIFileLoader.lazy_loadc                 C   s   i S r   r#   rO   r#   r#   r'   rD     s   zDedocAPIFileLoader._make_configr>   c           
      C   s   ddl }tj|}t|d}d||fi}|j| d||d}W d   n1 s+w   Y  |jdkr?td|j	  t
|j	 }	|	S )	z7Send POST-request to `dedoc` API and return the resultsr   Nrbfilez/upload)filesdata   zError during file handling: )requestsospathbasenameopenpoststatus_coder7   ra   decoder   loads)
r"   r   r    r>   r   	file_namer   r   rresultr#   r#   r'   r     s   
zDedocAPIFileLoader._send_file)r   r   r   r   r   r   r   r   r   r8   r   r	   rN   rK   rD   r   listr   __classcell__r#   r#   r   r'   r   j  s~    M
	



Ur   )r}   r   r   abcr   r   typingr   r   r   r   r   langchain_core.documentsr	   )langchain_community.document_loaders.baser
   r   r   r   r#   r#   r#   r'   <module>   s      M