o
    AgQ,                     @   s   d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
mZmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZ eeZ G dd	 d	eZ!G d
d deZ"dS )z:Pebblo's safe dataloader is a wrapper for document loaders    N)version)AnyDictIterableIteratorListOptional)Document)
BaseLoader)BATCH_SIZE_BYTESPLUGIN_VERSIONApp	FrameworkIndexedDocumentPebbloLoaderAPIWrappergenerate_size_based_batchesget_full_pathget_loader_full_pathget_loader_typeget_runtimeget_source_sizec                   @   s  e Zd ZU dZdZeed< 					d+dddded	ed
edede	e dede	e dedefddZ
dee fddZd,ddZdee fddZed,ddZdefddZdee fddZd edee fd!d"Zdee fd#d$Zd%ed&edefd'd(Zd eddfd)d*ZdS )-PebbloSafeLoaderzkPebblo Safe Loader class is a wrapper around document loaders enabling the data
    to be scrutinized.
    F_discover_sent Nlocal)classifier_locationanonymize_snippetslangchain_loadernameownerdescriptionapi_keyload_semanticclassifier_urlr   r   c                C   s   |rt |tstd|| _tt | _|| _tj	
dp|| _|| _|| _t| j| _g | _g | _tt| jdd dd }
t|
| _t| j| _t| _|
| j| jd| jdkredt| jini | _|  | _t||||	d	| _| j| j d S )
NzMust specify a valid name.PEBBLO_LOAD_SEMANTIC.'r   )loadersource_pathsource_typesource_path_size)r!   r   r#   r   ) 
isinstancestr	NameErrorapp_nameuuiduuid4load_idr(   osenvirongetr"   r   r    r   r)   docsdocs_with_idtypesplitr   r*   r   r+   r   
batch_sizeloader_details_get_app_detailsappr   	pb_clientsend_loader_discover)selfr   r   r   r    r!   r"   r#   r   r   loader_name rB   r/var/www/html/development/chatbot/venv/lib/python3.10/site-packages/langchain_community/document_loaders/pebblo.py__init__%   s>   "


zPebbloSafeLoader.__init__returnc                 C   s   | j  | _|   | jS )zxLoad Documents.

        Returns:
            list: Documents fetched from load method of the wrapped `loader`.
        )r(   loadr6   classify_in_batches)r@   rB   rB   rC   rF   V   s   zPebbloSafeLoader.loadc           	      C   s   t | j| j}g }t|}t|D ]6\}}||d k}|| _|  | _| jj| j| j	| j
|d}| | | jr>| |}n|  }|| q|| _dS )z
        Classify documents in batches.
        This is to avoid API timeouts when sending large number of documents.
        Batches are generated based on the page_content size.
           )loading_endN)r   r6   r:   len	enumerate_index_docsr7   r>   classify_documentsr=   r;   _add_pebblo_specific_metadatar"   _add_semantic_to_docs_unindex_docsextend)	r@   batchesprocessed_docstotal_batchesibatchis_last_batchclassified_docsbatch_processed_docsrB   rB   rC   rG   a   s*   


z$PebbloSafeLoader.classify_in_batchesc              
   c   s    z| j  }W n ty& } z| j jj d}t| t||d}~ww 	 zt|}W n ty;   g | _	Y dS w t
|f| _	|  | _| j| j| j| j}| | | jra| || _	n|  | _	| j	d V  q()zLoad documents in lazy fashion.

        Raises:
            NotImplementedError: raised when lazy_load id not implemented
            within wrapped loader.

        Yields:
            list: Documents from loader's lazy loading.
        z does not implement lazy_load()NTr   )r(   	lazy_loadNotImplementedError	__class____name__loggererrornextStopIterationr6   listrL   r7   r>   rM   r=   r;   rN   r"   rO   rP   )r@   doc_iteratorexcerr_strdocclassified_docrB   rB   rC   rZ      s6   





zPebbloSafeLoader.lazy_loadc                 C   s
   d| _ d S )NT)r   )clsrB   rB   rC   set_discover_sent   s   
z"PebbloSafeLoader.set_discover_sentc                 C   s:   t  \}}t| j| j| j| j||ttdtddd}|S )z\Fetch app details. Internal method.

        Returns:
            App: App details.
        langchain_community)r   r   )r   r   r    r2   runtime	frameworkplugin_versionclient_version)	r   r   r/   r   r    r2   r   r   r   )r@   rl   rk   r=   rB   rB   rC   r<      s   
z!PebbloSafeLoader._get_app_detailsc                 C      dd t | jD }|S )z
        Indexes the documents and returns a list of IndexedDocument objects.

        Returns:
            List[IndexedDocument]: A list of IndexedDocument objects with unique IDs.
        c                 S   s*   g | ]\}}t dd t|i| qS )pb_idrB   )r   r-   dict.0rU   rf   rB   rB   rC   
<listcomp>   s    z0PebbloSafeLoader._index_docs.<locals>.<listcomp>)rK   r6   )r@   r7   rB   rB   rC   rL         zPebbloSafeLoader._index_docsrX   c                 C   sV   dd | j D }| D ]}|d}||v r| || | qdd | D }|S )aF  
        Adds semantic metadata to the given list of documents.

        Args:
            classified_docs (Dict): A dictionary of dictionaries containing the
                classified documents with pb_id as key.

        Returns:
            List[Document]: A list of Document objects with added semantic metadata.
        c                 S   s    i | ]}|j t|j|jd qS )page_contentmetadata)rp   r	   rw   rx   rs   rf   rB   rB   rC   
<dictcomp>   s    z:PebbloSafeLoader._add_semantic_to_docs.<locals>.<dictcomp>rp   c                 S   s   g | ]}|qS rB   rB   ry   rB   rB   rC   rt      s    z:PebbloSafeLoader._add_semantic_to_docs.<locals>.<listcomp>)r7   valuesr5   _add_semantic_to_doc)r@   rX   indexed_docsrg   doc_idsemantic_metadata_docsrB   rB   rC   rO      s   
z&PebbloSafeLoader._add_semantic_to_docsc                 C   ro   )z
        Converts a list of IndexedDocument objects to a list of Document objects.

        Returns:
            List[Document]: A list of Document objects.
        c                 S   s    g | ]\}}t |j|jd qS rv   )r	   rw   rx   rr   rB   rB   rC   rt      s    z2PebbloSafeLoader._unindex_docs.<locals>.<listcomp>)rK   r7   )r@   r6   rB   rB   rC   rP      ru   zPebbloSafeLoader._unindex_docsrf   rg   c                 C   s8   t |di  |jd< t |di  |jd< |S )a4  
        Adds semantic metadata to the given document in-place.

        Args:
            doc (Document): A Document object.
            classified_doc (dict): A dictionary containing the classified document.

        Returns:
            Document: The Document object with added semantic metadata.
        entitiespebblo_semantic_entitiestopicspebblo_semantic_topics)rb   r5   keysrx   )r@   rf   rg   rB   rB   rC   r|      s   

z%PebbloSafeLoader._add_semantic_to_docc              	   C   st   | j D ]4}|j}| jjjdkrt|d| j|d< nt|d|d| j|d< ||ji dd|d< qdS )z*Add Pebblo specific metadata to documents.SharePointLoadersource	full_pathpb_checksumN)	r7   rx   r(   r\   r]   r   r5   r)   rp   )r@   rX   rf   doc_metadatarB   rB   rC   rN     s   


z.PebbloSafeLoader._add_pebblo_specific_metadata)r   r   NFN)rE   N)r]   
__module____qualname____doc__r   bool__annotations__r
   r-   r   rD   r   r	   rF   rG   r   rZ   classmethodri   r   r<   r   rL   r   rO   rP   rq   r|   rN   rB   rB   rB   rC   r      sR   
 


1
 "r   c                   @   s   e Zd ZdZddddddee dee deee  deeee	f  deeeee	f   d	dfd
dZ
d	ee fddZd	ee fddZdS )PebbloTextLoaderz
    Loader for text data.

    Since PebbloSafeLoader is a wrapper around document loaders, this loader is
    used to load text data directly into Documents.
    N)r   idsrx   	metadatastextsr   r   rx   r   rE   c                C   s"   || _ || _|| _|| _|| _dS )a  
        Args:
            texts: Iterable of text data.
            source: Source of the text data.
                Optional. Defaults to None.
            ids: List of unique identifiers for each text.
                Optional. Defaults to None.
            metadata: Metadata for all texts.
                Optional. Defaults to None.
            metadatas: List of metadata for each text.
                Optional. Defaults to None.
        N)r   r   r   rx   r   )r@   r   r   r   rx   r   rB   rB   rC   rD     s
   
zPebbloTextLoader.__init__c                 c   s    t | jD ]9\}}d}| jpi }| jr(|t| jk r(| j| r(|| j|  | jr7|t| jk r7| j| }t|||dV  qdS )zi
        Lazy load text data into Documents.

        Returns:
            Iterator of Documents
        N)idrw   rx   )rK   r   rx   r   rJ   updater   r	   )r@   rU   text_idrx   rB   rB   rC   rZ   9  s   

zPebbloTextLoader.lazy_loadc                 C   s    g }|   D ]}|| q|S )z`
        Load text data into Documents.

        Returns:
            List of Documents
        )rZ   append)r@   	documentsrf   rB   rB   rC   rF   I  s   zPebbloTextLoader.load)r]   r   r   r   r   r-   r   r   r   r   rD   r   r	   rZ   rF   rB   rB   rB   rC   r     s*    

r   )#r   loggingr3   r0   importlib.metadatar   typingr   r   r   r   r   r   langchain_core.documentsr	   )langchain_community.document_loaders.baser
   $langchain_community.utilities.pebblor   r   r   r   r   r   r   r   r   r   r   r   	getLoggerr]   r^   r   r   rB   rB   rB   rC   <module>   s     8
 y