"""Utilities for running language models or Chains over datasets."""

from __future__ import annotations

import concurrent.futures
import dataclasses
import functools
import inspect
import logging
import uuid
from datetime import datetime, timezone
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    List,
    Optional,
    Tuple,
    Union,
    cast,
)

from langchain_core._api import warn_deprecated
from langchain_core.callbacks.manager import Callbacks
from langchain_core.language_models import BaseLanguageModel
from langchain_core.messages import BaseMessage, messages_from_dict
from langchain_core.outputs import ChatResult, LLMResult
from langchain_core.runnables import Runnable, RunnableConfig, RunnableLambda
from langchain_core.runnables import config as runnable_config
from langchain_core.runnables import utils as runnable_utils
from langchain_core.tracers.evaluation import (
    EvaluatorCallbackHandler,
    wait_for_all_evaluators,
)
from langchain_core.tracers.langchain import LangChainTracer
from langsmith.client import Client
from langsmith.env import get_git_info, get_langchain_env_var_metadata
from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.evaluation import run_evaluator as run_evaluator_dec
from langsmith.run_helpers import as_runnable, is_traceable_function
from langsmith.schemas import Dataset, DataType, Example, Run, TracerSession
from langsmith.utils import LangSmithError
from requests import HTTPError
from typing_extensions import TypedDict

from langchain.chains.base import Chain
from langchain.evaluation.loading import load_evaluator
from langchain.evaluation.schema import (
    EvaluatorType,
    PairwiseStringEvaluator,
    StringEvaluator,
)
from langchain.smith import evaluation as smith_eval
from langchain.smith.evaluation import config as smith_eval_config
from langchain.smith.evaluation import name_generation, progress

if TYPE_CHECKING:
    import pandas as pd

logger = logging.getLogger(__name__)

MODEL_OR_CHAIN_FACTORY = Union[
    Callable[[], Union[Chain, Runnable]],
    BaseLanguageModel,
    Callable[[dict], Any],
    Runnable,
    Chain,
]
MCF = Union[Callable[[], Union[Chain, Runnable]], BaseLanguageModel]


class InputFormatError(Exception):
    """Raised when the input format is invalid."""


class TestResult(dict):
    """A dictionary of the results of a single test run."""

    def get_aggregate_feedback(
        self,
    ) -> pd.DataFrame:
        """Return quantiles for the feedback scores.

        This method calculates and prints the quantiles for the feedback scores
        across all feedback keys.

        Returns:
            A DataFrame containing the quantiles for each feedback key.
        """
        df = self.to_dataframe()
        # Drop the raw input/output/reference columns so only the
        # feedback scores are aggregated.
        to_drop = [
            col
            for col in df.columns
            if col.startswith("inputs.")
            or col.startswith("outputs.")
            or col in {"input", "output", "reference"}
            or col.startswith("reference")
        ]
        return df.describe(include="all").drop(to_drop, axis=1)

    def to_dataframe(self) -> pd.DataFrame:
        """Convert the results to a dataframe."""
        try:
            import pandas as pd
        except ImportError as e:
            raise ImportError(
                "Pandas is required to convert the results to a dataframe."
                " to install pandas, run `pip install pandas`."
            ) from e

        indices = []
        records = []
        for example_id, result in self["results"].items():
            feedback = result["feedback"]
            output_ = result.get("output")
            if isinstance(output_, dict):
                output = {f"outputs.{k}": v for k, v in output_.items()}
            elif output_ is None:
                output = {}
            else:
                output = {"output": output_}
            r = {
                **{f"inputs.{k}": v for k, v in result["input"].items()},
                **output,
            }
            if "reference" in result:
                if isinstance(result["reference"], dict):
                    r.update(
                        {f"reference.{k}": v for k, v in result["reference"].items()}
                    )
                else:
                    r["reference"] = result["reference"]
            r.update(
                {
                    **{f"feedback.{f.key}": f.score for f in feedback},
                    "error": result.get("Error"),
                    "execution_time": result["execution_time"],
                    "run_id": result.get("run_id"),
                }
            )
            records.append(r)
            indices.append(example_id)

        return pd.DataFrame(records, index=indices)


class EvalError(dict):
    """Your architecture raised an error."""

    def __init__(self, Error: BaseException, **kwargs: Any) -> None:
        super().__init__(Error=Error, **kwargs)

    def __getattr__(self, name: str) -> Any:
        try:
            return self[name]
        except KeyError:
            raise AttributeError(f"'EvalError' object has no attribute '{name}'")
tt| fd	dS z|  }W n& ty}   tt| }t|}td
| d t|fdd Y S w tt| t |tr|S t	tt|rt
tt|fddS t |tsfddS S | S )zForgive the user if they pass in a chain without memory instead of a chain
    factory. It's a common mistake. Raise a more helpful error message as well.Na$  Cannot directly evaluate a chain with stateful memory. To evaluate this chain, pass in a chain constructor that initializes fresh memory each time it is called.  This will safegaurd against information leakage between dataset examples.
For example:

def chain_constructor():
    new_memory = z(...)
    return z*(memory=new_memory, ...)

run_on_dataset("z", chain_constructor, ...)c                         S Nr;   r;   )chainr;   r<   <lambda>       z(_wrap_in_chain_factory.<locals>.<lambda>c                     r   r   r;   r;   )lcfr;   r<   r      r   c                     r   r   r;   r;   	runnable_r;   r<   r      r   zWrapping function z as RunnableLambda.c                     r   r   r;   r;   )wrappedr;   r<   r      r   c                     r   r   r;   r;   r   r;   r<   r      r   c                     s   t  S r   )r   r;   )constructorr;   r<   r          )ri   r.   r}   r7   memory
ValueErrorr   r   callabler%   r$   r   r   	TypeErrorinspect	signatureloggerinfor   )r   r   chain_classmemory_class_model	user_funcsigr;   )r   r   r   r   r   r<   _wrap_in_chain_factory   sV   











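

# Illustrative sketch (not part of the original module): the factory pattern
# that `_wrap_in_chain_factory` nudges users toward. Constructing the chain
# inside the function gives every dataset example fresh memory. `MyChain` and
# `MyMemory` are hypothetical stand-ins, so the example is left as comments:
#
# def chain_constructor() -> Chain:
#     new_memory = MyMemory()
#     return MyChain(memory=new_memory)
#
# run_on_dataset(client, "<my_dataset>", chain_constructor, ...)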


def _get_prompt(inputs: Dict[str, Any]) -> str:
    """Get prompt from inputs.

    Args:
        inputs: The input dictionary.

    Returns:
        A string prompt.
    Raises:
        InputFormatError: If the input format is invalid.
    """
    if not inputs:
        raise InputFormatError("Inputs should not be empty.")
    prompts = []
    if "prompt" in inputs:
        if not isinstance(inputs["prompt"], str):
            raise InputFormatError(
                f"Expected string for 'prompt', got {type(inputs['prompt']).__name__}"
            )
        prompts = [inputs["prompt"]]
    elif "prompts" in inputs:
        if not isinstance(inputs["prompts"], list) or not all(
            isinstance(i, str) for i in inputs["prompts"]
        ):
            raise InputFormatError(
                "Expected list of strings for 'prompts',"
                f" got {type(inputs['prompts']).__name__}"
            )
        prompts = inputs["prompts"]
    elif len(inputs) == 1:
        prompt_ = next(iter(inputs.values()))
        if isinstance(prompt_, str):
            prompts = [prompt_]
        elif isinstance(prompt_, list) and all(isinstance(i, str) for i in prompt_):
            prompts = prompt_
        else:
            raise InputFormatError(f"LLM Run expects string prompt input. Got {inputs}")
    else:
        raise InputFormatError(
            f"LLM Run expects 'prompt' or 'prompts' in inputs. Got {inputs}"
        )
    if len(prompts) == 1:
        return prompts[0]
    else:
        raise InputFormatError(
            f"LLM Run expects single prompt input. Got {len(prompts)} prompts."
        )


class ChatModelInput(TypedDict):
    """Input for a chat model.

    Parameters:
        messages: List of chat messages.
    zList[BaseMessage]messagesNr7   r8   r9   r:   __annotations__r;   r;   r;   r<   r     s   
 r   rj   c                 C  s   | st d|  }d| v r|d|d< nt| dkr&tt|  |d< d|v rS|d }t|tr?t	dd |D r?|g}t|dkrOt
|d |d< |S t dt d	|  )
zGet Chat Messages from inputs.

    Args:
        inputs: The input dictionary.

    Returns:
        A list of chat messages.
    Raises:
        InputFormatError: If the input format is invalid.
    """
    if not inputs:
        raise InputFormatError("Inputs should not be empty.")
    input_copy = inputs.copy()
    if "messages" in inputs:
        input_copy["input"] = input_copy.pop("messages")
    elif len(inputs) == 1:
        input_copy["input"] = next(iter(inputs.values()))
    if "input" in input_copy:
        raw_messages = input_copy["input"]
        if isinstance(raw_messages, list) and all(
            isinstance(i, dict) for i in raw_messages
        ):
            raw_messages = [raw_messages]
        if len(raw_messages) == 1:
            input_copy["input"] = messages_from_dict(raw_messages[0])
        else:
            raise InputFormatError(
                "Batch messages not supported. Please provide a"
                " single list of messages."
            )
        return input_copy
    else:
        raise InputFormatError(
            "Chat Run expects single List[dict] or List[List[dict]]"
            f" 'messages' input. Got {inputs}"
        )
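

# Illustrative sketch (not part of the original module): the example-input
# shapes the two loaders above accept. `_get_prompt` takes a string under
# "prompt"/"prompts" (or a lone value), while `_get_messages` takes a list of
# message dicts under "messages". The constant names are hypothetical.
_EXAMPLE_PROMPT_INPUTS = {"prompt": "What is 2 + 2?"}
_EXAMPLE_MESSAGE_INPUTS = {
    "messages": [
        {"type": "human", "data": {"content": "What is 2 + 2?"}},
    ]
}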


def _validate_example_inputs_for_language_model(
    first_example: Example,
    input_mapper: Optional[Callable[[Dict], Any]],
) -> None:
    if input_mapper:
        prompt_input = input_mapper(first_example.inputs)
        if not isinstance(prompt_input, str) and not (
            isinstance(prompt_input, list)
            and all(isinstance(msg, BaseMessage) for msg in prompt_input)
        ):
            raise InputFormatError(
                "When using an input_mapper to prepare dataset example inputs"
                " for an LLM or chat model, the output must a single string or"
                " a list of chat messages."
                f"\nGot: {prompt_input} of type {type(prompt_input)}."
            )
    else:
        try:
            _get_prompt(first_example.inputs)
        except InputFormatError:
            try:
                _get_messages(first_example.inputs)
            except InputFormatError:
                raise InputFormatError(
                    "Example inputs do not match language model input format. "
                    "Expected a dictionary with messages or a single prompt."
                    f" Got: {first_example.inputs}"
                    " Please update your dataset OR provide an input_mapper"
                    " to convert the example.inputs to a compatible format"
                    " for the llm or chat model you wish to evaluate."
                )


def _validate_example_inputs_for_chain(
    first_example: Example,
    chain: Chain,
    input_mapper: Optional[Callable[[Dict], Any]],
) -> None:
    """Validate that the example inputs match the chain input keys."""
    if input_mapper:
        first_inputs = input_mapper(first_example.inputs)
        missing_keys = set(chain.input_keys).difference(first_inputs)
        if not isinstance(first_inputs, dict):
            raise InputFormatError(
                "When using an input_mapper to prepare dataset example"
                " inputs for a chain, the mapped value must be a dictionary."
                f"\nGot: {first_inputs} of type {type(first_inputs)}."
            )
        if missing_keys:
            raise InputFormatError(
                "Missing keys after loading example using input_mapper."
                f"\nExpected: {chain.input_keys}. Got: {first_inputs.keys()}"
            )
    else:
        first_inputs = first_example.inputs
        missing_keys = set(chain.input_keys).difference(first_inputs)
        if len(first_inputs) == 1 and len(chain.input_keys) == 1:
            # We can pass this through the run method.
            # Refrain from calling to validate.
            pass
        elif missing_keys:
            raise InputFormatError(
                "Example inputs missing expected chain input keys."
                " Please provide an input_mapper to convert the example.inputs"
                " to a compatible format for the chain you wish to evaluate."
                f"Expected: {chain.input_keys}. "
                f"Got: {first_inputs.keys()}"
            )


def _validate_example_inputs(
    example: Example,
    llm_or_chain_factory: MCF,
    input_mapper: Optional[Callable[[Dict], Any]],
) -> None:
    """Validate that the example inputs are valid for the model."""
    if isinstance(llm_or_chain_factory, BaseLanguageModel):
        _validate_example_inputs_for_language_model(example, input_mapper)
    else:
        chain = llm_or_chain_factory()
        if isinstance(chain, Chain):
            # Otherwise it's a runnable
            _validate_example_inputs_for_chain(example, chain, input_mapper)
        elif isinstance(chain, Runnable):
            logger.debug(f"Skipping input validation for {chain}")
r   examplesList[Example]r3   "Optional[smith_eval.RunEvalConfig]	data_typer'   Optional[List[RunEvaluator]]c           	      C  s   |r>t | trd\}}d}nd}|  }t |tr|jnd}t |tr%|jnd}t||||d jr7t|d jnd||}|S d}|S )z<Configure the evaluators to run on the results of the chain.)NNllmr   Nr   )ri   r   r.   r   output_keys_load_run_evaluatorsoutputsr   )	r   r   r3   r   
run_inputsrun_outputsrun_typer   run_evaluatorsr;   r;   r<   _setup_evaluation  s&   
r   r   smith_eval.RunEvalConfigr   Optional[List[str]]Optional[str]c                 C  ~   d }| j r| j }|r||vrtd| d| d |S |r*t|dkr*|d }|S |d ur=t|dkr=td| d |S )Nz


def _determine_input_key(
    config: smith_eval.RunEvalConfig,
    run_inputs: Optional[List[str]],
) -> Optional[str]:
    input_key = None
    if config.input_key:
        input_key = config.input_key
        if run_inputs and input_key not in run_inputs:
            logger.warning(
                f"Input key {input_key} not in chain's specified"
                f" input keys {run_inputs}. Evaluation behavior may be undefined."
            )
    elif run_inputs and len(run_inputs) == 1:
        input_key = run_inputs[0]
    elif run_inputs is not None and len(run_inputs) > 1:
        logger.warning(
            f"Chain expects multiple input keys: {run_inputs},"
            " Evaluator is likely to fail. Evaluation behavior may be undefined."
            " Specify an input_key in the RunEvalConfig to avoid this warning."
        )
    return input_key


def _determine_prediction_key(
    config: smith_eval.RunEvalConfig,
    run_outputs: Optional[List[str]],
) -> Optional[str]:
    prediction_key = None
    if config.prediction_key:
        prediction_key = config.prediction_key
        if run_outputs and prediction_key not in run_outputs:
            logger.warning(
                f"Prediction key {prediction_key} not in chain's specified"
                f" output keys {run_outputs}. Evaluation behavior may be undefined."
            )
    elif run_outputs and len(run_outputs) == 1:
        prediction_key = run_outputs[0]
    elif run_outputs is not None and len(run_outputs) > 1:
        logger.warning(
            f"Chain expects multiple output keys: {run_outputs},"
            " Evaluation behavior may be undefined."
            " Specify a prediction_key in the RunEvalConfig to avoid this warning."
        )
    return prediction_key


def _determine_reference_key(
    config: smith_eval.RunEvalConfig,
    example_outputs: Optional[List[str]],
) -> Optional[str]:
    if config.reference_key:
        reference_key = config.reference_key
        if example_outputs and reference_key not in example_outputs:
            raise ValueError(
                f"Reference key {reference_key} not in Dataset"
                f" example outputs: {example_outputs}"
            )
    elif example_outputs and len(example_outputs) == 1:
        reference_key = list(example_outputs)[0]
    else:
        reference_key = None
    return reference_key


def _construct_run_evaluator(
    eval_config: Union[
        smith_eval_config.SINGLE_EVAL_CONFIG_TYPE,
        smith_eval_config.CUSTOM_EVALUATOR_TYPE,
    ],
    eval_llm: Optional[BaseLanguageModel],
    run_type: str,
    data_type: DataType,
    example_outputs: Optional[List[str]],
    reference_key: Optional[str],
    input_key: Optional[str],
    prediction_key: Optional[str],
) -> RunEvaluator:
    if isinstance(eval_config, RunEvaluator):
        return eval_config
    if isinstance(eval_config, (EvaluatorType, str)):
        if not isinstance(eval_config, EvaluatorType):
            eval_config = EvaluatorType(eval_config)
        evaluator_ = load_evaluator(eval_config, llm=eval_llm)
        eval_type_tag = eval_config.value
    elif isinstance(eval_config, smith_eval_config.EvalConfig):
        kwargs = {"llm": eval_llm, **eval_config.get_kwargs()}
        evaluator_ = load_evaluator(eval_config.evaluator_type, **kwargs)
        eval_type_tag = eval_config.evaluator_type.value
        # Override keys if specified in the config
        if isinstance(eval_config, smith_eval_config.SingleKeyEvalConfig):
            input_key = eval_config.input_key or input_key
            prediction_key = eval_config.prediction_key or prediction_key
            reference_key = eval_config.reference_key or reference_key
    elif callable(eval_config):
        # Assume we can decorate it as a run evaluator
        return run_evaluator_dec(eval_config)
    else:
        raise ValueError(f"Unknown evaluator type: {type(eval_config)}")

    if isinstance(evaluator_, StringEvaluator):
        if evaluator_.requires_reference and reference_key is None:
            raise ValueError(
                "Must specify reference_key in smith_eval.RunEvalConfig to use"
                f" evaluator of type {eval_type_tag} with"
                f" dataset with multiple output keys: {example_outputs}."
            )
        run_evaluator = smith_eval.StringRunEvaluatorChain.from_run_and_data_type(
            evaluator_,
            run_type,
            data_type,
            input_key=input_key,
            prediction_key=prediction_key,
            reference_key=reference_key,
            tags=[eval_type_tag],
        )
    elif isinstance(evaluator_, PairwiseStringEvaluator):
        raise NotImplementedError(
            f"Run evaluator for {eval_type_tag} is not implemented."
            " PairwiseStringEvaluators compare the outputs of two different"
            " models rather than the output of a single model."
            " Did you mean to use a StringEvaluator instead?"
            "\nSee: https://python.langchain.com/docs/guides/evaluation/string/"
        )
    else:
        raise NotImplementedError(
            f"Run evaluator for {eval_type_tag} is not implemented"
        )
    return run_evaluator


def _get_keys(
    config: smith_eval.RunEvalConfig,
    run_inputs: Optional[List[str]],
    run_outputs: Optional[List[str]],
    example_outputs: Optional[List[str]],
) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    input_key = _determine_input_key(config, run_inputs)
    prediction_key = _determine_prediction_key(config, run_outputs)
    reference_key = _determine_reference_key(config, example_outputs)
    return input_key, prediction_key, reference_key


def _load_run_evaluators(
    config: smith_eval.RunEvalConfig,
    run_type: str,
    data_type: DataType,
    example_outputs: Optional[List[str]],
    run_inputs: Optional[List[str]],
    run_outputs: Optional[List[str]],
) -> List[RunEvaluator]:
    """
    Load run evaluators from a configuration.

    Args:
        config: Configuration for the run evaluators.

    Returns:
        A list of run evaluators.
    """
    run_evaluators = []
    input_key, prediction_key, reference_key = None, None, None
    if config.evaluators or any(
        [isinstance(e, StringEvaluator) for e in config.custom_evaluators]
    ):
        input_key, prediction_key, reference_key = _get_keys(
            config, run_inputs, run_outputs, example_outputs
        )
    for eval_config in config.evaluators:
        run_evaluator = _construct_run_evaluator(
            eval_config,
            config.eval_llm,
            run_type,
            data_type,
            example_outputs,
            reference_key,
            input_key,
            prediction_key,
        )
        run_evaluators.append(run_evaluator)
    custom_evaluators = config.custom_evaluators or []
    for custom_evaluator in custom_evaluators:
        if isinstance(custom_evaluator, RunEvaluator):
            run_evaluators.append(custom_evaluator)
        elif isinstance(custom_evaluator, StringEvaluator):
            run_evaluators.append(
                smith_eval.StringRunEvaluatorChain.from_run_and_data_type(
                    custom_evaluator,
                    run_type,
                    data_type,
                    input_key=input_key,
                    prediction_key=prediction_key,
                    reference_key=reference_key,
                )
            )
        elif callable(custom_evaluator):
            run_evaluators.append(run_evaluator_dec(custom_evaluator))
        else:
            raise ValueError(
                f"Unsupported custom evaluator: {custom_evaluator}."
                " Expected RunEvaluator or StringEvaluator."
            )

    return run_evaluators


async def _arun_llm(
    llm: BaseLanguageModel,
    inputs: Dict[str, Any],
    *,
    tags: Optional[List[str]] = None,
    callbacks: Callbacks = None,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Union[str, BaseMessage]:
    """Asynchronously run the language model.

    Args:
        llm: The language model to run.
        inputs: The input dictionary.
        tags: Optional tags to add to the run.
        callbacks: Optional callbacks to use during the run.
        input_mapper: Optional function to map inputs to the expected format.

    Returns:
        The LLMResult or ChatResult.
    Raises:
        ValueError: If the LLM type is unsupported.
        InputFormatError: If the input format is invalid.
    """
    if input_mapper is not None:
        prompt_or_messages = input_mapper(inputs)
        if isinstance(prompt_or_messages, str) or (
            isinstance(prompt_or_messages, list)
            and all(isinstance(msg, BaseMessage) for msg in prompt_or_messages)
        ):
            return await llm.ainvoke(
                prompt_or_messages,
                config=RunnableConfig(
                    callbacks=callbacks, tags=tags or [], metadata=metadata or {}
                ),
            )
        else:
            raise InputFormatError(
                "Input mapper returned invalid format"
                f" {prompt_or_messages}"
                "\nExpected a single string or list of chat messages."
            )
    else:
        try:
            prompt = _get_prompt(inputs)
            llm_output: Union[str, BaseMessage] = await llm.ainvoke(
                prompt,
                config=RunnableConfig(
                    callbacks=callbacks, tags=tags or [], metadata=metadata or {}
                ),
            )
        except InputFormatError:
            llm_inputs = _get_messages(inputs)
            llm_output = await llm.ainvoke(
                **llm_inputs,
                config=RunnableConfig(
                    callbacks=callbacks, tags=tags or [], metadata=metadata or {}
                ),
            )
    return llm_output


async def _arun_chain(
    chain: Union[Chain, Runnable],
    inputs: Dict[str, Any],
    callbacks: Callbacks,
    *,
    tags: Optional[List[str]] = None,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Union[dict, str]:
    """Run a chain asynchronously on inputs."""
    inputs_ = inputs if input_mapper is None else input_mapper(inputs)
    if (
        isinstance(chain, Chain)
        and isinstance(inputs_, dict)
        and len(inputs_) == 1
        and chain.input_keys
    ):
        val = next(iter(inputs_.values()))
        output = await chain.ainvoke(
            val,
            config=RunnableConfig(
                callbacks=callbacks, tags=tags or [], metadata=metadata or {}
            ),
        )
    else:
        runnable_config = RunnableConfig(
            tags=tags or [], callbacks=callbacks, metadata=metadata or {}
        )
        output = await chain.ainvoke(inputs_, config=runnable_config)
    return output


async def _arun_llm_or_chain(
    example: Example,
    config: RunnableConfig,
    *,
    llm_or_chain_factory: MCF,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Union[dict, str, LLMResult, ChatResult]:
    """Asynchronously run the Chain or language model.

    Args:
        example: The example to run.
        llm_or_chain_factory: The Chain or language model constructor to run.
        tags: Optional tags to add to the run.
        callbacks: Optional callbacks to use during the run.
        input_mapper: Optional function to map the input to the expected format.

    Returns:
        A list of outputs.
    """
    chain_or_llm = (
        "LLM" if isinstance(llm_or_chain_factory, BaseLanguageModel) else "Chain"
    )
    result = None
    try:
        if isinstance(llm_or_chain_factory, BaseLanguageModel):
            output: Any = await _arun_llm(
                llm_or_chain_factory,
                example.inputs,
                tags=config["tags"],
                callbacks=config["callbacks"],
                input_mapper=input_mapper,
                metadata=config.get("metadata"),
            )
        else:
            chain = llm_or_chain_factory()
            output = await _arun_chain(
                chain,
                example.inputs,
                tags=config["tags"],
                callbacks=config["callbacks"],
                input_mapper=input_mapper,
                metadata=config.get("metadata"),
            )
        result = output
    except Exception as e:
        logger.warning(
            f"{chain_or_llm} failed for example {example.id} "
            f"with inputs {example.inputs}"
            f"\n{repr(e)}"
        )
        result = EvalError(Error=e)
    return result


def _run_llm(
    llm: BaseLanguageModel,
    inputs: Dict[str, Any],
    callbacks: Callbacks,
    *,
    tags: Optional[List[str]] = None,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Union[str, BaseMessage]:
    """
    Run the language model on the example.

    Args:
        llm: The language model to run.
        inputs: The input dictionary.
        callbacks: The callbacks to use during the run.
        tags: Optional tags to add to the run.
        input_mapper: function to map to the inputs dictionary from an Example
    Returns:
        The LLMResult or ChatResult.
    Raises:
        ValueError: If the LLM type is unsupported.
        InputFormatError: If the input format is invalid.
    """
    if input_mapper is not None:
        prompt_or_messages = input_mapper(inputs)
        if isinstance(prompt_or_messages, str) or (
            isinstance(prompt_or_messages, list)
            and all(isinstance(msg, BaseMessage) for msg in prompt_or_messages)
        ):
            llm_output: Union[str, BaseMessage] = llm.invoke(
                prompt_or_messages,
                config=RunnableConfig(
                    callbacks=callbacks, tags=tags or [], metadata=metadata or {}
                ),
            )
        else:
            raise InputFormatError(
                "Input mapper returned invalid format: "
                f" {prompt_or_messages}"
                "\nExpected a single string or list of chat messages."
            )
    else:
        try:
            llm_prompts = _get_prompt(inputs)
            llm_output = llm.invoke(
                llm_prompts,
                config=RunnableConfig(
                    callbacks=callbacks, tags=tags or [], metadata=metadata or {}
                ),
            )
        except InputFormatError:
            llm_inputs = _get_messages(inputs)
            llm_output = llm.invoke(
                **llm_inputs,
                config=RunnableConfig(
                    callbacks=callbacks, tags=tags or [], metadata=metadata or {}
                ),
            )
    return llm_output


def _run_chain(
    chain: Union[Chain, Runnable],
    inputs: Dict[str, Any],
    callbacks: Callbacks,
    *,
    tags: Optional[List[str]] = None,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Union[Dict, str]:
    """Run a chain on inputs."""
    inputs_ = inputs if input_mapper is None else input_mapper(inputs)
    if (
        isinstance(chain, Chain)
        and isinstance(inputs_, dict)
        and len(inputs_) == 1
        and chain.input_keys
    ):
        val = next(iter(inputs_.values()))
        output = chain.invoke(
            val,
            config=RunnableConfig(
                callbacks=callbacks, tags=tags or [], metadata=metadata or {}
            ),
        )
    else:
        runnable_config = RunnableConfig(
            tags=tags or [], callbacks=callbacks, metadata=metadata or {}
        )
        output = chain.invoke(inputs_, config=runnable_config)
    return output


def _run_llm_or_chain(
    example: Example,
    config: RunnableConfig,
    *,
    llm_or_chain_factory: MCF,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Union[dict, str, LLMResult, ChatResult]:
    """
    Run the Chain or language model synchronously.

    Args:
        example: The example to run.
        llm_or_chain_factory: The Chain or language model constructor to run.
        tags: Optional tags to add to the run.
        callbacks: Optional callbacks to use during the run.

    Returns:
        Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
          The outputs of the model or chain.
    r!  r.   Nr  r   r  r  r"  r#  z
Error Type: z, Message: r%  )ri   r   r-  r   rh   r/  r&  r   r7   r   r   r'  rv   )
r   r   r   r   r)  rs   rC   r   ro   
error_typer;   r;   r<   _run_llm_or_chain  sN   
		
r1  clientr   project_nameproject_metadatadataset_versionOptional[Union[str, datetime]]1Tuple[MCF, TracerSession, Dataset, List[Example]]c              
   C  sb  t ||}| j|d}t| j|j|d}	|	std| ddd |	D }
|
r,t|
nd }|r4| nd }z'|p:i }t }|rHi |d|i}||d< | j	||j|rWd	|ini |d
}W n1 t
ttfy } z"dt|vrp|t }d| d| d| d}td| d| d }~ww |jd|j  }td| d| d| d|j dd ||||	fS )N)r   )
dataset_idas_ofzDataset z has no example rows.c                 S  s   g | ]}|j r|j qS r;   )modified_at)rF   exr;   r;   r<   rH     s    z%_prepare_eval_run.<locals>.<listcomp>gitr5  r   )reference_dataset_idproject_extrar  zalready exists z+
run_on_dataset(
    ...
    project_name="{project_name} - {uid}", # Update since {project_name} already exists
)
"""
        raise ValueError(
            f"Test project {project_name} already exists. Please use a different name:"
            f"\n\n{example_msg}"
        )
    comparison_url = dataset.url + f"/compare?selectedSessions={project.id}"
    print(
        f"View the evaluation results for project '{project_name}'"
        f" at:\n{comparison_url}\n\n"
        f"View all tests for Dataset {dataset_name} at:\n{dataset.url}",
        flush=True,
    )
    return wrapped_model, project, dataset, examples


class _RowResult(TypedDict, total=False):
    """A dictionary of the results for a single example row."""

    feedback: Optional[List[EvaluationResult]]
    execution_time: Optional[float]
    run_id: Optional[str]


@dataclasses.dataclass
class _DatasetRunContainer:
    """A container to help manage the state of a eval run."""

    client: Client
    project: TracerSession
    wrapped_model: MCF
    examples: List[Example]
    configs: List[RunnableConfig]
    batch_evaluators: Optional[List[smith_eval_config.BATCH_EVALUATOR_LIKE]] = None

    def _merge_test_outputs(
        self,
        batch_results: list,
        all_eval_results: Dict[str, _RowResult],
    ) -> dict:
        results: dict = {}
        for example, output in zip(self.examples, batch_results):
            row_result = cast(_RowResult, all_eval_results.get(str(example.id), {}))
            results[str(example.id)] = {
                "input": example.inputs,
                "feedback": row_result.get("feedback", []),
                "execution_time": row_result.get("execution_time"),
                "run_id": row_result.get("run_id"),
            }
            if isinstance(output, EvalError):
                results[str(example.id)]["Error"] = output.Error
            else:
                results[str(example.id)]["output"] = output
            if example.outputs:
                results[str(example.id)]["reference"] = example.outputs
        return results

    def _run_batch_evaluators(self, runs: Dict[str, Run]) -> List[dict]:
        evaluators = self.batch_evaluators
        if not evaluators:
            return []
        runs_list = [runs[str(example.id)] for example in self.examples]
        aggregate_feedback = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for evaluator in evaluators:
                try:
                    result = evaluator(runs_list, self.examples)
                    if isinstance(result, EvaluationResult):
                        result = result.dict()
                    aggregate_feedback.append(cast(dict, result))
                    executor.submit(
                        self.client.create_feedback,
                        **result,
                        run_id=None,
                        project_id=self.project.id,
                    )
                except Exception as e:
                    logger.error(f"Error running batch evaluator {repr(evaluator)}: {e}")
        return aggregate_feedback

    def _collect_metrics(self) -> Tuple[Dict[str, _RowResult], Dict[str, Run]]:
        all_eval_results: dict = {}
        all_runs: dict = {}
        for c in self.configs:
            for callback in cast(list, c["callbacks"]):
                if isinstance(callback, EvaluatorCallbackHandler):
                    eval_results = callback.logged_eval_results
                    for (run_id, _), v in eval_results.items():
                        all_eval_results.setdefault(str(run_id), {}).update(
                            {"feedback": v}
                        )
                elif isinstance(callback, LangChainTracer):
                    run = callback.latest_run
                    execution_time = (
                        (run.end_time - run.start_time).total_seconds()
                        if run and run.end_time
                        else None
                    )
                    run_id = str(run.id) if run else None
                    all_eval_results.setdefault(str(callback.example_id), {}).update(
                        {
                            "execution_time": execution_time,
                            "run_id": run_id,
                            "run": run,
                        }
                    )
                    all_runs[str(callback.example_id)] = run
        return cast(Dict[str, _RowResult], all_eval_results), all_runs

    def _collect_test_results(
        self,
        batch_results: List[Union[dict, str, LLMResult, ChatResult]],
    ) -> TestResult:
        logger.info("Waiting for evaluators to complete.")
        wait_for_all_evaluators()
        all_eval_results, all_runs = self._collect_metrics()
        aggregate_feedback = None
        if self.batch_evaluators:
            logger.info("Running session evaluators.")
            aggregate_feedback = self._run_batch_evaluators(all_runs)
        results = self._merge_test_outputs(batch_results, all_eval_results)
        return TestResult(
            project_name=self.project.name,
            results=results,
            aggregate_metrics=aggregate_feedback,
        )

    def finish(self, batch_results: list, verbose: bool = False) -> TestResult:
        results = self._collect_test_results(batch_results)
        if verbose:
            try:
                agg_feedback = results.get_aggregate_feedback()
                _display_aggregate_results(agg_feedback)
            except Exception as e:
                logger.debug(f"Failed to print aggregate feedback: {repr(e)}")
        try:
            # Closing the project permits name changing and metric optimizations
            self.client.update_project(
                self.project.id, end_time=datetime.now(timezone.utc)
            )
        except Exception as e:
            logger.debug(f"Failed to close project: {repr(e)}")
        return results

    @classmethod
    def prepare(
        cls,
        client: Client,
        dataset_name: str,
        llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
        project_name: Optional[str],
        evaluation: Optional[smith_eval.RunEvalConfig] = None,
        tags: Optional[List[str]] = None,
        input_mapper: Optional[Callable[[Dict], Any]] = None,
        concurrency_level: int = 5,
        project_metadata: Optional[Dict[str, Any]] = None,
        revision_id: Optional[str] = None,
        dataset_version: Optional[Union[datetime, str]] = None,
    ) -> _DatasetRunContainer:
        project_name = project_name or name_generation.random_name()
        if revision_id:
            if not project_metadata:
                project_metadata = {}
            project_metadata.update({"revision_id": revision_id})
        wrapped_model, project, dataset, examples = _prepare_eval_run(
            client,
            dataset_name,
            llm_or_chain_factory,
            project_name,
            project_metadata=project_metadata,
            tags=tags,
            dataset_version=dataset_version,
        )
        tags = tags or []
        for k, v in (project.metadata.get("git") or {}).items():
            tags.append(f"git:{k}={v}")
        run_metadata = {"dataset_version": project.metadata["dataset_version"]}
        if revision_id:
            run_metadata["revision_id"] = revision_id
        wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory)
        run_evaluators = _setup_evaluation(
            wrapped_model, examples, evaluation, dataset.data_type or DataType.kv
        )
        _validate_example_inputs(examples[0], wrapped_model, input_mapper)
        progress_bar = progress.ProgressBarCallback(len(examples))
        configs = [
            RunnableConfig(
                callbacks=[
                    LangChainTracer(
                        project_name=project.name,
                        client=client,
                        example_id=example.id,
                    ),
                    EvaluatorCallbackHandler(
                        evaluators=run_evaluators or [],
                        client=client,
                        example_id=example.id,
                        max_concurrency=0,
                    ),
                    progress_bar,
                ],
                tags=tags,
                max_concurrency=concurrency_level,
                metadata=run_metadata,
            )
            for example in examples
        ]
        return cls(
            client=client,
            project=project,
            wrapped_model=wrapped_model,
            examples=examples,
            configs=configs,
            batch_evaluators=evaluation.batch_evaluators if evaluation else None,
        )


def _is_jupyter_environment() -> bool:
    try:
        from IPython import get_ipython

        res = get_ipython()
        return get_ipython() is not None and "zmqshell" in str(type(res))
    except ImportError:
        return False


def _display_aggregate_results(aggregate_results: pd.DataFrame) -> None:
    if _is_jupyter_environment():
        from IPython.display import HTML, display

        display(HTML("<h3>Experiment Results:</h3>"))
        display(aggregate_results)
    else:
        formatted_string = aggregate_results.to_string(
            float_format=lambda x: f"{x:.2f}", justify="right"
        )
        print("\n Experiment Results:")
        print(formatted_string)


_INPUT_MAPPER_DEP_WARNING = (
    "The input_mapper argument is deprecated and will be removed in a future"
    " release. Please add a RunnableLambda to your chain to map inputs to the"
    " expected format instead. Example:\n"
    "def construct_chain():\n"
    "    my_chain = ...\n"
    "    input_mapper = {'other_key': 'MyOtherInput', 'my_input_key': x}\n"
    "    return input_mapper | my_chain\n"
    "run_on_dataset(..., llm_or_chain_factory=construct_chain)\n"
    "(See https://api.python.langchain.com/en/latest/schema/"
    "langchain.schema.runnable.base.RunnableLambda.html)"
)


async def arun_on_dataset(
    client: Optional[Client],
    dataset_name: str,
    llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
    *,
    evaluation: Optional[smith_eval.RunEvalConfig] = None,
    dataset_version: Optional[Union[datetime, str]] = None,
    concurrency_level: int = 5,
    project_name: Optional[str] = None,
    project_metadata: Optional[Dict[str, Any]] = None,
    verbose: bool = False,
    revision_id: Optional[str] = None,
    **kwargs: Any,
) -> Dict[str, Any]:
    input_mapper = kwargs.pop("input_mapper", None)
    if input_mapper:
        warn_deprecated("0.0.305", message=_INPUT_MAPPER_DEP_WARNING, pending=True)
    if revision_id is None:
        revision_id = get_langchain_env_var_metadata().get("revision_id")
    tags = kwargs.pop("tags", None)
    if tags:
        warn_deprecated(
            "0.1.9",
            message="The tags argument is deprecated and will be"
            " removed in a future release. Please specify project_metadata instead.",
            pending=True,
        )
    if kwargs:
        warn_deprecated(
            "0.0.305",
            message="The following arguments are deprecated and "
            "will be removed in a future release: "
            f"{kwargs.keys()}.",
            removal="0.0.305",
        )
    client = client or Client()
    container = _DatasetRunContainer.prepare(
        client,
        dataset_name,
        llm_or_chain_factory,
        project_name,
        evaluation,
        tags,
        input_mapper,
        concurrency_level,
        project_metadata=project_metadata,
        revision_id=revision_id,
        dataset_version=dataset_version,
    )
    batch_results = await runnable_utils.gather_with_concurrency(
        container.configs[0].get("max_concurrency"),
        *map(
            functools.partial(
                _arun_llm_or_chain,
                llm_or_chain_factory=container.wrapped_model,
                input_mapper=input_mapper,
            ),
            container.examples,
            container.configs,
        ),
    )
    return container.finish(batch_results, verbose=verbose)


def run_on_dataset(
    client: Optional[Client],
    dataset_name: str,
    llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
    *,
    evaluation: Optional[smith_eval.RunEvalConfig] = None,
    dataset_version: Optional[Union[datetime, str]] = None,
    concurrency_level: int = 5,
    project_name: Optional[str] = None,
    project_metadata: Optional[Dict[str, Any]] = None,
    verbose: bool = False,
    revision_id: Optional[str] = None,
    **kwargs: Any,
) -> Dict[str, Any]:
    input_mapper = kwargs.pop("input_mapper", None)
    if input_mapper:
        warn_deprecated("0.0.305", message=_INPUT_MAPPER_DEP_WARNING, pending=True)
    tags = kwargs.pop("tags", None)
    if tags:
        warn_deprecated(
            "0.1.9",
            message="The tags argument is deprecated and will be"
            " removed in a future release. Please specify project_metadata instead.",
            pending=True,
        )
    if revision_id is None:
        revision_id = get_langchain_env_var_metadata().get("revision_id")
    if kwargs:
        warn_deprecated(
            "0.0.305",
            message="The following arguments are deprecated and "
            "will be removed in a future release: "
            f"{kwargs.keys()}.",
            removal="0.0.305",
        )
    client = client or Client()
    container = _DatasetRunContainer.prepare(
        client,
        dataset_name,
        llm_or_chain_factory,
        project_name,
        evaluation,
        tags,
        input_mapper,
        concurrency_level,
        project_metadata=project_metadata,
        revision_id=revision_id,
        dataset_version=dataset_version,
    )
    if concurrency_level == 0:
        batch_results = [
            _run_llm_or_chain(
                example,
                config,
                llm_or_chain_factory=container.wrapped_model,
                input_mapper=input_mapper,
            )
            for example, config in zip(container.examples, container.configs)
        ]
    else:
        with runnable_config.get_executor_for_config(container.configs[0]) as executor:
            batch_results = list(
                executor.map(
                    functools.partial(
                        _run_llm_or_chain,
                        llm_or_chain_factory=container.wrapped_model,
                        input_mapper=input_mapper,
                    ),
                    container.examples,
                    container.configs,
                )
            )
    return container.finish(batch_results, verbose=verbose)


_RUN_ON_DATASET_DOCSTRING = """
to the specified project name.

Args:
    dataset_name: Name of the dataset to run the chain on.
    llm_or_chain_factory: Language model or Chain constructor to run
        over the dataset. The Chain constructor is used to permit
        independent calls on each example without carrying over state.
    evaluation: Configuration for evaluators to run on the
        results of the chain
    concurrency_level: The number of async tasks to run concurrently.
    project_name: Name of the project to store the traces in.
        Defaults to {dataset_name}-{chain class name}-{datetime}.
    project_metadata: Optional metadata to add to the project.
        Useful for storing information the test variant.
        (prompt version, model version, etc.)
    client: LangSmith client to use to access the dataset and to
        log feedback and run traces.
    verbose: Whether to print progress.
    tags: Tags to add to each run in the project.
    revision_id: Optional revision identifier to assign this test run to
        track the performance of different versions of your system.
Returns:
    A dictionary containing the run's project name and the resulting model outputs.


For the (usually faster) async version of this function, see :func:`arun_on_dataset`.

Examples
--------

.. code-block:: python

    from langsmith import Client
    from langchain_openai import ChatOpenAI
    from langchain.chains import LLMChain
    from langchain.smith import RunEvalConfig, run_on_dataset

    # Chains may have memory. Passing in a constructor function lets the
    # evaluation framework avoid cross-contamination between runs.
    def construct_chain():
        llm = ChatOpenAI(temperature=0)
        chain = LLMChain.from_string(
            llm,
            "What's the answer to {your_input_key}"
        )
        return chain

    # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
    evaluation_config = RunEvalConfig(
        evaluators=[
            "qa",  # "Correctness" against a reference answer
            "embedding_distance",
            smith_eval.RunEvalConfig.Criteria("helpfulness"),
            smith_eval.RunEvalConfig.Criteria({
                "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
            }),
        ]
    )

    client = Client()
    run_on_dataset(
        client,
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,
        evaluation=evaluation_config,
    )

You can also create custom evaluators by subclassing the
:class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
or LangSmith's `RunEvaluator` classes.

.. code-block:: python

    from typing import Optional
    from langchain.evaluation import StringEvaluator

    class MyStringEvaluator(StringEvaluator):

        @property
        def requires_input(self) -> bool:
            return False

        @property
        def requires_reference(self) -> bool:
            return True

        @property
        def evaluation_name(self) -> str:
            return "exact_match"

        def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
            return {"score": prediction == reference}


    evaluation_config = RunEvalConfig(
        custom_evaluators = [MyStringEvaluator()],
    )

    run_on_dataset(
        client,
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,
        evaluation=evaluation_config,
    )
"""

run_on_dataset.__doc__ = _RUN_ON_DATASET_DOCSTRING
arun_on_dataset.__doc__ = _RUN_ON_DATASET_DOCSTRING.replace(
    "run_on_dataset(", "await arun_on_dataset("
)