13. Repeat evaluation

You can add repetitions to an experiment so that the same evaluation runs multiple times.

Repeating the evaluation is useful:

  • For larger evaluation sets

  • For chains that can generate variable responses

  • For evaluators that can produce variable scores (e.g. LLM-as-judge)
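To see why repetition helps with variable scores, here is a toy sketch (not part of the tutorial code) that simulates a noisy judge score and averages it over several repetitions; the function names and the noise parameters are illustrative assumptions.

```python
import random
import statistics

def noisy_judge_score(seed: int) -> float:
    # Simulate an LLM-as-judge score that varies from run to run.
    rng = random.Random(seed)
    return min(1.0, max(0.0, rng.gauss(0.7, 0.15)))

def repeated_score(num_repetitions: int) -> float:
    # Averaging over repetitions smooths out run-to-run variance,
    # which is the point of repeating an evaluation.
    scores = [noisy_judge_score(seed) for seed in range(num_repetitions)]
    return statistics.mean(scores)
```

A single run can land anywhere in the noise band, while the averaged score is a more stable estimate of the chain's true quality.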


# installation
# !pip install -qU langsmith langchain-teddynote
# Configuration file for managing API KEY as environment variable
from dotenv import load_dotenv

# Load API KEY information
load_dotenv()  # returns True when the .env file is found and loaded
# Set up LangSmith tracking. https://smith.langchain.com
# !pip install -qU langchain-teddynote
from langchain_teddynote import logging

# Enter a project name.
logging.langsmith("CH16-Evaluations")

Define functions for RAG performance testing
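The tutorial's test functions wrap a RAG chain as a LangSmith target: a callable that receives the dataset example's inputs as a dict and returns a dict of outputs for the evaluators to score. A minimal sketch of that shape, with `make_ask_question` and `EchoChain` as hypothetical names (the real tutorial would pass in an actual RAG chain):

```python
def make_ask_question(chain):
    # LangSmith targets receive the example inputs as a dict and must
    # return a dict of outputs for the evaluators to score.
    def ask_question(inputs: dict) -> dict:
        return {"answer": chain.invoke(inputs["question"])}
    return ask_question

class EchoChain:
    # Stand-in for a real RAG chain so the sketch runs without an LLM.
    def invoke(self, question: str) -> str:
        return f"Echo: {question}"

ask_question = make_ask_question(EchoChain())
```

The same wrapper works for any chain that exposes `invoke`, which is what lets the later sections swap the GPT-based chain for an Ollama-based one.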

Repeat evaluation for RAG using GPT model

Repeat evaluation for RAG using Ollama model
