11. Groundedness (Hallucination) Assessment

Groundedness Evaluator

An evaluator that checks whether an answer is grounded in, i.e. supported by, a given context.

This evaluator can be used to detect hallucination in a RAG system's answers.

In this tutorial, we will look at how to evaluate groundedness using the Upstage Groundedness Checker and a custom-built Groundedness Checker.

# Installation
# !pip install -qU langsmith langchain-teddynote
# Manage the API key as an environment variable via a .env file
from dotenv import load_dotenv

# Load API KEY information
load_dotenv()
 True 
# Set up LangSmith tracking. https://smith.langchain.com
# !pip install -qU langchain-teddynote
from langchain_teddynote import logging

# Enter a project name.
logging.langsmith("CH16-Evaluations")
LangSmith tracking started. 
[Project name] 
CH16-Evaluations 

Define functions for RAG performance testing

We will create a RAG system to use for testing.
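The sketch below shows one minimal way to build such a pipeline. The PDF path, the model name, and the ask_question helper are illustrative assumptions, not the tutorial's exact code.

# A minimal RAG pipeline for testing (illustrative sketch; path and names are assumptions)
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load and split the source document (hypothetical path)
docs = PyMuPDFLoader("data/sample.pdf").load()
splits = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50).split_documents(docs)

# Build a FAISS retriever over the chunks
retriever = FAISS.from_documents(splits, OpenAIEmbeddings()).as_retriever()

# Answer strictly from the retrieved context
prompt = PromptTemplate.from_template(
    "Answer the question using only the context.\n\nContext:\n{context}\n\nQuestion: {question}"
)
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

def ask_question(inputs: dict) -> dict:
    # Retrieve context for the question, then generate an answer from it
    context = "\n\n".join(doc.page_content for doc in retriever.invoke(inputs["question"]))
    answer = (prompt | llm | StrOutputParser()).invoke(
        {"context": context, "question": inputs["question"]}
    )
    return {"context": context, "answer": answer}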

UpstageGroundednessCheck

To use Upstage's Groundedness Check feature, you must first be issued an Upstage API key.
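Once issued, the key can be set as an environment variable (UPSTAGE_API_KEY is the variable langchain_upstage reads), for example:

import os

# Set the Upstage API key (or put UPSTAGE_API_KEY in your .env file instead)
os.environ["UPSTAGE_API_KEY"] = "YOUR_UPSTAGE_API_KEY"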

Define the UpstageGroundednessCheck evaluator. It will later be passed to the evaluate function.
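A sketch of one such evaluator follows. The "answer" and "context" output keys are assumptions matching the ask_question helper above; the "grounded" / "notGrounded" / "notSure" labels are the values the checker returns.

from langchain_upstage import UpstageGroundednessCheck
from langsmith.schemas import Example, Run

upstage_groundedness_check = UpstageGroundednessCheck()

def upstage_groundedness_evaluator(run: Run, example: Example) -> dict:
    # Pull the generated answer and the retrieved context from the run outputs
    answer = run.outputs.get("answer", "")
    context = run.outputs.get("context", "")

    # The checker labels the pair "grounded", "notGrounded", or "notSure"
    result = upstage_groundedness_check.invoke({"context": context, "answer": answer})

    # Score 1 only when the answer is fully grounded in the context
    return {"key": "groundedness_score", "score": int(result == "grounded")}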

langchain_teddynote Groundedness Checker

Create a custom Groundedness Checker using an OpenAI model.
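One way to build it, following langchain_teddynote's published examples (treat the constructor arguments as assumptions if your version differs):

from langchain_openai import ChatOpenAI
from langchain_teddynote.evaluator import GroundednessChecker

# Create a custom groundedness checker backed by an OpenAI model
custom_groundedness_check = GroundednessChecker(
    llm=ChatOpenAI(model="gpt-4o-mini", temperature=0),
    target="retrieval-answer",  # compare the retrieved context against the answer
).create()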

Use the OpenAI-backed checker to evaluate groundedness.
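For example, checking a single context-answer pair (the sample strings are made up, and the .score attribute assumes the checker's yes/no output schema):

# Check one (context, answer) pair with the custom checker
request = {
    "context": "Teddy has worked at LangChain Inc. since 2021.",
    "answer": "Teddy is employed by LangChain Inc.",
}
result = custom_groundedness_check.invoke(request)
print(result.score)  # "yes" if grounded, "no" otherwise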

Run the Groundedness evaluation.
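A sketch of the evaluation run. The dataset name is a placeholder, and a wrapper around the custom checker could be appended to the evaluators list in the same way:

from langsmith.evaluation import evaluate

dataset_name = "RAG_EVAL_DATASET"  # placeholder: use your own LangSmith dataset

# Evaluate groundedness of the RAG answers over the dataset
experiment_results = evaluate(
    ask_question,
    data=dataset_name,
    evaluators=[upstage_groundedness_evaluator],
    experiment_prefix="GROUNDEDNESS-EVAL",
    metadata={"variant": "Upstage groundedness check"},
)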

Comprehensive evaluation of datasets using Summary Evaluators

This is useful when you want a single groundedness score over the entire dataset (the previous step evaluated each example individually).
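A summary evaluator receives all runs and examples at once and returns one aggregate score. A sketch, reusing the Upstage checker and placeholder dataset name from above:

from typing import List

from langsmith.evaluation import evaluate
from langsmith.schemas import Example, Run

def groundedness_summary_evaluator(runs: List[Run], examples: List[Example]) -> dict:
    # Score every run, then report the fraction of grounded answers
    scores = [
        int(
            upstage_groundedness_check.invoke(
                {"context": run.outputs["context"], "answer": run.outputs["answer"]}
            )
            == "grounded"
        )
        for run in runs
    ]
    return {"key": "groundedness_ratio", "score": sum(scores) / len(scores)}

experiment_results = evaluate(
    ask_question,
    data=dataset_name,
    summary_evaluators=[groundedness_summary_evaluator],
    experiment_prefix="GROUNDEDNESS-SUMMARY-EVAL",
)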
