LLM_only_EVAL.py
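"""Pairwise LLM evaluation of analysis insights generated with and without skills.

For every dataset ID present in both result directories, this script loads the final
insight and plot from the "with skills" run and the "without skills" run, asks GPT-4o
to compare them on six criteria (depth of analysis, relevance to goal, persona
consistency, coherence, adequacy of the answer, and plot conclusion), and saves the
raw model response under results/evaluations/GPTEval/pilot/<dataset_id>/.
"""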
import json
import os
import base64
import multiprocessing

from openai import OpenAI

client = OpenAI()


def encode_image(image_path):
    """Read an image file and return its contents as a base64-encoded string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
def evaluate_insights_comparison(
    dataset_id,
    with_skills_hash,
    without_skills_hash,
    persona="You are a helpful AI assistant",
):
    """
    Compare the insights from two different runs (with and without skills) for a given
    dataset and ask an LLM to judge them against the stated goal.

    Args:
        dataset_id (str): The dataset identifier
        with_skills_hash (str): Hash identifying the run with skills
        without_skills_hash (str): Hash identifying the run without skills
        persona (str): The persona to use for evaluation

    Returns:
        tuple: (llm_response_or_error_message, error_flag), where error_flag is True
        if an input file could not be read or the LLM call failed
    """
    # Read the analysis goal for this dataset
    goal_path = f"data/jsons/{dataset_id}/goal.json"
    try:
        with open(goal_path, "r") as f:
            goal_data = json.load(f)
            goal = goal_data.get("goal", "")
    except Exception as e:
        return str(e), True
    # Read the insight and plot produced with skills
    with_skills_path = (
        f"results/insights_w_skills/{with_skills_hash}/{dataset_id}/final_insight.txt"
    )
    try:
        with open(with_skills_path, "r") as f:
            with_skills_insight = f.read().strip()
    except Exception as e:
        return str(e), True

    with_skills_image_path = (
        f"results/insights_w_skills/{with_skills_hash}/{dataset_id}/plot.jpeg"
    )
    try:
        with_skills_image = encode_image(with_skills_image_path)
    except Exception as e:
        return str(e), True

    # Read the insight and plot produced without skills
    without_skills_path = (
        f"results/insights_wo_skills/{without_skills_hash}/{dataset_id}/final_insight.txt"
    )
    try:
        with open(without_skills_path, "r") as f:
            without_skills_insight = f.read().strip()
    except Exception as e:
        return str(e), True

    without_skills_image_path = (
        f"results/insights_wo_skills/{without_skills_hash}/{dataset_id}/plot.jpeg"
    )
    try:
        without_skills_image = encode_image(without_skills_image_path)
    except Exception as e:
        return str(e), True
    # Construct the evaluation prompt
    evaluation_prompt = f"""{persona}
You are given two insights and their respective plots, Insight A and Insight B, generated by two different methods in response to an analytics question. Analyze the following insights and determine which one is better based on the given criteria.

Criteria:
1. Depth of Analysis: Evaluate the extent to which each insight delves into the details of the data, explores multiple factors, and provides a comprehensive understanding. Consider the complexity and sophistication of the analysis methods used in each insight. Also, assess whether the insights provide a nuanced understanding of the data, explore underlying patterns, or reveal unexpected findings.
2. Relevance to Goal: Assess how directly each insight addresses the stated goal. Evaluate how well each insight aligns with the goal and whether it provides actionable recommendations or strategies that directly contribute to achieving it.
3. Persona Consistency: Consider how well each insight aligns with the persona's values, goals, and characteristics. Evaluate whether the tone, language, and approach used in each insight align with the persona's stated experience and expertise. Also, assess whether the insights are engaging and relatable to the persona.
4. Coherence: Evaluate how coherent and cohesive the analysis is. Assess whether the insight presents information in a logical flow, makes clear connections between points, and avoids unnecessary jargon or complexity.
5. Answers Question Adequately: Ensure that the insight fully answers the question, addressing all aspects and providing a comprehensive answer. Consider whether the insight goes beyond the scope of the question and offers additional relevant information that could be helpful to the user.
6. Plot Conclusion: Look for a clear and concise conclusion that summarizes the key points of the analysis and clearly states the final decision or recommendation. Evaluate whether the conclusion provides a satisfying or insightful end to the analysis, ties up all loose ends, and gives a sense of closure.

For each criterion, respond with "A is better", "B is better", "Tie", or "None".
Give the response in the form of a Python dictionary with keys depth_of_analysis, relevance_to_goal, persona_consistency, coherence, answers_question_adequately, plot_conclusion. Additionally, provide a brief explanation for each score, explaining why you chose that score and citing specific examples from the insights to support your decision.

sample_response: {{
  "depth_of_analysis": "A is better",
  "relevance_to_goal": "Tie",
  "persona_consistency": "Tie",
  "coherence": "Tie",
  "answers_question_adequately": "B is better",
  "plot_conclusion": "B is better",
  "depth_of_analysis_explanation": "Insight A provides more detailed statistical analysis with specific percentages and explores multiple factors affecting the outcome",
  "relevance_to_goal_explanation": "Both insights address the main objective equally well by identifying key patterns in the data",
  "persona_consistency_explanation": "Both insights maintain a consistent analytical tone appropriate for the target audience",
  "coherence_explanation": "Both insights present information in a logical flow with clear connections between points",
  "answers_question_adequately_explanation": "Insight B provides more comprehensive coverage of all aspects mentioned in the question",
  "plot_conclusion_explanation": "Insight B offers a more concise and clear summary of the key trends shown in the visualization"
}}

Goal:
{goal}

Persona: {persona}

Model A (With Skills):
Insight: {with_skills_insight}
Plot: Image 1

Model B (Without Skills):
Insight: {without_skills_insight}
Plot: Image 2
"""
    # Get the comparison from the LLM
    try:
        response = client.chat.completions.create(
            model="gpt-4o",  # or your preferred model
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": evaluation_prompt},
                    {"type": "image_url",
                     "image_url": {"url": f"data:image/jpeg;base64,{with_skills_image}"}},
                    {"type": "image_url",
                     "image_url": {"url": f"data:image/jpeg;base64,{without_skills_image}"}},
                ],
            }],
            temperature=0.0,
        )
        llm_response = response.choices[0].message.content
    except Exception as e:
        # Report the failure through the normal error path so one failed call
        # does not abort the whole multiprocessing run
        return f"Error getting LLM response: {str(e)}", True

    return llm_response, False
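
# A minimal, optional sketch (not part of the original pipeline) of how the
# dictionary-formatted response requested by the evaluation prompt could be parsed
# downstream. The function name `parse_evaluation_response` is illustrative; it
# assumes the model returns a Python/JSON-style dict, possibly wrapped in a
# markdown code fence.
def parse_evaluation_response(raw_response):
    """Best-effort parse of the scoring dictionary returned by the evaluator LLM."""
    import ast
    import re

    # Strip a surrounding ``` fence if the model added one
    text = re.sub(r"^```(?:python|json)?\s*|\s*```$", "", raw_response.strip())
    try:
        return ast.literal_eval(text)  # tolerates single- or double-quoted strings
    except (ValueError, SyntaxError):
        try:
            return json.loads(text)  # fall back to strict JSON
        except json.JSONDecodeError:
            return None  # caller decides how to handle an unparseable response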
def get_all_dataset_ids(with_skills_hash, without_skills_hash):
    """
    Get all dataset IDs that exist in both with_skills and without_skills directories.
    Only returns IDs that are purely numerical.
    """
    with_skills_path = os.path.join("results/insights_w_skills", with_skills_hash)
    without_skills_path = os.path.join("results/insights_wo_skills", without_skills_hash)

    # Get datasets that exist in both directories
    with_skills_datasets = (
        set(os.listdir(with_skills_path)) if os.path.exists(with_skills_path) else set()
    )
    without_skills_datasets = (
        set(os.listdir(without_skills_path))
        if os.path.exists(without_skills_path)
        else set()
    )
    common_datasets = with_skills_datasets.intersection(without_skills_datasets)

    # Filter for numerical-only IDs
    numerical_datasets = {
        dataset_id for dataset_id in common_datasets if dataset_id.isdigit()
    }
    return list(numerical_datasets)
def get_results(dataset_id, with_skills_hash, without_skills_hash):
    """Process one dataset: evaluate the insights and write the LLM response to disk."""
    print(f"\nProcessing dataset: {dataset_id}")

    # Call the evaluation function for the dataset
    llm_response, error = evaluate_insights_comparison(
        dataset_id, with_skills_hash, without_skills_hash
    )
    if error:
        print(f"There was an error processing {dataset_id}: {llm_response}")
        return (dataset_id, False)

    # Create the output directory if it doesn't exist
    output_dir = f"results/evaluations/GPTEval/pilot/{dataset_id}"
    os.makedirs(output_dir, exist_ok=True)

    # Save the LLM response to file
    llm_response_path = os.path.join(output_dir, "llm_response.txt")
    with open(llm_response_path, "w") as f:
        f.write(llm_response)
    print(f"LLM response saved to: {llm_response_path}")

    return (dataset_id, True)
def main():
    # Example usage: results/insights_w_skills/Superbatch1
    with_skills_hash = "Superbatch1"
    without_skills_hash = "Superbatch1"

    try:
        dataset_ids = get_all_dataset_ids(with_skills_hash, without_skills_hash)
        # dataset_ids = list(range(50, 52))
        if not dataset_ids:
            raise Exception("No matching datasets found in both directories")

        tasks = [
            (dataset_id, with_skills_hash, without_skills_hash)
            for dataset_id in dataset_ids
        ]
        # Evaluate datasets in parallel, one worker per CPU core
        with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
            results = pool.starmap(get_results, tasks)
    except Exception as e:
        print(f"Error: {str(e)}")


if __name__ == "__main__":
    main()