LLM_only_EVAL.py
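"""Pairwise LLM evaluation of analysis insights generated with and without skills.

For every dataset ID present in both result directories, this script loads the final
insight and plot from the "with skills" run and the "without skills" run, asks GPT-4o
to compare them on six criteria (depth of analysis, relevance to goal, persona
consistency, coherence, adequacy of the answer, and plot conclusion), and saves the
raw model response under results/evaluations/GPTEval/pilot/<dataset_id>/.
"""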
import json
import os
import base64
import multiprocessing

from openai import OpenAI

client = OpenAI()


def encode_image(image_path):
    """Read an image file and return its contents as a base64-encoded string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
def evaluate_insights_comparison(
    dataset_id,
    with_skills_hash,
    without_skills_hash,
    persona="You are a helpful AI assistant",
):
    """
    Compare the insights from two different runs (with and without skills) for a given
    dataset and ask an LLM to judge them against the stated goal.

    Args:
        dataset_id (str): The dataset identifier
        with_skills_hash (str): Hash identifying the run with skills
        without_skills_hash (str): Hash identifying the run without skills
        persona (str): The persona to use for evaluation

    Returns:
        tuple: (llm_response_or_error_message, error_flag), where error_flag is True
        if an input file could not be read or the LLM call failed
    """
    # Read the analysis goal for this dataset
    goal_path = f"data/jsons/{dataset_id}/goal.json"
    try:
        with open(goal_path, "r") as f:
            goal_data = json.load(f)
            goal = goal_data.get("goal", "")
    except Exception as e:
        return str(e), True
    # Read the insight and plot produced with skills
    with_skills_path = (
        f"results/insights_w_skills/{with_skills_hash}/{dataset_id}/final_insight.txt"
    )
    try:
        with open(with_skills_path, "r") as f:
            with_skills_insight = f.read().strip()
    except Exception as e:
        return str(e), True

    with_skills_image_path = (
        f"results/insights_w_skills/{with_skills_hash}/{dataset_id}/plot.jpeg"
    )
    try:
        with_skills_image = encode_image(with_skills_image_path)
    except Exception as e:
        return str(e), True

    # Read the insight and plot produced without skills
    without_skills_path = (
        f"results/insights_wo_skills/{without_skills_hash}/{dataset_id}/final_insight.txt"
    )
    try:
        with open(without_skills_path, "r") as f:
            without_skills_insight = f.read().strip()
    except Exception as e:
        return str(e), True

    without_skills_image_path = (
        f"results/insights_wo_skills/{without_skills_hash}/{dataset_id}/plot.jpeg"
    )
    try:
        without_skills_image = encode_image(without_skills_image_path)
    except Exception as e:
        return str(e), True
    # Construct the evaluation prompt
    evaluation_prompt = f"""{persona}
You are given two insights and their respective plots, Insight A and Insight B, generated by two different methods in response to an analytics question. Analyze the following insights and determine which one is better based on the given criteria.

Criteria:
1. Depth of Analysis: Evaluate the extent to which each insight delves into the details of the data, explores multiple factors, and provides a comprehensive understanding. Consider the complexity and sophistication of the analysis methods used in each insight. Also, assess whether the insights provide a nuanced understanding of the data, explore underlying patterns, or reveal unexpected findings.
2. Relevance to Goal: Assess how directly each insight addresses the stated goal. Evaluate how well each insight aligns with the goal and whether it provides actionable recommendations or strategies that directly contribute to achieving it.
3. Persona Consistency: Consider how well each insight aligns with the persona's values, goals, and characteristics. Evaluate whether the tone, language, and approach used in each insight align with the persona's stated experience and expertise. Also, assess whether the insights are engaging and relatable to the persona.
4. Coherence: Evaluate how coherent and cohesive the analysis is. Assess whether the insight presents information in a logical flow, makes clear connections between points, and avoids unnecessary jargon or complexity.
5. Answers Question Adequately: Ensure that the insight fully answers the question, addressing all aspects and providing a comprehensive answer. Consider whether the insight goes beyond the scope of the question and offers additional relevant information that could be helpful to the user.
6. Plot Conclusion: Look for a clear and concise conclusion that summarizes the key points of the analysis and clearly states the final decision or recommendation. Evaluate whether the conclusion provides a satisfying or insightful end to the analysis, ties up all loose ends, and gives a sense of closure.

For each criterion, respond with "A is better", "B is better", "Tie", or "None".
Give the response in the form of a Python dictionary with keys depth_of_analysis, relevance_to_goal, persona_consistency, coherence, answers_question_adequately, plot_conclusion. Additionally, provide a brief explanation for each score, explaining why you chose that score and citing specific examples from the insights to support your decision.

sample_response: {{
  "depth_of_analysis": "A is better",
  "relevance_to_goal": "Tie",
  "persona_consistency": "Tie",
  "coherence": "Tie",
  "answers_question_adequately": "B is better",
  "plot_conclusion": "B is better",
  "depth_of_analysis_explanation": "Insight A provides more detailed statistical analysis with specific percentages and explores multiple factors affecting the outcome",
  "relevance_to_goal_explanation": "Both insights address the main objective equally well by identifying key patterns in the data",
  "persona_consistency_explanation": "Both insights maintain a consistent analytical tone appropriate for the target audience",
  "coherence_explanation": "Both insights present information in a logical flow with clear connections between points",
  "answers_question_adequately_explanation": "Insight B provides more comprehensive coverage of all aspects mentioned in the question",
  "plot_conclusion_explanation": "Insight B offers a more concise and clear summary of the key trends shown in the visualization"
}}

Goal:
{goal}

Persona: {persona}

Model A (With Skills):
Insight: {with_skills_insight}
Plot: Image 1

Model B (Without Skills):
Insight: {without_skills_insight}
Plot: Image 2
"""
    # Get the comparison from the LLM
    try:
        response = client.chat.completions.create(
            model="gpt-4o",  # or your preferred model
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": evaluation_prompt},
                    {"type": "image_url",
                     "image_url": {"url": f"data:image/jpeg;base64,{with_skills_image}"}},
                    {"type": "image_url",
                     "image_url": {"url": f"data:image/jpeg;base64,{without_skills_image}"}},
                ],
            }],
            temperature=0.0,
        )
        llm_response = response.choices[0].message.content
    except Exception as e:
        # Report the failure through the normal error path so one failed call
        # does not abort the whole multiprocessing run
        return f"Error getting LLM response: {str(e)}", True

    return llm_response, False
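
# A minimal, optional sketch (not part of the original pipeline) of how the
# dictionary-formatted response requested by the evaluation prompt could be parsed
# downstream. The function name `parse_evaluation_response` is illustrative; it
# assumes the model returns a Python/JSON-style dict, possibly wrapped in a
# markdown code fence.
def parse_evaluation_response(raw_response):
    """Best-effort parse of the scoring dictionary returned by the evaluator LLM."""
    import ast
    import re

    # Strip a surrounding ``` fence if the model added one
    text = re.sub(r"^```(?:python|json)?\s*|\s*```$", "", raw_response.strip())
    try:
        return ast.literal_eval(text)  # tolerates single- or double-quoted strings
    except (ValueError, SyntaxError):
        try:
            return json.loads(text)  # fall back to strict JSON
        except json.JSONDecodeError:
            return None  # caller decides how to handle an unparseable response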
def get_all_dataset_ids(with_skills_hash, without_skills_hash):
    """
    Get all dataset IDs that exist in both with_skills and without_skills directories.
    Only returns IDs that are purely numerical.
    """
    with_skills_path = os.path.join("results/insights_w_skills", with_skills_hash)
    without_skills_path = os.path.join("results/insights_wo_skills", without_skills_hash)

    # Get datasets that exist in both directories
    with_skills_datasets = (
        set(os.listdir(with_skills_path)) if os.path.exists(with_skills_path) else set()
    )
    without_skills_datasets = (
        set(os.listdir(without_skills_path))
        if os.path.exists(without_skills_path)
        else set()
    )
    common_datasets = with_skills_datasets.intersection(without_skills_datasets)

    # Filter for numerical-only IDs
    numerical_datasets = {
        dataset_id for dataset_id in common_datasets if dataset_id.isdigit()
    }
    return list(numerical_datasets)
def get_results(dataset_id, with_skills_hash, without_skills_hash):
    """Process one dataset: evaluate the insights and write the LLM response to disk."""
    print(f"\nProcessing dataset: {dataset_id}")

    # Call the evaluation function for the dataset
    llm_response, error = evaluate_insights_comparison(
        dataset_id, with_skills_hash, without_skills_hash
    )
    if error:
        print(f"There was an error processing {dataset_id}: {llm_response}")
        return (dataset_id, False)

    # Create the output directory if it doesn't exist
    output_dir = f"results/evaluations/GPTEval/pilot/{dataset_id}"
    os.makedirs(output_dir, exist_ok=True)

    # Save the LLM response to file
    llm_response_path = os.path.join(output_dir, "llm_response.txt")
    with open(llm_response_path, "w") as f:
        f.write(llm_response)
    print(f"LLM response saved to: {llm_response_path}")

    return (dataset_id, True)
def main():
    # Example usage: results/insights_w_skills/Superbatch1
    with_skills_hash = "Superbatch1"
    without_skills_hash = "Superbatch1"

    try:
        dataset_ids = get_all_dataset_ids(with_skills_hash, without_skills_hash)
        # dataset_ids = list(range(50, 52))
        if not dataset_ids:
            raise Exception("No matching datasets found in both directories")

        tasks = [
            (dataset_id, with_skills_hash, without_skills_hash)
            for dataset_id in dataset_ids
        ]
        # Evaluate datasets in parallel, one worker per CPU core
        with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
            results = pool.starmap(get_results, tasks)
    except Exception as e:
        print(f"Error: {str(e)}")


if __name__ == "__main__":
    main()