Commit 36b57ff

Athena: Implement verification tests for basic approach of module_text_llm (#113)
1 parent 867683f commit 36b57ff

5 files changed: +220 -11 lines changed


athena/scripts/test_modules.py

Lines changed: 45 additions & 11 deletions
@@ -1,9 +1,15 @@
 import subprocess
 import os
 import sys
+import argparse


 def main():
+    parser = argparse.ArgumentParser(description='Run tests for Athena modules')
+    parser.add_argument('--include-real', action='store_true',
+                        help='Include real tests in addition to mock tests')
+    args = parser.parse_args()
+
     modules = [
         "docs",
         "log_viewer",
@@ -43,19 +49,47 @@ def main():
         print(f"Using Python path: {python_path}")

         try:
-            # Install pytest in the virtual environment
-            print(f"Installing pytest for {module}...")
-            subprocess.run([pip_path, "install", "pytest"], check=True, capture_output=True, text=True)
+            # Install pytest and pytest-asyncio in the virtual environment
+            print(f"Installing pytest and pytest-asyncio for {module}...")
+            subprocess.run([pip_path, "install", "pytest", "pytest-asyncio"], check=True, capture_output=True, text=True)

-            # Run pytest using the module's virtual environment
-            result = subprocess.run([python_path, "-m", "pytest", test_dir], capture_output=True, text=True)
-            if result.returncode != 0:
-                print(f"Tests failed for {module}:")
-                print(result.stdout)
-                print(result.stderr)
-                success = False
+            # Run pytest using the module's virtual environment, only running tests from mock directories
+            mock_test_dir = os.path.join(test_dir, "mock")
+            if os.path.exists(mock_test_dir):
+                print(f"\nRunning mock tests for {module}...")
+                result = subprocess.run([python_path, "-m", "pytest", mock_test_dir, "-v"], check=False)
+                if result.returncode != 0:
+                    print(f"\nMock tests failed for {module}")
+                    success = False
+                else:
+                    print(f"\nMock tests passed for {module}")
             else:
-                print(f"Tests passed for {module}")
+                print(f"No mock tests found for {module}, skipping...")
+
+            # Run real tests if requested and if module has real tests
+            if args.include_real:
+                real_test_dir = os.path.join(test_dir, "real")
+                if os.path.exists(real_test_dir):
+                    # Change to the module directory for real tests
+                    module_dir = os.path.join(os.getcwd(), module)
+                    if os.path.exists(module_dir):
+                        original_dir = os.getcwd()
+                        os.chdir(module_dir)
+                        print(f"\nRunning real tests from {module_dir}...")
+                        # Run pytest with the real test directory as the test path
+                        result = subprocess.run([python_path, "-m", "pytest", '../../../'+real_test_dir, "-v"], check=False)
+                        if result.returncode != 0:
+                            print(f"\nReal tests failed for {module}")
+                            success = False
+                        else:
+                            print(f"\nReal tests passed for {module}")
+                        # Change back to original directory
+                        os.chdir(original_dir)
+                    else:
+                        print(f"Module directory not found for {module}, skipping real tests...")
+                else:
+                    print(f"No real tests found for {module}, skipping...")
+
         except Exception as e:
             print(f"Error running tests for {module}: {str(e)}")
             success = False
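
Note: with this change, the script runs only each module's mock test directory by default; real tests run only when the new flag is passed, e.g. python scripts/test_modules.py --include-real. The exact working directory is an assumption here: the script resolves module paths via os.getcwd(), so it is presumably invoked from the athena directory.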
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+import os
+import pytest
+import nltk
+import asyncio
+from module_text_llm.basic_approach import BasicApproachConfig
+from llm_core.models.openai import OpenAIModelConfig
+
+
+@pytest.fixture(scope="session", autouse=True)
+def setup_environment():
+
+    nltk.download('punkt', quiet=True)
+    nltk.download('punkt_tab', quiet=True)
+
+@pytest.fixture(scope="session")
+def event_loop():
+    """Create an instance of the default event loop for each test case."""
+    loop = asyncio.get_event_loop_policy().new_event_loop()
+    yield loop
+    loop.close()
+
+@pytest.fixture
+def real_config():
+    """Create a real configuration for testing with Azure OpenAI."""
+    return BasicApproachConfig(
+        max_input_tokens=5000,
+        model=OpenAIModelConfig(
+            model_name="azure_openai_gpt-4o",
+            get_model=lambda: None  # This will be set by the module
+        ),
+        type="basic"
+    )
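
For orientation, a minimal consumer of these fixtures might look like the hypothetical test below. This is only a sketch, not part of the commit, and it assumes pytest-asyncio is installed (which the updated test_modules.py now installs alongside pytest).

# Hypothetical smoke test (not part of this commit) exercising the real_config fixture.
import pytest

@pytest.mark.asyncio
async def test_real_config_smoke(real_config):
    # These values come straight from the fixture definition above.
    assert real_config.type == "basic"
    assert real_config.max_input_tokens == 5000
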
Lines changed: 143 additions & 0 deletions
@@ -0,0 +1,143 @@
+import pytest
+from athena.text import Exercise, Submission, Feedback
+from athena.schemas.exercise_type import ExerciseType
+
+@pytest.mark.asyncio
+async def test_generate_suggestions_algorithm_explanation(real_config):
+    """Test feedback generation for explaining an algorithm."""
+    exercise = Exercise(
+        id=1,
+        title="Algorithm Explanation Exercise",
+        type=ExerciseType.text,
+        max_points=10,
+        bonus_points=2,
+        grading_instructions="Explain the algorithm clearly, including its time complexity and space complexity.",
+        problem_statement="Explain how the binary search algorithm works. Include its time complexity and when it should be used.",
+        example_solution="Binary search is an efficient algorithm for finding an element in a sorted array. It works by repeatedly dividing the search interval in half. If the value of the search key is less than the item in the middle of the interval, narrow the interval to the lower half. Otherwise, narrow it to the upper half. The time complexity is O(log n) because we divide the search space in half each time. Space complexity is O(1) as we only use a constant amount of extra space.",
+        grading_criteria=[]
+    )
+
+    submission = Submission(
+        id=1,
+        exerciseId=exercise.id,
+        text="Binary search is when you look for something in a sorted list by checking the middle element. If it's not there, you look in the left or right half. It's pretty fast."
+    )
+
+    feedbacks = await real_config.generate_suggestions(
+        exercise=exercise,
+        submission=submission,
+        config=real_config,
+        debug=False,
+        is_graded=True
+    )
+
+    for feedback in feedbacks:
+        print(feedback.description)
+        print("--------------------------------")
+
+    assert isinstance(feedbacks, list)
+    assert len(feedbacks) > 0, "Expected feedback about algorithm explanation"
+
+    # Combine all feedback for analysis
+    all_feedback = " ".join(f.description.lower() for f in feedbacks)
+
+    # Technical Accuracy Checks - must include at least half of the terms
+    required_complexity_terms = ["time complexity", "o(log n)", "space complexity", "o(1)"]
+    found_complexity = [term for term in required_complexity_terms if term in all_feedback]
+    min_required = len(required_complexity_terms) // 2
+    assert len(found_complexity) >= min_required, f"Feedback must include at least {min_required} complexity terms. Found: {', '.join(found_complexity)}"
+
+    required_algorithm_terms = ["sorted", "interval", "element"]
+    found_algorithm = [term for term in required_algorithm_terms if term in all_feedback]
+    min_required = len(required_algorithm_terms) // 2
+    assert len(found_algorithm) >= min_required, f"Feedback must include at least {min_required} algorithm terms. Found: {', '.join(found_algorithm)}"
+
+@pytest.mark.asyncio
+async def test_generate_suggestions_code_documentation(real_config):
+    """Test feedback generation for code documentation exercise."""
+    exercise = Exercise(
+        id=2,
+        title="Code Documentation Exercise",
+        type=ExerciseType.text,
+        max_points=10,
+        bonus_points=2,
+        grading_instructions="Document the code's purpose, parameters, return values, and any important notes about usage.",
+        problem_statement="Write documentation for a function that calculates the factorial of a number. Include its purpose, parameters, return value, and any edge cases to consider.",
+        example_solution="This function calculates the factorial of a non-negative integer n. Parameters: n (int) - The number to calculate factorial for. Returns: int - The factorial of n. Note: This function will raise a ValueError if n is negative. The factorial of 0 is defined as 1.",
+        grading_criteria=[]
+    )
+
+    submission = Submission(
+        id=2,
+        exerciseId=exercise.id,
+        text="This function finds the factorial. It takes a number and multiplies it by all numbers below it until it reaches 1."
+    )
+
+    feedbacks = await real_config.generate_suggestions(
+        exercise=exercise,
+        submission=submission,
+        config=real_config,
+        debug=False,
+        is_graded=True
+    )
+
+    for feedback in feedbacks:
+        print(feedback.description)
+        print("--------------------------------")
+
+    assert isinstance(feedbacks, list)
+    assert len(feedbacks) > 0, "Expected feedback about documentation"
+
+    # Combine all feedback for analysis
+    all_feedback = " ".join(f.description.lower() for f in feedbacks)
+
+    # Documentation Requirements - must include at least half of the terms
+    required_doc_terms = ["parameter", "return", "edge case", "negative", "0"]
+    found_doc = [term for term in required_doc_terms if term in all_feedback]
+    min_required = len(required_doc_terms) // 2
+    assert len(found_doc) >= min_required, f"Feedback must include at least {min_required} documentation terms. Found: {', '.join(found_doc)}"
+
+@pytest.mark.asyncio
+async def test_generate_suggestions_design_pattern(real_config):
+    """Test feedback generation for explaining a design pattern."""
+    exercise = Exercise(
+        id=3,
+        title="Design Pattern Explanation Exercise",
+        type=ExerciseType.text,
+        max_points=10,
+        bonus_points=2,
+        grading_instructions="Explain the design pattern, its use cases, advantages, and disadvantages.",
+        problem_statement="Explain the Singleton design pattern. Include when it should be used and its potential drawbacks.",
+        example_solution="The Singleton pattern ensures a class has only one instance and provides a global point of access to it. It's useful when exactly one object is needed to coordinate actions across the system. Advantages include controlled access to the sole instance and reduced namespace pollution. Disadvantages include potential violation of the Single Responsibility Principle and difficulty in unit testing due to global state.",
+        grading_criteria=[]
+    )
+
+    submission = Submission(
+        id=3,
+        exerciseId=exercise.id,
+        text="Singleton is when you make sure there's only one copy of something in your program. It's good for saving memory."
+    )
+
+    feedbacks = await real_config.generate_suggestions(
+        exercise=exercise,
+        submission=submission,
+        config=real_config,
+        debug=False,
+        is_graded=True
+    )
+
+    for feedback in feedbacks:
+        print(feedback.description)
+        print("--------------------------------")
+
+    assert isinstance(feedbacks, list)
+    assert len(feedbacks) > 0, "Expected feedback about design pattern explanation"
+
+    # Combine all feedback for analysis
+    all_feedback = " ".join(f.description.lower() for f in feedbacks)
+
+    # Design Pattern Requirements - must include at least half of the terms
+    required_pattern_terms = ["instance", "advantage", "drawback", "use"]
+    found_pattern = [term for term in required_pattern_terms if term in all_feedback]
+    min_required = len(required_pattern_terms) // 2
+    assert len(found_pattern) >= min_required, f"Feedback must include at least {min_required} design pattern terms. Found: {', '.join(found_pattern)}"
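
All three tests repeat the same "at least half of the required terms" check against the combined feedback text. A possible refactoring, shown only as a hypothetical sketch and not part of this commit, would pull that pattern into a small helper:

# Hypothetical helper (not part of this commit) factoring out the repeated keyword check.
def assert_min_terms(all_feedback, required_terms, label):
    found = [term for term in required_terms if term in all_feedback]
    min_required = len(required_terms) // 2
    assert len(found) >= min_required, (
        f"Feedback must include at least {min_required} {label} terms. "
        f"Found: {', '.join(found)}"
    )

Each test would then call, for example, assert_min_terms(all_feedback, required_complexity_terms, "complexity").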

0 commit comments
