Coverage for src/ollamapy/vibe_tests.py: 12%
275 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-01 12:29 -0400
1"""Built-in vibe tests for evaluating AI decision-making consistency with timing analysis and visual reporting."""
3import re
4import time
5import statistics
6from typing import List, Dict, Tuple, Any
7from .ollama_client import OllamaClient
8from .model_manager import ModelManager
9from .analysis_engine import AnalysisEngine
10from .skills import get_actions_with_vibe_tests, clear_action_logs
11from .vibe_report import VibeTestReportGenerator
class TimingStats:
    """Summary statistics for a batch of execution times.

    Central tendency, spread, and interpolated percentiles are computed once
    at construction; derived consistency and performance ratings are exposed
    as properties.
    """

    def __init__(self, times: List[float]):
        """Compute summary statistics for *times*.

        Args:
            times: Execution times in seconds (may be empty).
        """
        self.times = times
        self.count = len(times)

        if not times:
            # No samples at all: zero out every statistic.
            self.mean = self.median = self.min = self.max = self.std_dev = 0.0
            self.p25 = self.p75 = self.p95 = 0.0
            return

        self.mean = statistics.mean(times)
        self.median = statistics.median(times)
        self.min = min(times)
        self.max = max(times)
        # stdev requires at least two samples.
        self.std_dev = statistics.stdev(times) if self.count > 1 else 0.0

        # Percentiles give more insight than min/max alone.
        ordered = sorted(times)
        self.p25, self.p75, self.p95 = (
            self._percentile(ordered, pct) for pct in (25, 75, 95)
        )

    def _percentile(self, sorted_data: List[float], percentile: float) -> float:
        """Return the linearly interpolated percentile of pre-sorted data."""
        if not sorted_data:
            return 0.0

        rank = (len(sorted_data) - 1) * (percentile / 100.0)
        lower = int(rank)
        frac = rank - lower
        upper = lower + 1

        if upper >= len(sorted_data):
            return sorted_data[lower]
        return sorted_data[lower] + frac * (sorted_data[upper] - sorted_data[lower])

    @property
    def consistency_score(self) -> float:
        """Timing consistency on a 0-100 scale (higher = more consistent).

        Based on the coefficient of variation (std_dev / mean): lower relative
        variance means higher consistency. A CV of 0.1 maps to ~80, a CV of
        0.5 or more maps to 0.

        Returns:
            Consistency score from 0 (very inconsistent) to 100 (very consistent).
        """
        if self.mean == 0 or self.count < 2:
            # Degenerate cases (no spread measurable) count as fully consistent.
            return 100.0

        variation = self.std_dev / self.mean
        # Linear map: CV 0 -> 100, CV >= 0.5 -> 0, clamped to [0, 100].
        return min(100, max(0, 100 - (variation * 200)))

    @property
    def performance_category(self) -> str:
        """Human-readable speed bucket derived from the mean response time."""
        for threshold, label in (
            (1.0, "Very Fast"),
            (2.0, "Fast"),
            (5.0, "Moderate"),
            (10.0, "Slow"),
        ):
            if self.mean < threshold:
                return label
        return "Very Slow"

    def to_dict(self) -> Dict[str, Any]:
        """Serialize every statistic (including derived scores) to a dict."""
        summary = {
            name: getattr(self, name)
            for name in (
                "count",
                "mean",
                "median",
                "min",
                "max",
                "std_dev",
                "p25",
                "p75",
                "p95",
                "consistency_score",
                "performance_category",
            )
        }
        summary["raw_times"] = self.times
        return summary
class VibeTestRunner:
    """Built-in vibe test runner with multi-action support, timing analysis, and visual reporting.

    Tests check if the target action is selected, regardless of what other
    actions might also be selected, and generates comprehensive visual reports
    including timing analysis.
    """

    def __init__(self, model: str = "gemma3:4b", analysis_model: str = None):
        """Initialize the vibe test runner.

        Args:
            model: The model to use for testing
            analysis_model: Optional separate model for action analysis
                (defaults to the main chat model when None)
        """
        self.model = model
        # Fall back to the chat model when no dedicated analysis model is given.
        self.analysis_model = analysis_model or model
        self.client = OllamaClient()
        self.model_manager = ModelManager(self.client)
        self.analysis_engine = AnalysisEngine(self.analysis_model, self.client)
        self.actions_with_tests = get_actions_with_vibe_tests()
        self.all_test_results = {}  # Filled by run_all_tests() for report generation

    def check_prerequisites(self) -> bool:
        """Check if Ollama is available and models can be used.

        Returns:
            True if the server responded and both models are available.
        """
        # Only the overall success flag matters here; the per-model status
        # strings are intentionally ignored.
        success, _main_status, _analysis_status = (
            self.model_manager.ensure_models_available(self.model, self.analysis_model)
        )

        if not success:
            print("❌ Error: Ollama server is not running!")
            print("Please start Ollama with: ollama serve")
            return False

        return True

    def extract_expected_parameters(
        self, phrase: str, action_name: str
    ) -> Dict[str, Any]:
        """Extract expected parameter values from test phrases.

        Uses simple per-action heuristics: the first number for square_root,
        a basic two-operand arithmetic expression for calculate, and the word
        following "in"/"at"/"for" for getWeather.

        Args:
            phrase: The test phrase
            action_name: The action being tested

        Returns:
            Dictionary of expected parameter values (empty if none detected)
        """
        expected_params = {}

        # Extract numbers for square_root
        if action_name == "square_root":
            # Look for integers or decimals in the phrase
            numbers = re.findall(r"\d+(?:\.\d+)?", phrase)
            if numbers:
                expected_params["number"] = float(numbers[0])

        # Extract expressions for calculate
        elif action_name == "calculate":
            # Simple pattern for basic arithmetic (number operator number)
            expr_match = re.search(r"(\d+\s*[+\-*/]\s*\d+)", phrase)
            if expr_match:
                expected_params["expression"] = expr_match.group(1).replace(" ", "")

        # Extract location for weather (if mentioned)
        elif action_name == "getWeather":
            # Take the first word after a location keyword, if it is long
            # enough (> 2 chars) to be a plausible place name.
            location_keywords = ["in", "at", "for"]
            for keyword in location_keywords:
                if keyword in phrase.lower():
                    parts = phrase.lower().split(keyword)
                    if len(parts) > 1:
                        potential_location_parts = parts[1].strip().split()
                        if potential_location_parts:
                            potential_location = potential_location_parts[0]
                            if len(potential_location) > 2:
                                expected_params["location"] = potential_location
                                break

        return expected_params

    def time_analysis_execution(
        self, phrase: str
    ) -> Tuple[List[Tuple[str, Dict[str, Any]]], float]:
        """Time the execution of analysis engine action selection.

        Args:
            phrase: The phrase to analyze

        Returns:
            Tuple of (selected_actions, execution_time_seconds). On error the
            action list is empty but the elapsed time is still reported.
        """
        start_time = time.perf_counter()

        try:
            # Clear any previous logs so this run is isolated
            clear_action_logs()

            # Run the multi-action analysis
            selected_actions = self.analysis_engine.select_all_applicable_actions(
                phrase
            )

            execution_time = time.perf_counter() - start_time
            return selected_actions, execution_time

        except Exception as e:
            # Report the failure but still return how long the attempt took.
            execution_time = time.perf_counter() - start_time
            print(f"❌ Error during analysis timing: {e}")
            return [], execution_time

    def run_action_test(
        self, action_name: str, action_info: Dict, phrases: List[str], iterations: int
    ) -> Tuple[bool, Dict]:
        """Run a test on a specific action with its phrases, including timing analysis.

        Tests if the target action is selected (other actions may also be selected).

        Args:
            action_name: Name of the action being tested
            action_info: Information about the action (description, etc.)
            phrases: List of test phrases for this action
            iterations: Number of times to test each phrase

        Returns:
            Tuple of (success: bool, results: dict); success means the overall
            selection rate reached the 60% pass threshold.
        """
        total_correct = 0
        total_tests = 0
        results = {}

        print(f"\n🧪 {action_name} Action Test")
        print(f"Chat Model: {self.model}")
        if self.analysis_model != self.model:
            print(f"Analysis Model: {self.analysis_model}")
        else:
            print("Using same model for analysis and chat")
        print("Mode: Multi-action selection (target action must be selected)")
        print("=" * 80)

        for phrase in phrases:
            phrase_correct = 0
            parameter_correct = 0
            expected_params = self.extract_expected_parameters(phrase, action_name)

            # Track secondary actions and timing per iteration for this phrase
            secondary_actions_per_iteration = []
            execution_times = []

            for i in range(iterations):
                try:
                    # Time the analysis execution
                    selected_actions, execution_time = self.time_analysis_execution(
                        phrase
                    )
                    execution_times.append(execution_time)

                    # Check if target action was selected and track secondary actions
                    action_found = False
                    params_match = False
                    iteration_secondary_actions = []

                    for selected_action, parameters in selected_actions:
                        if selected_action == action_name:
                            action_found = True
                            phrase_correct += 1

                            # Check parameters if expected.
                            # NOTE(review): parameter_correct is incremented once
                            # per matching parameter, so with multiple expected
                            # params it can exceed `iterations`; params_match is
                            # computed but never read afterwards — confirm intent.
                            if expected_params:
                                params_match = True
                                for (
                                    param_name,
                                    expected_value,
                                ) in expected_params.items():
                                    if param_name in parameters:
                                        actual_value = parameters[param_name]
                                        # For numbers, check if they're close enough
                                        if isinstance(expected_value, (int, float)):
                                            try:
                                                actual_float = float(actual_value)
                                                if (
                                                    abs(actual_float - expected_value)
                                                    < 0.001
                                                ):
                                                    parameter_correct += 1
                                                else:
                                                    params_match = False
                                            except (TypeError, ValueError):
                                                # Non-numeric value where a
                                                # number was expected
                                                params_match = False
                                        # For strings, check exact match
                                        elif str(actual_value) == str(expected_value):
                                            parameter_correct += 1
                                        else:
                                            params_match = False
                                    else:
                                        params_match = False
                        else:
                            # This is a secondary action
                            iteration_secondary_actions.append(selected_action)

                    secondary_actions_per_iteration.append(iteration_secondary_actions)
                    total_tests += 1

                except Exception as e:
                    print(f"❌ Error testing phrase iteration {i+1}: {e}")
                    secondary_actions_per_iteration.append([])
                    # Still record the time even if there was an error
                    if len(execution_times) <= i:
                        execution_times.append(0.0)
                    continue

            # Calculate secondary action frequencies
            secondary_action_counts = {}
            for iteration_actions in secondary_actions_per_iteration:
                for action in iteration_actions:
                    secondary_action_counts[action] = (
                        secondary_action_counts.get(action, 0) + 1
                    )

            # Calculate timing statistics
            timing_stats = TimingStats(execution_times)

            success_rate = (phrase_correct / iterations) * 100 if iterations > 0 else 0
            # Parameter success only applies when parameters were expected.
            param_success_rate = (
                (parameter_correct / iterations) * 100
                if iterations > 0 and expected_params
                else 100
            )

            results[phrase] = {
                "correct": phrase_correct,
                "total": iterations,
                "success_rate": success_rate,
                "parameter_success_rate": param_success_rate,
                "expected_params": expected_params,
                "secondary_action_counts": secondary_action_counts,
                "secondary_actions_per_iteration": secondary_actions_per_iteration,
                "timing_stats": timing_stats.to_dict(),
            }
            total_correct += phrase_correct

            # Print individual results with timing
            phrase_display = phrase[:50] + "..." if len(phrase) > 50 else phrase
            print(f"Phrase: '{phrase_display}'")
            print(
                f"Target Action Selected: {phrase_correct}/{iterations} ({success_rate:.1f}%)"
            )
            if expected_params:
                print(
                    f"Parameter Success: {parameter_correct}/{iterations} ({param_success_rate:.1f}%)"
                )
                print(f"Expected params: {expected_params}")

            # Print timing analysis
            print(f"Timing Analysis:")
            print(
                f"  Average: {timing_stats.mean:.2f}s | Median: {timing_stats.median:.2f}s"
            )
            print(f"  Range: {timing_stats.min:.2f}s - {timing_stats.max:.2f}s")
            print(f"  Performance: {timing_stats.performance_category}")
            print(f"  Consistency: {timing_stats.consistency_score:.1f}/100")

            if secondary_action_counts:
                print(f"Secondary actions triggered:")
                for action, count in secondary_action_counts.items():
                    print(f"  - {action}: {count}/{iterations} times")
            print("-" * 40)

        overall_success_rate = (
            (total_correct / total_tests) * 100 if total_tests > 0 else 0
        )

        # Calculate overall timing statistics across all phrases
        all_times = []
        for phrase_results in results.values():
            all_times.extend(phrase_results["timing_stats"]["raw_times"])
        overall_timing = TimingStats(all_times)

        print(
            f"Overall Success Rate: {total_correct}/{total_tests} ({overall_success_rate:.1f}%)"
        )
        print(
            f"Overall Timing: {overall_timing.mean:.2f}s avg, {overall_timing.performance_category}, {overall_timing.consistency_score:.1f}/100 consistency"
        )

        # An action passes when the target was selected in at least 60% of runs.
        test_passed = overall_success_rate >= 60.0
        return test_passed, {
            "action_name": action_name,
            "action_description": action_info.get("description", "No description"),
            "total_correct": total_correct,
            "total_tests": total_tests,
            "success_rate": overall_success_rate,
            "phrase_results": results,
            "overall_timing_stats": overall_timing.to_dict(),
        }

    def run_all_tests(self, iterations: int = 1) -> bool:
        """Run all vibe tests for all actions that have test phrases.

        Also saves an HTML report via VibeTestReportGenerator and prints a
        per-action and overall summary with timing information.

        Args:
            iterations: Number of iterations per phrase

        Returns:
            True if all tests passed, False otherwise
        """
        print(
            f"🧪 Running vibe tests with multi-action support, timing analysis, and visual reporting"
        )
        print(f"Chat model: {self.model}")
        if self.analysis_model != self.model:
            print(f"Analysis model: {self.analysis_model}")
        else:
            print("Using same model for analysis and chat")
        print(f"Analysis mode: Multi-action (target must be selected)")
        print(f"Iterations: {iterations}")
        print("=" * 80)

        # Check prerequisites
        if not self.check_prerequisites():
            return False

        print(f"✅ Using chat model: {self.model}")
        if self.analysis_model != self.model:
            print(f"✅ Using analysis model: {self.analysis_model}")
        print(
            f"🧠 Testing AI's ability to select appropriate actions (multiple allowed)..."
        )
        print(f"⏱️ Including timing analysis for performance insights...")
        print(
            f"📋 Found {len(self.actions_with_tests)} actions with vibe test phrases\n"
        )

        if not self.actions_with_tests:
            print("❌ No actions with vibe test phrases found!")
            return False

        # Run tests for each action
        test_results = {}
        all_tests_passed = True
        overall_test_start = time.perf_counter()

        for action_name, action_info in self.actions_with_tests.items():
            test_phrases = action_info["vibe_test_phrases"]

            if not test_phrases:
                print(f"⚠️ Skipping {action_name} - no test phrases defined")
                continue

            test_passed, results = self.run_action_test(
                action_name, action_info, test_phrases, iterations
            )

            test_results[action_name] = {"passed": test_passed, "results": results}

            if not test_passed:
                all_tests_passed = False

        overall_test_time = time.perf_counter() - overall_test_start

        # Store results for report generation
        self.all_test_results = test_results

        # Generate and save the HTML report using the report generator
        report_generator = VibeTestReportGenerator(self.model, self.analysis_model)
        filename = report_generator.save_report(test_results)
        # Fixed: report the actual saved path instead of a hard-coded placeholder.
        print(f"\n📊 Report saved to: {filename}")
        print(
            f"   Open in your browser to view interactive charts with timing analysis"
        )

        # Final results summary with timing
        print(f"\n📊 Final Test Results:")
        print("=" * 50)

        fastest_action = None
        slowest_action = None
        fastest_time = float("inf")
        slowest_time = 0.0

        for action_name, test_data in test_results.items():
            status_icon = "✅ PASSED" if test_data["passed"] else "❌ FAILED"
            success_rate = test_data["results"]["success_rate"]
            avg_time = test_data["results"]["overall_timing_stats"]["mean"]
            consistency = test_data["results"]["overall_timing_stats"][
                "consistency_score"
            ]

            print(f"{action_name} Action Test: {status_icon} ({success_rate:.1f}%)")
            print(
                f"  Performance: {avg_time:.2f}s avg, {consistency:.1f}/100 consistency"
            )

            # Track fastest/slowest action for the performance-range summary.
            if avg_time < fastest_time:
                fastest_time = avg_time
                fastest_action = action_name
            if avg_time > slowest_time:
                slowest_time = avg_time
                slowest_action = action_name

        status_icon = "✅" if all_tests_passed else "❌"
        status_text = "ALL TESTS PASSED" if all_tests_passed else "SOME TESTS FAILED"
        print(f"\nOverall Result: {status_icon} {status_text}")
        print(f"Total Test Duration: {overall_test_time:.2f}s")

        if fastest_action and slowest_action:
            print(
                f"Performance Range: {fastest_action} ({fastest_time:.2f}s) → {slowest_action} ({slowest_time:.2f}s)"
            )

        if not all_tests_passed:
            print("\n💡 Tips for improving results:")
            print("   • Try a different model with --model")
            print("   • Try a different analysis model with --analysis-model")
            print("   • Use a smaller, faster model for analysis (e.g., gemma2:2b)")
            print("   • Increase iterations with -n for better statistics")
            print("   • Ensure Ollama server is running optimally")
            print("   • Check action descriptions and test phrases for clarity")

        return all_tests_passed

    def run_quick_test(self) -> bool:
        """Run a quick single-iteration test for fast feedback."""
        print("🚀 Running quick vibe test (1 iteration each)...")
        return self.run_all_tests(iterations=1)

    def run_statistical_test(self, iterations: int = 5) -> bool:
        """Run a statistical test with multiple iterations."""
        print(f"📊 Running statistical vibe test ({iterations} iterations each)...")
        return self.run_all_tests(iterations=iterations)
def run_vibe_tests(
    model: str = "gemma3:4b", iterations: int = 1, analysis_model: str = None
) -> bool:
    """Convenience wrapper: build a VibeTestRunner and run every vibe test.

    Args:
        model: The model to use for testing
        iterations: Number of iterations per test
        analysis_model: Optional separate model for action analysis (defaults to main model)

    Returns:
        True if all tests passed, False otherwise
    """
    return VibeTestRunner(model=model, analysis_model=analysis_model).run_all_tests(
        iterations=iterations
    )