Coverage for src/ollamapy/multi_model_vibe_tests.py: 0% (153 statements)
coverage.py v7.10.6, created at 2025-09-01 12:29 -0400
1"""Multi-model vibe test runner for comprehensive model comparison and GitHub Pages integration."""
3import json
4import time
5import os
6from pathlib import Path
7from typing import Dict, List, Any, Tuple, Optional
8from datetime import datetime
10from .vibe_tests import VibeTestRunner, TimingStats
11from .ollama_client import OllamaClient
14class MultiModelVibeTestRunner:
15 """Runs vibe tests across multiple models and generates comprehensive reports for GitHub Pages."""
17 def __init__(self, config_path: Optional[str] = None):
18 """Initialize the multi-model vibe test runner.
20 Args:
21 config_path: Path to the model configuration file
22 """
23 if config_path is None:
24 # Default to config/vibe_test_models.json
25 project_root = Path(__file__).parent.parent.parent
26 config_path = project_root / "config" / "vibe_test_models.json"
28 self.config_path = Path(config_path)
29 self.config = self._load_config()
30 self.client = OllamaClient()
31 self.all_results = {}
33 def _load_config(self) -> Dict[str, Any]:
34 """Load the model configuration."""
35 try:
36 with open(self.config_path, "r") as f:
37 return json.load(f)
38 except FileNotFoundError:
39 print(f"❌ Configuration file not found: {self.config_path}")
40 return self._get_default_config()
41 except json.JSONDecodeError as e:
42 print(f"❌ Invalid JSON in configuration file: {e}")
43 return self._get_default_config()
45 def _get_default_config(self) -> Dict[str, Any]:
46 """Get default configuration if config file is missing."""
47 return {
48 "models": [
49 {
50 "name": "gemma3:4b",
51 "display_name": "Gemma 3 4B",
52 "description": "Compact 4B parameter model optimized for speed",
53 }
54 ],
55 "test_config": {
56 "iterations": 5,
57 "timeout": 120,
58 "collect_runtime_stats": True,
59 "include_performance_metrics": True,
60 },
61 }
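
    # Illustrative sketch of what config/vibe_test_models.json might look like,
    # inferred from the keys this module reads ("models", per-model "enabled" and
    # "timeout", and "test_config"); the actual file in the repository may differ.
    #
    #     {
    #       "models": [
    #         {
    #           "name": "gemma3:4b",
    #           "display_name": "Gemma 3 4B",
    #           "description": "Compact 4B parameter model optimized for speed",
    #           "enabled": true,
    #           "timeout": 60
    #         }
    #       ],
    #       "test_config": {
    #         "iterations": 5,
    #         "timeout": 120,
    #         "collect_runtime_stats": true,
    #         "include_performance_metrics": true
    #       }
    #     }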
    def check_model_availability(self, model_name: str, timeout: int = 60) -> bool:
        """Check if a model is available in Ollama.

        Args:
            model_name: Name of the model to check
            timeout: Timeout in seconds for the availability check
        """
        try:
            # Try to generate a simple response to test availability with timeout
            import signal

            def timeout_handler(signum, frame):
                raise TimeoutError(
                    f"Model availability check timed out after {timeout}s"
                )

            # Set timeout for the check
            signal.signal(signal.SIGALRM, timeout_handler)
            signal.alarm(timeout)

            try:
                response = self.client.generate(model=model_name, prompt="Hello")
                signal.alarm(0)  # Cancel timeout
                return response is not None
            finally:
                signal.alarm(0)  # Ensure timeout is cancelled

        except TimeoutError as e:
            print(f"❌ Model {model_name} availability check timed out: {e}")
            return False
        except Exception as e:
            print(f"❌ Model {model_name} not available: {e}")
            return False
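
    # Note: signal.SIGALRM / signal.alarm are POSIX-only, so the timeout above is
    # not enforced on Windows. A portable alternative (illustrative sketch only,
    # not part of this module) would run the probe in a worker thread:
    #
    #     from concurrent.futures import ThreadPoolExecutor
    #     from concurrent.futures import TimeoutError as FutureTimeout
    #
    #     with ThreadPoolExecutor(max_workers=1) as pool:
    #         future = pool.submit(self.client.generate, model=model_name, prompt="Hello")
    #         try:
    #             return future.result(timeout=timeout) is not None
    #         except FutureTimeout:
    #             return False
    #
    # (Leaving the with-block still waits for the worker thread to finish.)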
    def run_tests_for_model(
        self, model_config: Dict[str, str], iterations: int
    ) -> Dict[str, Any]:
        """Run vibe tests for a single model.

        Args:
            model_config: Model configuration dictionary
            iterations: Number of iterations per test

        Returns:
            Dictionary containing test results and metadata
        """
        model_name = model_config["name"]
        print(f"\n🧪 Testing Model: {model_config['display_name']} ({model_name})")
        print(f"📄 Description: {model_config['description']}")
        print("=" * 80)

        start_time = time.perf_counter()

        # Create a vibe test runner for this model.
        # Use the same model for both chat and analysis to get pure model performance.
        runner = VibeTestRunner(model=model_name, analysis_model=model_name)

        # Run the tests
        success = runner.run_all_tests(iterations=iterations)

        end_time = time.perf_counter()
        total_runtime = end_time - start_time

        # Get the detailed results from the runner
        detailed_results = runner.all_test_results

        # Aggregate statistics
        total_tests = sum(
            result["results"]["total_tests"] for result in detailed_results.values()
        )
        total_correct = sum(
            result["results"]["total_correct"] for result in detailed_results.values()
        )
        overall_success_rate = (
            (total_correct / total_tests * 100) if total_tests > 0 else 0
        )

        # Calculate overall timing statistics
        all_times = []
        for result in detailed_results.values():
            all_times.extend(result["results"]["overall_timing_stats"]["raw_times"])

        overall_timing = TimingStats(all_times) if all_times else TimingStats([])

        return {
            "model_config": model_config,
            "success": success,
            "total_runtime": total_runtime,
            "summary": {
                "total_tests": total_tests,
                "total_correct": total_correct,
                "overall_success_rate": overall_success_rate,
                "overall_timing_stats": overall_timing.to_dict(),
            },
            "detailed_results": detailed_results,
            "timestamp": datetime.now().isoformat(),
            "iterations": iterations,
        }
    def run_all_model_tests(self, iterations: Optional[int] = None) -> bool:
        """Run vibe tests for all configured models.

        Args:
            iterations: Override for number of iterations (uses config default if None)

        Returns:
            True if all models passed their tests, False otherwise
        """
        if iterations is None:
            iterations = self.config["test_config"]["iterations"]

        # Filter only enabled models
        enabled_models = [m for m in self.config["models"] if m.get("enabled", True)]
        total_models = len(enabled_models)
        disabled_count = len(self.config["models"]) - total_models

        print("🌟 Multi-Model Vibe Test Suite")
        print(
            f"📋 Testing {total_models} enabled models with {iterations} iterations each"
        )
        if disabled_count > 0:
            print(f"⚠️ Skipping {disabled_count} disabled models")
        print("📊 Collecting runtime statistics and performance metrics")
        print("=" * 80)

        # Check Ollama availability
        if not self.client.is_available():
            print(
                "❌ Ollama server is not available. Please start it with: ollama serve"
            )
            return False

        all_success = True
        self.all_results = {}

        for i, model_config in enumerate(enabled_models, 1):
            model_name = model_config["name"]
            model_timeout = model_config.get("timeout", 60)

            print(f"\n[{i}/{total_models}] Preparing to test {model_name}...")
            print(f"⏱️ Model timeout: {model_timeout}s")

            # Check if model is available with model-specific timeout
            if not self.check_model_availability(model_name, model_timeout):
                print(f"❌ Skipping {model_name} - not available")
                continue

            # Run tests for this model
            try:
                results = self.run_tests_for_model(model_config, iterations)
                self.all_results[model_name] = results

                if not results["success"]:
                    all_success = False

            except Exception as e:
                print(f"❌ Error testing {model_name}: {e}")
                all_success = False
                continue

        # Generate comparison report
        self._print_comparison_summary()

        return all_success
    def _print_comparison_summary(self):
        """Print a comparison summary of all tested models."""
        if not self.all_results:
            return

        print("\n" + "=" * 80)
        print("📊 Multi-Model Comparison Summary")
        print("=" * 80)

        # Sort models by overall success rate
        sorted_models = sorted(
            self.all_results.items(),
            key=lambda x: x[1]["summary"]["overall_success_rate"],
            reverse=True,
        )

        print(
            f"{'Model':<20} {'Success Rate':<12} {'Avg Time':<10} {'Consistency':<12} {'Status':<10}"
        )
        print("-" * 70)

        for model_name, results in sorted_models:
            display_name = results["model_config"]["display_name"]
            success_rate = results["summary"]["overall_success_rate"]
            avg_time = results["summary"]["overall_timing_stats"]["mean"]
            consistency = results["summary"]["overall_timing_stats"][
                "consistency_score"
            ]
            status = "✅ PASS" if results["success"] else "❌ FAIL"

            print(
                f"{display_name:<20} {success_rate:>6.1f}% {avg_time:>6.2f}s {consistency:>6.1f}/100 {status}"
            )

        # Performance insights
        fastest_model = min(
            sorted_models, key=lambda x: x[1]["summary"]["overall_timing_stats"]["mean"]
        )
        most_consistent = max(
            sorted_models,
            key=lambda x: x[1]["summary"]["overall_timing_stats"]["consistency_score"],
        )

        print("\n🏆 Performance Insights:")
        print(
            f" Fastest: {fastest_model[1]['model_config']['display_name']} ({fastest_model[1]['summary']['overall_timing_stats']['mean']:.2f}s avg)"
        )
        print(
            f" Most Consistent: {most_consistent[1]['model_config']['display_name']} ({most_consistent[1]['summary']['overall_timing_stats']['consistency_score']:.1f}/100)"
        )

        total_tests = sum(
            r["summary"]["total_tests"] for r in self.all_results.values()
        )
        total_time = sum(r["total_runtime"] for r in self.all_results.values())
        print(f" Total Tests: {total_tests}")
        print(f" Total Runtime: {total_time:.1f}s")
    def save_results_json(self, output_path: str) -> str:
        """Save detailed results to JSON file for GitHub Pages.

        Args:
            output_path: Path where to save the results

        Returns:
            Path to the saved file
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Create the complete results structure for GitHub Pages
        github_results = {
            "metadata": {
                "generated_at": datetime.now().isoformat(),
                "config_file": str(self.config_path),
                "total_models_tested": len(self.all_results),
                "test_config": self.config["test_config"],
            },
            "models": [],
        }

        for model_name, results in self.all_results.items():
            # Process detailed results for GitHub Pages format
            processed_skills = {}
            for skill_name, skill_data in results["detailed_results"].items():
                processed_skills[skill_name] = {
                    "action_name": skill_data["results"]["action_name"],
                    "action_description": skill_data["results"]["action_description"],
                    "passed": skill_data["passed"],
                    "success_rate": skill_data["results"]["success_rate"],
                    "total_tests": skill_data["results"]["total_tests"],
                    "total_correct": skill_data["results"]["total_correct"],
                    "timing_stats": skill_data["results"]["overall_timing_stats"],
                    "phrase_results": {},
                }

                # Process phrase-level results
                for phrase, phrase_data in skill_data["results"][
                    "phrase_results"
                ].items():
                    processed_skills[skill_name]["phrase_results"][phrase] = {
                        "success_rate": phrase_data["success_rate"],
                        "timing_stats": phrase_data["timing_stats"],
                        "expected_params": phrase_data["expected_params"],
                        "secondary_actions": phrase_data["secondary_action_counts"],
                    }

            model_result = {
                "model_name": model_name,
                "display_name": results["model_config"]["display_name"],
                "description": results["model_config"]["description"],
                "overall_success": results["success"],
                "summary": results["summary"],
                "skills": processed_skills,
                "timestamp": results["timestamp"],
                "iterations": results["iterations"],
            }
            github_results["models"].append(model_result)

        # Save to file
        with open(output_path, "w") as f:
            json.dump(github_results, f, indent=2)

        print(f"📁 Detailed results saved to: {output_path}")
        return str(output_path)
    def get_results_summary(self) -> Dict[str, Any]:
        """Get a summary of results for external use.

        Returns:
            Dictionary with summary statistics
        """
        if not self.all_results:
            return {}

        return {
            "total_models": len(self.all_results),
            "models_passed": sum(1 for r in self.all_results.values() if r["success"]),
            "models_failed": sum(
                1 for r in self.all_results.values() if not r["success"]
            ),
            "average_success_rate": sum(
                r["summary"]["overall_success_rate"] for r in self.all_results.values()
            )
            / len(self.all_results),
            "total_runtime": sum(r["total_runtime"] for r in self.all_results.values()),
            "fastest_model": (
                min(
                    self.all_results.items(),
                    key=lambda x: x[1]["summary"]["overall_timing_stats"]["mean"],
                )[0]
                if self.all_results
                else None
            ),
            "most_accurate_model": (
                max(
                    self.all_results.items(),
                    key=lambda x: x[1]["summary"]["overall_success_rate"],
                )[0]
                if self.all_results
                else None
            ),
        }
def run_multi_model_tests(
    config_path: Optional[str] = None,
    iterations: Optional[int] = None,
    output_path: Optional[str] = None,
) -> bool:
    """Convenience function to run multi-model vibe tests.

    Args:
        config_path: Path to model configuration file
        iterations: Number of iterations per test
        output_path: Path to save detailed JSON results

    Returns:
        True if all models passed tests, False otherwise
    """
    runner = MultiModelVibeTestRunner(config_path)
    success = runner.run_all_model_tests(iterations)

    if output_path:
        runner.save_results_json(output_path)

    return success
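
# Example usage (illustrative sketch; the output path below is an assumption,
# not a path guaranteed to exist in this repository):
#
#     from ollamapy.multi_model_vibe_tests import run_multi_model_tests
#
#     ok = run_multi_model_tests(
#         config_path="config/vibe_test_models.json",
#         iterations=3,
#         output_path="docs/vibe_test_results.json",  # hypothetical location
#     )
#     print("all models passed" if ok else "some models failed")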