Coverage for src/ollamapy/multi_model_vibe_tests.py: 0% (153 statements), coverage.py v7.10.6, created at 2025-09-01 12:29 -0400

1"""Multi-model vibe test runner for comprehensive model comparison and GitHub Pages integration.""" 

2 

3import json 

4import time 

5import os 

6from pathlib import Path 

7from typing import Dict, List, Any, Tuple, Optional 

8from datetime import datetime 

9 

10from .vibe_tests import VibeTestRunner, TimingStats 

11from .ollama_client import OllamaClient 

12 

13 

class MultiModelVibeTestRunner:
    """Runs vibe tests across multiple models and generates comprehensive reports for GitHub Pages."""

    def __init__(self, config_path: Optional[str] = None):
        """Initialize the multi-model vibe test runner.

        Args:
            config_path: Path to the model configuration file
        """
        if config_path is None:
            # Default to config/vibe_test_models.json
            project_root = Path(__file__).parent.parent.parent
            config_path = project_root / "config" / "vibe_test_models.json"

        self.config_path = Path(config_path)
        self.config = self._load_config()
        self.client = OllamaClient()
        self.all_results = {}

    def _load_config(self) -> Dict[str, Any]:
        """Load the model configuration."""
        try:
            with open(self.config_path, "r") as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"❌ Configuration file not found: {self.config_path}")
            return self._get_default_config()
        except json.JSONDecodeError as e:
            print(f"❌ Invalid JSON in configuration file: {e}")
            return self._get_default_config()

    def _get_default_config(self) -> Dict[str, Any]:
        """Get default configuration if config file is missing."""
        return {
            "models": [
                {
                    "name": "gemma3:4b",
                    "display_name": "Gemma 3 4B",
                    "description": "Compact 4B parameter model optimized for speed",
                }
            ],
            "test_config": {
                "iterations": 5,
                "timeout": 120,
                "collect_runtime_stats": True,
                "include_performance_metrics": True,
            },
        }

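    # For reference, config/vibe_test_models.json is assumed to mirror the default
    # structure above; the per-model "enabled" and "timeout" keys (read later in
    # run_all_model_tests and check_model_availability) are optional, e.g.:
    #
    #   {
    #     "models": [
    #       {
    #         "name": "gemma3:4b",
    #         "display_name": "Gemma 3 4B",
    #         "description": "Compact 4B parameter model optimized for speed",
    #         "enabled": true,
    #         "timeout": 60
    #       }
    #     ],
    #     "test_config": {
    #       "iterations": 5,
    #       "timeout": 120,
    #       "collect_runtime_stats": true,
    #       "include_performance_metrics": true
    #     }
    #   }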

    def check_model_availability(self, model_name: str, timeout: int = 60) -> bool:
        """Check if a model is available in Ollama.

        Args:
            model_name: Name of the model to check
            timeout: Timeout in seconds for the availability check
        """
        try:
            # Try to generate a simple response to test availability with timeout
            import signal
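            # Note: signal.SIGALRM is only available on Unix-like systems, and alarms
            # can only be set from the main thread; on other platforms this check
            # would need a different timeout mechanism (e.g. a worker thread).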

            def timeout_handler(signum, frame):
                raise TimeoutError(
                    f"Model availability check timed out after {timeout}s"
                )

            # Set timeout for the check
            signal.signal(signal.SIGALRM, timeout_handler)
            signal.alarm(timeout)

            try:
                response = self.client.generate(model=model_name, prompt="Hello")
                signal.alarm(0)  # Cancel timeout
                return response is not None
            finally:
                signal.alarm(0)  # Ensure timeout is cancelled

        except TimeoutError as e:
            print(f"❌ Model {model_name} availability check timed out: {e}")
            return False
        except Exception as e:
            print(f"❌ Model {model_name} not available: {e}")
            return False

    def run_tests_for_model(
        self, model_config: Dict[str, str], iterations: int
    ) -> Dict[str, Any]:
        """Run vibe tests for a single model.

        Args:
            model_config: Model configuration dictionary
            iterations: Number of iterations per test

        Returns:
            Dictionary containing test results and metadata
        """
        model_name = model_config["name"]
        print(f"\n🧪 Testing Model: {model_config['display_name']} ({model_name})")
        print(f"📄 Description: {model_config['description']}")
        print("=" * 80)

        start_time = time.perf_counter()

        # Create a vibe test runner for this model
        # Use the same model for both chat and analysis to get pure model performance
        runner = VibeTestRunner(model=model_name, analysis_model=model_name)

        # Run the tests
        success = runner.run_all_tests(iterations=iterations)

        end_time = time.perf_counter()
        total_runtime = end_time - start_time

        # Get the detailed results from the runner
        detailed_results = runner.all_test_results

        # Aggregate statistics
        total_tests = sum(
            result["results"]["total_tests"] for result in detailed_results.values()
        )
        total_correct = sum(
            result["results"]["total_correct"] for result in detailed_results.values()
        )
        overall_success_rate = (
            (total_correct / total_tests * 100) if total_tests > 0 else 0
        )

        # Calculate overall timing statistics
        all_times = []
        for result in detailed_results.values():
            all_times.extend(result["results"]["overall_timing_stats"]["raw_times"])

        overall_timing = TimingStats(all_times) if all_times else TimingStats([])

        return {
            "model_config": model_config,
            "success": success,
            "total_runtime": total_runtime,
            "summary": {
                "total_tests": total_tests,
                "total_correct": total_correct,
                "overall_success_rate": overall_success_rate,
                "overall_timing_stats": overall_timing.to_dict(),
            },
            "detailed_results": detailed_results,
            "timestamp": datetime.now().isoformat(),
            "iterations": iterations,
        }

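    # Shape assumed for VibeTestRunner.all_test_results (inferred from the keys read
    # above and in save_results_json; vibe_tests.py is the authoritative source):
    #   {skill_name: {"passed": bool,
    #                 "results": {"action_name": str, "action_description": str,
    #                             "success_rate": float, "total_tests": int,
    #                             "total_correct": int,
    #                             "overall_timing_stats": {"raw_times": [...], ...},
    #                             "phrase_results": {phrase: {...}}}}}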

    def run_all_model_tests(self, iterations: Optional[int] = None) -> bool:
        """Run vibe tests for all configured models.

        Args:
            iterations: Override for number of iterations (uses config default if None)

        Returns:
            True if all models passed their tests, False otherwise
        """
        if iterations is None:
            iterations = self.config["test_config"]["iterations"]

        # Filter only enabled models
        enabled_models = [m for m in self.config["models"] if m.get("enabled", True)]
        total_models = len(enabled_models)
        disabled_count = len(self.config["models"]) - total_models

        print("🌟 Multi-Model Vibe Test Suite")
        print(
            f"📋 Testing {total_models} enabled models with {iterations} iterations each"
        )
        if disabled_count > 0:
            print(f"⚠️ Skipping {disabled_count} disabled models")
        print(f"📊 Collecting runtime statistics and performance metrics")
        print("=" * 80)

        # Check Ollama availability
        if not self.client.is_available():
            print(
                "❌ Ollama server is not available. Please start it with: ollama serve"
            )
            return False

        all_success = True
        self.all_results = {}

        for i, model_config in enumerate(enabled_models, 1):
            model_name = model_config["name"]
            model_timeout = model_config.get("timeout", 60)

            print(
                f"\n[{i}/{total_models}] Preparing to test {model_name}..."
            )
            print(f"⏱️ Model timeout: {model_timeout}s")

            # Check if model is available with model-specific timeout
            if not self.check_model_availability(model_name, model_timeout):
                print(f"❌ Skipping {model_name} - not available")
                continue

            # Run tests for this model
            try:
                results = self.run_tests_for_model(model_config, iterations)
                self.all_results[model_name] = results

                if not results["success"]:
                    all_success = False

            except Exception as e:
                print(f"❌ Error testing {model_name}: {e}")
                all_success = False
                continue

        # Generate comparison report
        self._print_comparison_summary()

        return all_success

    def _print_comparison_summary(self):
        """Print a comparison summary of all tested models."""
        if not self.all_results:
            return

        print("\n" + "=" * 80)
        print("📊 Multi-Model Comparison Summary")
        print("=" * 80)

        # Sort models by overall success rate
        sorted_models = sorted(
            self.all_results.items(),
            key=lambda x: x[1]["summary"]["overall_success_rate"],
            reverse=True,
        )

        print(
            f"{'Model':<20} {'Success Rate':<12} {'Avg Time':<10} {'Consistency':<12} {'Status':<10}"
        )
        print("-" * 70)

        for model_name, results in sorted_models:
            display_name = results["model_config"]["display_name"]
            success_rate = results["summary"]["overall_success_rate"]
            avg_time = results["summary"]["overall_timing_stats"]["mean"]
            consistency = results["summary"]["overall_timing_stats"][
                "consistency_score"
            ]
            status = "✅ PASS" if results["success"] else "❌ FAIL"

            print(
                f"{display_name:<20} {success_rate:>6.1f}% {avg_time:>6.2f}s {consistency:>6.1f}/100 {status}"
            )

        # Performance insights
        fastest_model = min(
            sorted_models, key=lambda x: x[1]["summary"]["overall_timing_stats"]["mean"]
        )
        most_consistent = max(
            sorted_models,
            key=lambda x: x[1]["summary"]["overall_timing_stats"]["consistency_score"],
        )

        print(f"\n🏆 Performance Insights:")
        print(
            f"  Fastest: {fastest_model[1]['model_config']['display_name']} ({fastest_model[1]['summary']['overall_timing_stats']['mean']:.2f}s avg)"
        )
        print(
            f"  Most Consistent: {most_consistent[1]['model_config']['display_name']} ({most_consistent[1]['summary']['overall_timing_stats']['consistency_score']:.1f}/100)"
        )

        total_tests = sum(
            r["summary"]["total_tests"] for r in self.all_results.values()
        )
        total_time = sum(r["total_runtime"] for r in self.all_results.values())
        print(f"  Total Tests: {total_tests}")
        print(f"  Total Runtime: {total_time:.1f}s")

    def save_results_json(self, output_path: str) -> str:
        """Save detailed results to JSON file for GitHub Pages.

        Args:
            output_path: Path where to save the results

        Returns:
            Path to the saved file
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Create the complete results structure for GitHub Pages
        github_results = {
            "metadata": {
                "generated_at": datetime.now().isoformat(),
                "config_file": str(self.config_path),
                "total_models_tested": len(self.all_results),
                "test_config": self.config["test_config"],
            },
            "models": [],
        }

        for model_name, results in self.all_results.items():
            # Process detailed results for GitHub Pages format
            processed_skills = {}
            for skill_name, skill_data in results["detailed_results"].items():
                processed_skills[skill_name] = {
                    "action_name": skill_data["results"]["action_name"],
                    "action_description": skill_data["results"]["action_description"],
                    "passed": skill_data["passed"],
                    "success_rate": skill_data["results"]["success_rate"],
                    "total_tests": skill_data["results"]["total_tests"],
                    "total_correct": skill_data["results"]["total_correct"],
                    "timing_stats": skill_data["results"]["overall_timing_stats"],
                    "phrase_results": {},
                }

                # Process phrase-level results
                for phrase, phrase_data in skill_data["results"][
                    "phrase_results"
                ].items():
                    processed_skills[skill_name]["phrase_results"][phrase] = {
                        "success_rate": phrase_data["success_rate"],
                        "timing_stats": phrase_data["timing_stats"],
                        "expected_params": phrase_data["expected_params"],
                        "secondary_actions": phrase_data["secondary_action_counts"],
                    }

            model_result = {
                "model_name": model_name,
                "display_name": results["model_config"]["display_name"],
                "description": results["model_config"]["description"],
                "overall_success": results["success"],
                "summary": results["summary"],
                "skills": processed_skills,
                "timestamp": results["timestamp"],
                "iterations": results["iterations"],
            }
            github_results["models"].append(model_result)

        # Save to file
        with open(output_path, "w") as f:
            json.dump(github_results, f, indent=2)

        print(f"📁 Detailed results saved to: {output_path}")
        return str(output_path)

    def get_results_summary(self) -> Dict[str, Any]:
        """Get a summary of results for external use.

        Returns:
            Dictionary with summary statistics
        """
        if not self.all_results:
            return {}

        return {
            "total_models": len(self.all_results),
            "models_passed": sum(1 for r in self.all_results.values() if r["success"]),
            "models_failed": sum(
                1 for r in self.all_results.values() if not r["success"]
            ),
            "average_success_rate": sum(
                r["summary"]["overall_success_rate"] for r in self.all_results.values()
            )
            / len(self.all_results),
            "total_runtime": sum(r["total_runtime"] for r in self.all_results.values()),
            "fastest_model": (
                min(
                    self.all_results.items(),
                    key=lambda x: x[1]["summary"]["overall_timing_stats"]["mean"],
                )[0]
                if self.all_results
                else None
            ),
            "most_accurate_model": (
                max(
                    self.all_results.items(),
                    key=lambda x: x[1]["summary"]["overall_success_rate"],
                )[0]
                if self.all_results
                else None
            ),
        }


def run_multi_model_tests(
    config_path: Optional[str] = None,
    iterations: Optional[int] = None,
    output_path: Optional[str] = None,
) -> bool:
    """Convenience function to run multi-model vibe tests.

    Args:
        config_path: Path to model configuration file
        iterations: Number of iterations per test
        output_path: Path to save detailed JSON results

    Returns:
        True if all models passed tests, False otherwise
    """
    runner = MultiModelVibeTestRunner(config_path)
    success = runner.run_all_model_tests(iterations)

    if output_path:
        runner.save_results_json(output_path)

    return success
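

# Minimal usage sketch: running this module directly exercises the full multi-model
# suite with the default config and iteration count. The output path is only an
# example; point it wherever the GitHub Pages data should live.
if __name__ == "__main__":
    import sys

    passed = run_multi_model_tests(output_path="vibe_test_results.json")
    sys.exit(0 if passed else 1)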