Coverage for src/ollamapy/multi_model_vibe_tests.py: 0% (153 statements), coverage.py v7.10.6, created at 2025-09-01 12:29 -0400

1"""Multi-model vibe test runner for comprehensive model comparison and GitHub Pages integration.""" 

2 

3import json 

4import time 

5import os 

6from pathlib import Path 

7from typing import Dict, List, Any, Tuple, Optional 

8from datetime import datetime 

9 

10from .vibe_tests import VibeTestRunner, TimingStats 

11from .ollama_client import OllamaClient 

12 

13 

class MultiModelVibeTestRunner:
    """Runs vibe tests across multiple models and generates comprehensive reports for GitHub Pages."""

    def __init__(self, config_path: Optional[str] = None):
        """Initialize the multi-model vibe test runner.

        Args:
            config_path: Path to the model configuration file
        """
        if config_path is None:
            # Default to config/vibe_test_models.json
            project_root = Path(__file__).parent.parent.parent
            config_path = project_root / "config" / "vibe_test_models.json"

        self.config_path = Path(config_path)
        self.config = self._load_config()
        self.client = OllamaClient()
        self.all_results = {}

    def _load_config(self) -> Dict[str, Any]:
        """Load the model configuration."""
        try:
            with open(self.config_path, "r") as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"❌ Configuration file not found: {self.config_path}")
            return self._get_default_config()
        except json.JSONDecodeError as e:
            print(f"❌ Invalid JSON in configuration file: {e}")
            return self._get_default_config()

    def _get_default_config(self) -> Dict[str, Any]:
        """Get default configuration if config file is missing."""
        return {
            "models": [
                {
                    "name": "gemma3:4b",
                    "display_name": "Gemma 3 4B",
                    "description": "Compact 4B parameter model optimized for speed",
                }
            ],
            "test_config": {
                "iterations": 5,
                "timeout": 120,
                "collect_runtime_stats": True,
                "include_performance_metrics": True,
            },
        }

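    # For reference, config/vibe_test_models.json is assumed to mirror the default
    # structure above; the per-model "enabled" and "timeout" keys (read later in
    # run_all_model_tests and check_model_availability) are optional, e.g.:
    #
    #   {
    #     "models": [
    #       {
    #         "name": "gemma3:4b",
    #         "display_name": "Gemma 3 4B",
    #         "description": "Compact 4B parameter model optimized for speed",
    #         "enabled": true,
    #         "timeout": 60
    #       }
    #     ],
    #     "test_config": {
    #       "iterations": 5,
    #       "timeout": 120,
    #       "collect_runtime_stats": true,
    #       "include_performance_metrics": true
    #     }
    #   }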

    def check_model_availability(self, model_name: str, timeout: int = 60) -> bool:
        """Check if a model is available in Ollama.

        Args:
            model_name: Name of the model to check
            timeout: Timeout in seconds for the availability check
        """
        try:
            # Try to generate a simple response to test availability with timeout
            import signal
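            # Note: signal.SIGALRM is only available on Unix-like systems, and alarms
            # can only be set from the main thread; on other platforms this check
            # would need a different timeout mechanism (e.g. a worker thread).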

            def timeout_handler(signum, frame):
                raise TimeoutError(
                    f"Model availability check timed out after {timeout}s"
                )

            # Set timeout for the check
            signal.signal(signal.SIGALRM, timeout_handler)
            signal.alarm(timeout)

            try:
                response = self.client.generate(model=model_name, prompt="Hello")
                signal.alarm(0)  # Cancel timeout
                return response is not None
            finally:
                signal.alarm(0)  # Ensure timeout is cancelled

        except TimeoutError as e:
            print(f"❌ Model {model_name} availability check timed out: {e}")
            return False
        except Exception as e:
            print(f"❌ Model {model_name} not available: {e}")
            return False

    def run_tests_for_model(
        self, model_config: Dict[str, str], iterations: int
    ) -> Dict[str, Any]:
        """Run vibe tests for a single model.

        Args:
            model_config: Model configuration dictionary
            iterations: Number of iterations per test

        Returns:
            Dictionary containing test results and metadata
        """
        model_name = model_config["name"]
        print(f"\n🧪 Testing Model: {model_config['display_name']} ({model_name})")
        print(f"📄 Description: {model_config['description']}")
        print("=" * 80)

        start_time = time.perf_counter()

        # Create a vibe test runner for this model
        # Use the same model for both chat and analysis to get pure model performance
        runner = VibeTestRunner(model=model_name, analysis_model=model_name)

        # Run the tests
        success = runner.run_all_tests(iterations=iterations)

        end_time = time.perf_counter()
        total_runtime = end_time - start_time

        # Get the detailed results from the runner
        detailed_results = runner.all_test_results

        # Aggregate statistics
        total_tests = sum(
            result["results"]["total_tests"] for result in detailed_results.values()
        )
        total_correct = sum(
            result["results"]["total_correct"] for result in detailed_results.values()
        )
        overall_success_rate = (
            (total_correct / total_tests * 100) if total_tests > 0 else 0
        )

        # Calculate overall timing statistics
        all_times = []
        for result in detailed_results.values():
            all_times.extend(result["results"]["overall_timing_stats"]["raw_times"])

        overall_timing = TimingStats(all_times) if all_times else TimingStats([])

        return {
            "model_config": model_config,
            "success": success,
            "total_runtime": total_runtime,
            "summary": {
                "total_tests": total_tests,
                "total_correct": total_correct,
                "overall_success_rate": overall_success_rate,
                "overall_timing_stats": overall_timing.to_dict(),
            },
            "detailed_results": detailed_results,
            "timestamp": datetime.now().isoformat(),
            "iterations": iterations,
        }

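    # Shape assumed for VibeTestRunner.all_test_results (inferred from the keys read
    # above and in save_results_json; vibe_tests.py is the authoritative source):
    #   {skill_name: {"passed": bool,
    #                 "results": {"action_name": str, "action_description": str,
    #                             "success_rate": float, "total_tests": int,
    #                             "total_correct": int,
    #                             "overall_timing_stats": {"raw_times": [...], ...},
    #                             "phrase_results": {phrase: {...}}}}}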

    def run_all_model_tests(self, iterations: Optional[int] = None) -> bool:
        """Run vibe tests for all configured models.

        Args:
            iterations: Override for number of iterations (uses config default if None)

        Returns:
            True if all models passed their tests, False otherwise
        """
        if iterations is None:
            iterations = self.config["test_config"]["iterations"]

        # Filter only enabled models
        enabled_models = [m for m in self.config["models"] if m.get("enabled", True)]
        total_models = len(enabled_models)
        disabled_count = len(self.config["models"]) - total_models

        print("🌟 Multi-Model Vibe Test Suite")
        print(
            f"📋 Testing {total_models} enabled models with {iterations} iterations each"
        )
        if disabled_count > 0:
            print(f"⚠️ Skipping {disabled_count} disabled models")
        print(f"📊 Collecting runtime statistics and performance metrics")
        print("=" * 80)

        # Check Ollama availability
        if not self.client.is_available():
            print(
                "❌ Ollama server is not available. Please start it with: ollama serve"
            )
            return False

        all_success = True
        self.all_results = {}

        for i, model_config in enumerate(enabled_models, 1):
            model_name = model_config["name"]
            model_timeout = model_config.get("timeout", 60)

            print(
                f"\n[{i}/{total_models}] Preparing to test {model_name}..."
            )
            print(f"⏱️ Model timeout: {model_timeout}s")

            # Check if model is available with model-specific timeout
            if not self.check_model_availability(model_name, model_timeout):
                print(f"❌ Skipping {model_name} - not available")
                continue

            # Run tests for this model
            try:
                results = self.run_tests_for_model(model_config, iterations)
                self.all_results[model_name] = results

                if not results["success"]:
                    all_success = False

            except Exception as e:
                print(f"❌ Error testing {model_name}: {e}")
                all_success = False
                continue

        # Generate comparison report
        self._print_comparison_summary()

        return all_success

    def _print_comparison_summary(self):
        """Print a comparison summary of all tested models."""
        if not self.all_results:
            return

        print("\n" + "=" * 80)
        print("📊 Multi-Model Comparison Summary")
        print("=" * 80)

        # Sort models by overall success rate
        sorted_models = sorted(
            self.all_results.items(),
            key=lambda x: x[1]["summary"]["overall_success_rate"],
            reverse=True,
        )

        print(
            f"{'Model':<20} {'Success Rate':<12} {'Avg Time':<10} {'Consistency':<12} {'Status':<10}"
        )
        print("-" * 70)

        for model_name, results in sorted_models:
            display_name = results["model_config"]["display_name"]
            success_rate = results["summary"]["overall_success_rate"]
            avg_time = results["summary"]["overall_timing_stats"]["mean"]
            consistency = results["summary"]["overall_timing_stats"][
                "consistency_score"
            ]
            status = "✅ PASS" if results["success"] else "❌ FAIL"

            print(
                f"{display_name:<20} {success_rate:>6.1f}% {avg_time:>6.2f}s {consistency:>6.1f}/100 {status}"
            )

        # Performance insights
        fastest_model = min(
            sorted_models, key=lambda x: x[1]["summary"]["overall_timing_stats"]["mean"]
        )
        most_consistent = max(
            sorted_models,
            key=lambda x: x[1]["summary"]["overall_timing_stats"]["consistency_score"],
        )

        print(f"\n🏆 Performance Insights:")
        print(
            f"  Fastest: {fastest_model[1]['model_config']['display_name']} ({fastest_model[1]['summary']['overall_timing_stats']['mean']:.2f}s avg)"
        )
        print(
            f"  Most Consistent: {most_consistent[1]['model_config']['display_name']} ({most_consistent[1]['summary']['overall_timing_stats']['consistency_score']:.1f}/100)"
        )

        total_tests = sum(
            r["summary"]["total_tests"] for r in self.all_results.values()
        )
        total_time = sum(r["total_runtime"] for r in self.all_results.values())
        print(f"  Total Tests: {total_tests}")
        print(f"  Total Runtime: {total_time:.1f}s")

    def save_results_json(self, output_path: str) -> str:
        """Save detailed results to JSON file for GitHub Pages.

        Args:
            output_path: Path where to save the results

        Returns:
            Path to the saved file
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Create the complete results structure for GitHub Pages
        github_results = {
            "metadata": {
                "generated_at": datetime.now().isoformat(),
                "config_file": str(self.config_path),
                "total_models_tested": len(self.all_results),
                "test_config": self.config["test_config"],
            },
            "models": [],
        }

        for model_name, results in self.all_results.items():
            # Process detailed results for GitHub Pages format
            processed_skills = {}
            for skill_name, skill_data in results["detailed_results"].items():
                processed_skills[skill_name] = {
                    "action_name": skill_data["results"]["action_name"],
                    "action_description": skill_data["results"]["action_description"],
                    "passed": skill_data["passed"],
                    "success_rate": skill_data["results"]["success_rate"],
                    "total_tests": skill_data["results"]["total_tests"],
                    "total_correct": skill_data["results"]["total_correct"],
                    "timing_stats": skill_data["results"]["overall_timing_stats"],
                    "phrase_results": {},
                }

                # Process phrase-level results
                for phrase, phrase_data in skill_data["results"][
                    "phrase_results"
                ].items():
                    processed_skills[skill_name]["phrase_results"][phrase] = {
                        "success_rate": phrase_data["success_rate"],
                        "timing_stats": phrase_data["timing_stats"],
                        "expected_params": phrase_data["expected_params"],
                        "secondary_actions": phrase_data["secondary_action_counts"],
                    }

            model_result = {
                "model_name": model_name,
                "display_name": results["model_config"]["display_name"],
                "description": results["model_config"]["description"],
                "overall_success": results["success"],
                "summary": results["summary"],
                "skills": processed_skills,
                "timestamp": results["timestamp"],
                "iterations": results["iterations"],
            }
            github_results["models"].append(model_result)

        # Save to file
        with open(output_path, "w") as f:
            json.dump(github_results, f, indent=2)

        print(f"📁 Detailed results saved to: {output_path}")
        return str(output_path)

    def get_results_summary(self) -> Dict[str, Any]:
        """Get a summary of results for external use.

        Returns:
            Dictionary with summary statistics
        """
        if not self.all_results:
            return {}

        return {
            "total_models": len(self.all_results),
            "models_passed": sum(1 for r in self.all_results.values() if r["success"]),
            "models_failed": sum(
                1 for r in self.all_results.values() if not r["success"]
            ),
            "average_success_rate": sum(
                r["summary"]["overall_success_rate"] for r in self.all_results.values()
            )
            / len(self.all_results),
            "total_runtime": sum(r["total_runtime"] for r in self.all_results.values()),
            "fastest_model": (
                min(
                    self.all_results.items(),
                    key=lambda x: x[1]["summary"]["overall_timing_stats"]["mean"],
                )[0]
                if self.all_results
                else None
            ),
            "most_accurate_model": (
                max(
                    self.all_results.items(),
                    key=lambda x: x[1]["summary"]["overall_success_rate"],
                )[0]
                if self.all_results
                else None
            ),
        }


def run_multi_model_tests(
    config_path: Optional[str] = None,
    iterations: Optional[int] = None,
    output_path: Optional[str] = None,
) -> bool:
    """Convenience function to run multi-model vibe tests.

    Args:
        config_path: Path to model configuration file
        iterations: Number of iterations per test
        output_path: Path to save detailed JSON results

    Returns:
        True if all models passed tests, False otherwise
    """
    runner = MultiModelVibeTestRunner(config_path)
    success = runner.run_all_model_tests(iterations)

    if output_path:
        runner.save_results_json(output_path)

    return success
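

# Minimal usage sketch: running this module directly exercises the full multi-model
# suite with the default config and iteration count. The output path is only an
# example; point it wherever the GitHub Pages data should live.
if __name__ == "__main__":
    import sys

    passed = run_multi_model_tests(output_path="vibe_test_results.json")
    sys.exit(0 if passed else 1)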