# Coverage report source: src/ollamapy/vibe_tests.py (12% covered, 275 statements; coverage.py v7.10.6, 2025-09-01).

1"""Built-in vibe tests for evaluating AI decision-making consistency with timing analysis and visual reporting.""" 

2 

3import re 

4import time 

5import statistics 

6from typing import List, Dict, Tuple, Any 

7from .ollama_client import OllamaClient 

8from .model_manager import ModelManager 

9from .analysis_engine import AnalysisEngine 

10from .skills import get_actions_with_vibe_tests, clear_action_logs 

11from .vibe_report import VibeTestReportGenerator 

12 

13 

class TimingStats:
    """Container for timing statistics with helpful analysis methods."""

    def __init__(self, times: List[float]):
        """Initialize timing stats from a list of execution times.

        Args:
            times: List of execution times in seconds
        """
        self.times = times
        self.count = len(times)

        if times:
            self.mean = statistics.mean(times)
            self.median = statistics.median(times)
            self.min = min(times)
            self.max = max(times)
            self.std_dev = statistics.stdev(times) if len(times) > 1 else 0.0

            # Calculate percentiles for more insight
            sorted_times = sorted(times)
            self.p25 = self._percentile(sorted_times, 25)
            self.p75 = self._percentile(sorted_times, 75)
            self.p95 = self._percentile(sorted_times, 95)
        else:
            self.mean = self.median = self.min = self.max = self.std_dev = 0.0
            self.p25 = self.p75 = self.p95 = 0.0

    def _percentile(self, sorted_data: List[float], percentile: float) -> float:
        """Calculate percentile from sorted data."""
        if not sorted_data:
            return 0.0

        k = (len(sorted_data) - 1) * (percentile / 100.0)
        f = int(k)
        c = k - f

        if f + 1 < len(sorted_data):
            return sorted_data[f] + c * (sorted_data[f + 1] - sorted_data[f])
        else:
            return sorted_data[f]

    @property
    def consistency_score(self) -> float:
        """Calculate a consistency score (0-100) based on timing variability.

        Lower variance relative to mean indicates higher consistency.

        Returns:
            Consistency score from 0 (very inconsistent) to 100 (very consistent)
        """
        if self.mean == 0 or self.count < 2:
            return 100.0

        # Coefficient of variation (CV) = std_dev / mean
        cv = self.std_dev / self.mean

        # Convert CV to consistency score (lower CV = higher consistency).
        # With this mapping, a CV of 0.05 (5%) scores 90, a CV of 0.25 (25%)
        # scores 50, and a CV of 0.5 (50%) or more scores 0.
        consistency = max(0, 100 - (cv * 200))
        return min(100, consistency)

    @property
    def performance_category(self) -> str:
        """Categorize performance based on mean response time."""
        if self.mean < 1.0:
            return "Very Fast"
        elif self.mean < 2.0:
            return "Fast"
        elif self.mean < 5.0:
            return "Moderate"
        elif self.mean < 10.0:
            return "Slow"
        else:
            return "Very Slow"

    def to_dict(self) -> Dict[str, Any]:
        """Convert timing stats to dictionary for serialization."""
        return {
            "count": self.count,
            "mean": self.mean,
            "median": self.median,
            "min": self.min,
            "max": self.max,
            "std_dev": self.std_dev,
            "p25": self.p25,
            "p75": self.p75,
            "p95": self.p95,
            "consistency_score": self.consistency_score,
            "performance_category": self.performance_category,
            "raw_times": self.times,
        }

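# Illustrative use of TimingStats (values hand-checked, not from a real run):
#
#   stats = TimingStats([1.0, 1.2, 1.1])
#   stats.mean                  # 1.1
#   stats.std_dev               # 0.1 (sample standard deviation)
#   stats.consistency_score     # ~81.8, since cv = 0.1 / 1.1 and score = 100 - cv * 200
#   stats.performance_category  # "Fast" (1.0 <= mean < 2.0)
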

class VibeTestRunner:
    """Built-in vibe test runner with multi-action support, timing analysis, and visual reporting.

    Tests check whether the target action is selected, regardless of what other
    actions might also be selected, and generate comprehensive visual reports
    including timing analysis.
    """

    def __init__(self, model: str = "gemma3:4b", analysis_model: str = None):
        """Initialize the vibe test runner.

        Args:
            model: The model to use for testing
            analysis_model: Optional separate model for action analysis (defaults to main model)
        """
        self.model = model
        self.analysis_model = analysis_model or model
        self.client = OllamaClient()
        self.model_manager = ModelManager(self.client)
        self.analysis_engine = AnalysisEngine(self.analysis_model, self.client)
        self.actions_with_tests = get_actions_with_vibe_tests()
        self.all_test_results = {}  # Store all results for report generation

    def check_prerequisites(self) -> bool:
        """Check if Ollama is available and models can be used."""
        success, main_status, analysis_status = (
            self.model_manager.ensure_models_available(self.model, self.analysis_model)
        )

        if not success:
            print(
                "❌ Error: Ollama is not reachable or the required models are unavailable!"
            )
            print("Please start Ollama with: ollama serve")
            return False

        return True

    def extract_expected_parameters(
        self, phrase: str, action_name: str
    ) -> Dict[str, Any]:
        """Extract expected parameter values from test phrases.

        Args:
            phrase: The test phrase
            action_name: The action being tested

        Returns:
            Dictionary of expected parameter values
        """
        expected_params = {}

        # Extract numbers for square_root
        if action_name == "square_root":
            # Look for numbers in the phrase
            numbers = re.findall(r"\d+(?:\.\d+)?", phrase)
            if numbers:
                expected_params["number"] = float(numbers[0])

        # Extract expressions for calculate
        elif action_name == "calculate":
            # Look for mathematical expressions
            # Simple pattern for basic arithmetic
            expr_match = re.search(r"(\d+\s*[+\-*/]\s*\d+)", phrase)
            if expr_match:
                expected_params["expression"] = expr_match.group(1).replace(" ", "")

        # Extract location for weather (if mentioned)
        elif action_name == "getWeather":
            # Look for common city names or location indicators
            location_keywords = ["in", "at", "for"]
            for keyword in location_keywords:
                if keyword in phrase.lower():
                    parts = phrase.lower().split(keyword)
                    if len(parts) > 1:
                        potential_location_parts = parts[1].strip().split()
                        if potential_location_parts:
                            potential_location = potential_location_parts[0]
                            if len(potential_location) > 2:
                                expected_params["location"] = potential_location
                    break

        return expected_params
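
    # Illustrative behaviour of the extraction heuristics above (phrases are
    # assumptions, hand-traced against the regexes rather than taken from a run):
    #   extract_expected_parameters("what is the square root of 16", "square_root")
    #       -> {"number": 16.0}
    #   extract_expected_parameters("please calculate 2 + 3", "calculate")
    #       -> {"expression": "2+3"}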

    def time_analysis_execution(
        self, phrase: str
    ) -> Tuple[List[Tuple[str, Dict[str, Any]]], float]:
        """Time the execution of analysis engine action selection.

        Args:
            phrase: The phrase to analyze

        Returns:
            Tuple of (selected_actions, execution_time_seconds)
        """
        start_time = time.perf_counter()

        try:
            # Clear any previous logs
            clear_action_logs()

            # Run the multi-action analysis
            selected_actions = self.analysis_engine.select_all_applicable_actions(
                phrase
            )

            end_time = time.perf_counter()
            execution_time = end_time - start_time

            return selected_actions, execution_time

        except Exception as e:
            end_time = time.perf_counter()
            execution_time = end_time - start_time
            print(f"❌ Error during analysis timing: {e}")
            return [], execution_time
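
    # Note: time.perf_counter() is a monotonic, high-resolution clock, so the
    # measured interval is unaffected by system clock adjustments. An
    # illustrative return value (action name and timing are assumptions):
    #   ([("square_root", {"number": 16.0})], 1.42)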

    def run_action_test(
        self, action_name: str, action_info: Dict, phrases: List[str], iterations: int
    ) -> Tuple[bool, Dict]:
        """Run a test on a specific action with its phrases, including timing analysis.

        Tests if the target action is selected (other actions may also be selected).

        Args:
            action_name: Name of the action being tested
            action_info: Information about the action (description, etc.)
            phrases: List of test phrases for this action
            iterations: Number of times to test each phrase

        Returns:
            Tuple of (success: bool, results: dict)
        """
        total_correct = 0
        total_tests = 0
        results = {}

        print(f"\n🧪 {action_name} Action Test")
        print(f"Chat Model: {self.model}")
        if self.analysis_model != self.model:
            print(f"Analysis Model: {self.analysis_model}")
        else:
            print("Using same model for analysis and chat")
        print("Mode: Multi-action selection (target action must be selected)")
        print("=" * 80)

        for phrase in phrases:
            phrase_correct = 0
            parameter_correct = 0
            expected_params = self.extract_expected_parameters(phrase, action_name)

            # Track secondary actions and timing per iteration for this phrase
            secondary_actions_per_iteration = []
            execution_times = []

            for i in range(iterations):
                try:
                    # Time the analysis execution
                    selected_actions, execution_time = self.time_analysis_execution(
                        phrase
                    )
                    execution_times.append(execution_time)

                    # Check if target action was selected and track secondary actions
                    action_found = False
                    params_match = False
                    iteration_secondary_actions = []

                    for selected_action, parameters in selected_actions:
                        if selected_action == action_name:
                            action_found = True
                            phrase_correct += 1

                            # Check parameters if expected
                            if expected_params:
                                params_match = True
                                for (
                                    param_name,
                                    expected_value,
                                ) in expected_params.items():
                                    if param_name in parameters:
                                        actual_value = parameters[param_name]
                                        # For numbers, check if they're close enough
                                        if isinstance(expected_value, (int, float)):
                                            try:
                                                actual_float = float(actual_value)
                                                if (
                                                    abs(actual_float - expected_value)
                                                    < 0.001
                                                ):
                                                    parameter_correct += 1
                                                else:
                                                    params_match = False
                                            except (TypeError, ValueError):
                                                params_match = False
                                        # For strings, check exact match
                                        elif str(actual_value) == str(expected_value):
                                            parameter_correct += 1
                                        else:
                                            params_match = False
                                    else:
                                        params_match = False
                        else:
                            # This is a secondary action
                            iteration_secondary_actions.append(selected_action)

                    secondary_actions_per_iteration.append(iteration_secondary_actions)
                    total_tests += 1

                except Exception as e:
                    print(f"❌ Error testing phrase iteration {i+1}: {e}")
                    secondary_actions_per_iteration.append([])
                    # Still record the time even if there was an error
                    if len(execution_times) <= i:
                        execution_times.append(0.0)
                    continue

            # Calculate secondary action frequencies
            secondary_action_counts = {}
            for iteration_actions in secondary_actions_per_iteration:
                for action in iteration_actions:
                    secondary_action_counts[action] = (
                        secondary_action_counts.get(action, 0) + 1
                    )

            # Calculate timing statistics
            timing_stats = TimingStats(execution_times)

            success_rate = (phrase_correct / iterations) * 100 if iterations > 0 else 0
            param_success_rate = (
                (parameter_correct / iterations) * 100
                if iterations > 0 and expected_params
                else 100
            )

            results[phrase] = {
                "correct": phrase_correct,
                "total": iterations,
                "success_rate": success_rate,
                "parameter_success_rate": param_success_rate,
                "expected_params": expected_params,
                "secondary_action_counts": secondary_action_counts,
                "secondary_actions_per_iteration": secondary_actions_per_iteration,
                "timing_stats": timing_stats.to_dict(),
            }
            total_correct += phrase_correct

            # Print individual results with timing
            phrase_display = phrase[:50] + "..." if len(phrase) > 50 else phrase
            print(f"Phrase: '{phrase_display}'")
            print(
                f"Target Action Selected: {phrase_correct}/{iterations} ({success_rate:.1f}%)"
            )
            if expected_params:
                print(
                    f"Parameter Success: {parameter_correct}/{iterations} ({param_success_rate:.1f}%)"
                )
                print(f"Expected params: {expected_params}")

            # Print timing analysis
            print("Timing Analysis:")
            print(
                f"  Average: {timing_stats.mean:.2f}s | Median: {timing_stats.median:.2f}s"
            )
            print(f"  Range: {timing_stats.min:.2f}s - {timing_stats.max:.2f}s")
            print(f"  Performance: {timing_stats.performance_category}")
            print(f"  Consistency: {timing_stats.consistency_score:.1f}/100")

            if secondary_action_counts:
                print("Secondary actions triggered:")
                for action, count in secondary_action_counts.items():
                    print(f"  - {action}: {count}/{iterations} times")
            print("-" * 40)

        overall_success_rate = (
            (total_correct / total_tests) * 100 if total_tests > 0 else 0
        )

        # Calculate overall timing statistics across all phrases
        all_times = []
        for phrase_results in results.values():
            all_times.extend(phrase_results["timing_stats"]["raw_times"])
        overall_timing = TimingStats(all_times)

        print(
            f"Overall Success Rate: {total_correct}/{total_tests} ({overall_success_rate:.1f}%)"
        )
        print(
            f"Overall Timing: {overall_timing.mean:.2f}s avg, {overall_timing.performance_category}, {overall_timing.consistency_score:.1f}/100 consistency"
        )

        test_passed = overall_success_rate >= 60.0
        return test_passed, {
            "action_name": action_name,
            "action_description": action_info.get("description", "No description"),
            "total_correct": total_correct,
            "total_tests": total_tests,
            "success_rate": overall_success_rate,
            "phrase_results": results,
            "overall_timing_stats": overall_timing.to_dict(),
        }
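
    # Illustrative shape of run_action_test's return value (numbers and text
    # are assumptions, not real results):
    #   (True, {
    #       "action_name": "square_root",
    #       "action_description": "Compute a square root",
    #       "total_correct": 4,
    #       "total_tests": 5,
    #       "success_rate": 80.0,
    #       "phrase_results": {...},        # per-phrase dicts built above
    #       "overall_timing_stats": {...},  # TimingStats.to_dict() output
    #   })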

    def run_all_tests(self, iterations: int = 1) -> bool:
        """Run all vibe tests for all actions that have test phrases.

        Args:
            iterations: Number of iterations per phrase

        Returns:
            True if all tests passed, False otherwise
        """
        print(
            "🧪 Running vibe tests with multi-action support, timing analysis, and visual reporting"
        )
        print(f"Chat model: {self.model}")
        if self.analysis_model != self.model:
            print(f"Analysis model: {self.analysis_model}")
        else:
            print("Using same model for analysis and chat")
        print("Analysis mode: Multi-action (target must be selected)")
        print(f"Iterations: {iterations}")
        print("=" * 80)

        # Check prerequisites
        if not self.check_prerequisites():
            return False

        print(f"✅ Using chat model: {self.model}")
        if self.analysis_model != self.model:
            print(f"✅ Using analysis model: {self.analysis_model}")
        print(
            "🧠 Testing AI's ability to select appropriate actions (multiple allowed)..."
        )
        print("⏱️ Including timing analysis for performance insights...")
        print(
            f"📋 Found {len(self.actions_with_tests)} actions with vibe test phrases\n"
        )

        if not self.actions_with_tests:
            print("❌ No actions with vibe test phrases found!")
            return False

        # Run tests for each action
        test_results = {}
        all_tests_passed = True
        overall_test_start = time.perf_counter()

        for action_name, action_info in self.actions_with_tests.items():
            test_phrases = action_info["vibe_test_phrases"]

            if not test_phrases:
                print(f"⚠️ Skipping {action_name} - no test phrases defined")
                continue

            test_passed, results = self.run_action_test(
                action_name, action_info, test_phrases, iterations
            )

            test_results[action_name] = {"passed": test_passed, "results": results}

            if not test_passed:
                all_tests_passed = False

        overall_test_time = time.perf_counter() - overall_test_start

        # Store results for report generation
        self.all_test_results = test_results

        # Generate and save the HTML report using the report generator
        report_generator = VibeTestReportGenerator(self.model, self.analysis_model)
        filename = report_generator.save_report(test_results)
        print(f"\n📊 Report saved to: {filename}")
        print("  Open in your browser to view interactive charts with timing analysis")

        # Final results summary with timing
        print("\n📊 Final Test Results:")
        print("=" * 50)

        fastest_action = None
        slowest_action = None
        fastest_time = float("inf")
        slowest_time = 0.0

        for action_name, test_data in test_results.items():
            status_icon = "✅ PASSED" if test_data["passed"] else "❌ FAILED"
            success_rate = test_data["results"]["success_rate"]
            avg_time = test_data["results"]["overall_timing_stats"]["mean"]
            consistency = test_data["results"]["overall_timing_stats"][
                "consistency_score"
            ]

            print(f"{action_name} Action Test: {status_icon} ({success_rate:.1f}%)")
            print(
                f"  Performance: {avg_time:.2f}s avg, {consistency:.1f}/100 consistency"
            )

            if avg_time < fastest_time:
                fastest_time = avg_time
                fastest_action = action_name
            if avg_time > slowest_time:
                slowest_time = avg_time
                slowest_action = action_name

        status_icon = "✅" if all_tests_passed else "❌"
        status_text = "ALL TESTS PASSED" if all_tests_passed else "SOME TESTS FAILED"
        print(f"\nOverall Result: {status_icon} {status_text}")
        print(f"Total Test Duration: {overall_test_time:.2f}s")

        if fastest_action and slowest_action:
            print(
                f"Performance Range: {fastest_action} ({fastest_time:.2f}s) → {slowest_action} ({slowest_time:.2f}s)"
            )

        if not all_tests_passed:
            print("\n💡 Tips for improving results:")
            print("  • Try a different model with --model")
            print("  • Try a different analysis model with --analysis-model")
            print("  • Use a smaller, faster model for analysis (e.g., gemma2:2b)")
            print("  • Increase iterations with -n for better statistics")
            print("  • Ensure Ollama server is running optimally")
            print("  • Check action descriptions and test phrases for clarity")

        return all_tests_passed

    def run_quick_test(self) -> bool:
        """Run a quick single-iteration test for fast feedback."""
        print("🚀 Running quick vibe test (1 iteration each)...")
        return self.run_all_tests(iterations=1)

    def run_statistical_test(self, iterations: int = 5) -> bool:
        """Run a statistical test with multiple iterations."""
        print(f"📊 Running statistical vibe test ({iterations} iterations each)...")
        return self.run_all_tests(iterations=iterations)


def run_vibe_tests(
    model: str = "gemma3:4b", iterations: int = 1, analysis_model: str = None
) -> bool:
    """Convenience function to run vibe tests with timing analysis and visual reporting.

    Args:
        model: The model to use for testing
        iterations: Number of iterations per test
        analysis_model: Optional separate model for action analysis (defaults to main model)

    Returns:
        True if all tests passed, False otherwise
    """
    runner = VibeTestRunner(model=model, analysis_model=analysis_model)
    return runner.run_all_tests(iterations=iterations)
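

# Minimal sketch of a direct invocation (assumed entry point; the package may
# expose its own CLI command instead):
if __name__ == "__main__":
    import sys

    # Exit non-zero when any vibe test fails so CI can pick up the result.
    sys.exit(0 if run_vibe_tests(model="gemma3:4b", iterations=1) else 1)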