Coverage for src/ollamapy/vibe_tests.py: 12%
275 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-01 12:29 -0400
1"""Built-in vibe tests for evaluating AI decision-making consistency with timing analysis and visual reporting."""
3import re
4import time
5import statistics
6from typing import List, Dict, Tuple, Any
7from .ollama_client import OllamaClient
8from .model_manager import ModelManager
9from .analysis_engine import AnalysisEngine
10from .skills import get_actions_with_vibe_tests, clear_action_logs
11from .vibe_report import VibeTestReportGenerator
class TimingStats:
    """Summary statistics for a batch of execution times.

    Central tendency, spread, and interpolated percentiles are computed once
    at construction; derived consistency and performance ratings are exposed
    as properties.
    """

    def __init__(self, times: List[float]):
        """Compute summary statistics for *times*.

        Args:
            times: Execution times in seconds (may be empty).
        """
        self.times = times
        self.count = len(times)

        if not times:
            # No samples at all: zero out every statistic.
            self.mean = self.median = self.min = self.max = self.std_dev = 0.0
            self.p25 = self.p75 = self.p95 = 0.0
            return

        self.mean = statistics.mean(times)
        self.median = statistics.median(times)
        self.min = min(times)
        self.max = max(times)
        # stdev requires at least two samples.
        self.std_dev = statistics.stdev(times) if self.count > 1 else 0.0

        # Percentiles give more insight than min/max alone.
        ordered = sorted(times)
        self.p25, self.p75, self.p95 = (
            self._percentile(ordered, pct) for pct in (25, 75, 95)
        )

    def _percentile(self, sorted_data: List[float], percentile: float) -> float:
        """Return the linearly interpolated percentile of pre-sorted data."""
        if not sorted_data:
            return 0.0

        rank = (len(sorted_data) - 1) * (percentile / 100.0)
        lower = int(rank)
        frac = rank - lower
        upper = lower + 1

        if upper >= len(sorted_data):
            return sorted_data[lower]
        return sorted_data[lower] + frac * (sorted_data[upper] - sorted_data[lower])

    @property
    def consistency_score(self) -> float:
        """Timing consistency on a 0-100 scale (higher = more consistent).

        Based on the coefficient of variation (std_dev / mean): lower relative
        variance means higher consistency. A CV of 0.1 maps to ~80, a CV of
        0.5 or more maps to 0.

        Returns:
            Consistency score from 0 (very inconsistent) to 100 (very consistent).
        """
        if self.mean == 0 or self.count < 2:
            # Degenerate cases (no spread measurable) count as fully consistent.
            return 100.0

        variation = self.std_dev / self.mean
        # Linear map: CV 0 -> 100, CV >= 0.5 -> 0, clamped to [0, 100].
        return min(100, max(0, 100 - (variation * 200)))

    @property
    def performance_category(self) -> str:
        """Human-readable speed bucket derived from the mean response time."""
        for threshold, label in (
            (1.0, "Very Fast"),
            (2.0, "Fast"),
            (5.0, "Moderate"),
            (10.0, "Slow"),
        ):
            if self.mean < threshold:
                return label
        return "Very Slow"

    def to_dict(self) -> Dict[str, Any]:
        """Serialize every statistic (including derived scores) to a dict."""
        summary = {
            name: getattr(self, name)
            for name in (
                "count",
                "mean",
                "median",
                "min",
                "max",
                "std_dev",
                "p25",
                "p75",
                "p95",
                "consistency_score",
                "performance_category",
            )
        }
        summary["raw_times"] = self.times
        return summary
class VibeTestRunner:
    """Built-in vibe test runner with multi-action support, timing analysis, and visual reporting.

    Tests check if the target action is selected, regardless of what other
    actions might also be selected, and generates comprehensive visual reports
    including timing analysis.
    """

    def __init__(self, model: str = "gemma3:4b", analysis_model: str = None):
        """Initialize the vibe test runner.

        Args:
            model: The model to use for testing
            analysis_model: Optional separate model for action analysis
                (defaults to the main chat model when None)
        """
        self.model = model
        # Fall back to the chat model when no dedicated analysis model is given.
        self.analysis_model = analysis_model or model
        self.client = OllamaClient()
        self.model_manager = ModelManager(self.client)
        self.analysis_engine = AnalysisEngine(self.analysis_model, self.client)
        self.actions_with_tests = get_actions_with_vibe_tests()
        self.all_test_results = {}  # Filled by run_all_tests() for report generation

    def check_prerequisites(self) -> bool:
        """Check if Ollama is available and models can be used.

        Returns:
            True if the server responded and both models are available.
        """
        # Only the overall success flag matters here; the per-model status
        # strings are intentionally ignored.
        success, _main_status, _analysis_status = (
            self.model_manager.ensure_models_available(self.model, self.analysis_model)
        )

        if not success:
            print("❌ Error: Ollama server is not running!")
            print("Please start Ollama with: ollama serve")
            return False

        return True

    def extract_expected_parameters(
        self, phrase: str, action_name: str
    ) -> Dict[str, Any]:
        """Extract expected parameter values from test phrases.

        Uses simple per-action heuristics: the first number for square_root,
        a basic two-operand arithmetic expression for calculate, and the word
        following "in"/"at"/"for" for getWeather.

        Args:
            phrase: The test phrase
            action_name: The action being tested

        Returns:
            Dictionary of expected parameter values (empty if none detected)
        """
        expected_params = {}

        # Extract numbers for square_root
        if action_name == "square_root":
            # Look for integers or decimals in the phrase
            numbers = re.findall(r"\d+(?:\.\d+)?", phrase)
            if numbers:
                expected_params["number"] = float(numbers[0])

        # Extract expressions for calculate
        elif action_name == "calculate":
            # Simple pattern for basic arithmetic (number operator number)
            expr_match = re.search(r"(\d+\s*[+\-*/]\s*\d+)", phrase)
            if expr_match:
                expected_params["expression"] = expr_match.group(1).replace(" ", "")

        # Extract location for weather (if mentioned)
        elif action_name == "getWeather":
            # Take the first word after a location keyword, if it is long
            # enough (> 2 chars) to be a plausible place name.
            location_keywords = ["in", "at", "for"]
            for keyword in location_keywords:
                if keyword in phrase.lower():
                    parts = phrase.lower().split(keyword)
                    if len(parts) > 1:
                        potential_location_parts = parts[1].strip().split()
                        if potential_location_parts:
                            potential_location = potential_location_parts[0]
                            if len(potential_location) > 2:
                                expected_params["location"] = potential_location
                                break

        return expected_params

    def time_analysis_execution(
        self, phrase: str
    ) -> Tuple[List[Tuple[str, Dict[str, Any]]], float]:
        """Time the execution of analysis engine action selection.

        Args:
            phrase: The phrase to analyze

        Returns:
            Tuple of (selected_actions, execution_time_seconds). On error the
            action list is empty but the elapsed time is still reported.
        """
        start_time = time.perf_counter()

        try:
            # Clear any previous logs so this run is isolated
            clear_action_logs()

            # Run the multi-action analysis
            selected_actions = self.analysis_engine.select_all_applicable_actions(
                phrase
            )

            execution_time = time.perf_counter() - start_time
            return selected_actions, execution_time

        except Exception as e:
            # Report the failure but still return how long the attempt took.
            execution_time = time.perf_counter() - start_time
            print(f"❌ Error during analysis timing: {e}")
            return [], execution_time

    def run_action_test(
        self, action_name: str, action_info: Dict, phrases: List[str], iterations: int
    ) -> Tuple[bool, Dict]:
        """Run a test on a specific action with its phrases, including timing analysis.

        Tests if the target action is selected (other actions may also be selected).

        Args:
            action_name: Name of the action being tested
            action_info: Information about the action (description, etc.)
            phrases: List of test phrases for this action
            iterations: Number of times to test each phrase

        Returns:
            Tuple of (success: bool, results: dict); success means the overall
            selection rate reached the 60% pass threshold.
        """
        total_correct = 0
        total_tests = 0
        results = {}

        print(f"\n🧪 {action_name} Action Test")
        print(f"Chat Model: {self.model}")
        if self.analysis_model != self.model:
            print(f"Analysis Model: {self.analysis_model}")
        else:
            print("Using same model for analysis and chat")
        print("Mode: Multi-action selection (target action must be selected)")
        print("=" * 80)

        for phrase in phrases:
            phrase_correct = 0
            parameter_correct = 0
            expected_params = self.extract_expected_parameters(phrase, action_name)

            # Track secondary actions and timing per iteration for this phrase
            secondary_actions_per_iteration = []
            execution_times = []

            for i in range(iterations):
                try:
                    # Time the analysis execution
                    selected_actions, execution_time = self.time_analysis_execution(
                        phrase
                    )
                    execution_times.append(execution_time)

                    # Check if target action was selected and track secondary actions
                    action_found = False
                    params_match = False
                    iteration_secondary_actions = []

                    for selected_action, parameters in selected_actions:
                        if selected_action == action_name:
                            action_found = True
                            phrase_correct += 1

                            # Check parameters if expected.
                            # NOTE(review): parameter_correct is incremented once
                            # per matching parameter, so with multiple expected
                            # params it can exceed `iterations`; params_match is
                            # computed but never read afterwards — confirm intent.
                            if expected_params:
                                params_match = True
                                for (
                                    param_name,
                                    expected_value,
                                ) in expected_params.items():
                                    if param_name in parameters:
                                        actual_value = parameters[param_name]
                                        # For numbers, check if they're close enough
                                        if isinstance(expected_value, (int, float)):
                                            try:
                                                actual_float = float(actual_value)
                                                if (
                                                    abs(actual_float - expected_value)
                                                    < 0.001
                                                ):
                                                    parameter_correct += 1
                                                else:
                                                    params_match = False
                                            except (TypeError, ValueError):
                                                # Non-numeric value where a
                                                # number was expected
                                                params_match = False
                                        # For strings, check exact match
                                        elif str(actual_value) == str(expected_value):
                                            parameter_correct += 1
                                        else:
                                            params_match = False
                                    else:
                                        params_match = False
                        else:
                            # This is a secondary action
                            iteration_secondary_actions.append(selected_action)

                    secondary_actions_per_iteration.append(iteration_secondary_actions)
                    total_tests += 1

                except Exception as e:
                    print(f"❌ Error testing phrase iteration {i+1}: {e}")
                    secondary_actions_per_iteration.append([])
                    # Still record the time even if there was an error
                    if len(execution_times) <= i:
                        execution_times.append(0.0)
                    continue

            # Calculate secondary action frequencies
            secondary_action_counts = {}
            for iteration_actions in secondary_actions_per_iteration:
                for action in iteration_actions:
                    secondary_action_counts[action] = (
                        secondary_action_counts.get(action, 0) + 1
                    )

            # Calculate timing statistics
            timing_stats = TimingStats(execution_times)

            success_rate = (phrase_correct / iterations) * 100 if iterations > 0 else 0
            # Parameter success only applies when parameters were expected.
            param_success_rate = (
                (parameter_correct / iterations) * 100
                if iterations > 0 and expected_params
                else 100
            )

            results[phrase] = {
                "correct": phrase_correct,
                "total": iterations,
                "success_rate": success_rate,
                "parameter_success_rate": param_success_rate,
                "expected_params": expected_params,
                "secondary_action_counts": secondary_action_counts,
                "secondary_actions_per_iteration": secondary_actions_per_iteration,
                "timing_stats": timing_stats.to_dict(),
            }
            total_correct += phrase_correct

            # Print individual results with timing
            phrase_display = phrase[:50] + "..." if len(phrase) > 50 else phrase
            print(f"Phrase: '{phrase_display}'")
            print(
                f"Target Action Selected: {phrase_correct}/{iterations} ({success_rate:.1f}%)"
            )
            if expected_params:
                print(
                    f"Parameter Success: {parameter_correct}/{iterations} ({param_success_rate:.1f}%)"
                )
                print(f"Expected params: {expected_params}")

            # Print timing analysis
            print(f"Timing Analysis:")
            print(
                f"  Average: {timing_stats.mean:.2f}s | Median: {timing_stats.median:.2f}s"
            )
            print(f"  Range: {timing_stats.min:.2f}s - {timing_stats.max:.2f}s")
            print(f"  Performance: {timing_stats.performance_category}")
            print(f"  Consistency: {timing_stats.consistency_score:.1f}/100")

            if secondary_action_counts:
                print(f"Secondary actions triggered:")
                for action, count in secondary_action_counts.items():
                    print(f"  - {action}: {count}/{iterations} times")
            print("-" * 40)

        overall_success_rate = (
            (total_correct / total_tests) * 100 if total_tests > 0 else 0
        )

        # Calculate overall timing statistics across all phrases
        all_times = []
        for phrase_results in results.values():
            all_times.extend(phrase_results["timing_stats"]["raw_times"])
        overall_timing = TimingStats(all_times)

        print(
            f"Overall Success Rate: {total_correct}/{total_tests} ({overall_success_rate:.1f}%)"
        )
        print(
            f"Overall Timing: {overall_timing.mean:.2f}s avg, {overall_timing.performance_category}, {overall_timing.consistency_score:.1f}/100 consistency"
        )

        # An action passes when the target was selected in at least 60% of runs.
        test_passed = overall_success_rate >= 60.0
        return test_passed, {
            "action_name": action_name,
            "action_description": action_info.get("description", "No description"),
            "total_correct": total_correct,
            "total_tests": total_tests,
            "success_rate": overall_success_rate,
            "phrase_results": results,
            "overall_timing_stats": overall_timing.to_dict(),
        }

    def run_all_tests(self, iterations: int = 1) -> bool:
        """Run all vibe tests for all actions that have test phrases.

        Also saves an HTML report via VibeTestReportGenerator and prints a
        per-action and overall summary with timing information.

        Args:
            iterations: Number of iterations per phrase

        Returns:
            True if all tests passed, False otherwise
        """
        print(
            f"🧪 Running vibe tests with multi-action support, timing analysis, and visual reporting"
        )
        print(f"Chat model: {self.model}")
        if self.analysis_model != self.model:
            print(f"Analysis model: {self.analysis_model}")
        else:
            print("Using same model for analysis and chat")
        print(f"Analysis mode: Multi-action (target must be selected)")
        print(f"Iterations: {iterations}")
        print("=" * 80)

        # Check prerequisites
        if not self.check_prerequisites():
            return False

        print(f"✅ Using chat model: {self.model}")
        if self.analysis_model != self.model:
            print(f"✅ Using analysis model: {self.analysis_model}")
        print(
            f"🧠 Testing AI's ability to select appropriate actions (multiple allowed)..."
        )
        print(f"⏱️ Including timing analysis for performance insights...")
        print(
            f"📋 Found {len(self.actions_with_tests)} actions with vibe test phrases\n"
        )

        if not self.actions_with_tests:
            print("❌ No actions with vibe test phrases found!")
            return False

        # Run tests for each action
        test_results = {}
        all_tests_passed = True
        overall_test_start = time.perf_counter()

        for action_name, action_info in self.actions_with_tests.items():
            test_phrases = action_info["vibe_test_phrases"]

            if not test_phrases:
                print(f"⚠️ Skipping {action_name} - no test phrases defined")
                continue

            test_passed, results = self.run_action_test(
                action_name, action_info, test_phrases, iterations
            )

            test_results[action_name] = {"passed": test_passed, "results": results}

            if not test_passed:
                all_tests_passed = False

        overall_test_time = time.perf_counter() - overall_test_start

        # Store results for report generation
        self.all_test_results = test_results

        # Generate and save the HTML report using the report generator
        report_generator = VibeTestReportGenerator(self.model, self.analysis_model)
        filename = report_generator.save_report(test_results)
        # Fixed: report the actual saved path instead of a hard-coded placeholder.
        print(f"\n📊 Report saved to: {filename}")
        print(
            f"   Open in your browser to view interactive charts with timing analysis"
        )

        # Final results summary with timing
        print(f"\n📊 Final Test Results:")
        print("=" * 50)

        fastest_action = None
        slowest_action = None
        fastest_time = float("inf")
        slowest_time = 0.0

        for action_name, test_data in test_results.items():
            status_icon = "✅ PASSED" if test_data["passed"] else "❌ FAILED"
            success_rate = test_data["results"]["success_rate"]
            avg_time = test_data["results"]["overall_timing_stats"]["mean"]
            consistency = test_data["results"]["overall_timing_stats"][
                "consistency_score"
            ]

            print(f"{action_name} Action Test: {status_icon} ({success_rate:.1f}%)")
            print(
                f"  Performance: {avg_time:.2f}s avg, {consistency:.1f}/100 consistency"
            )

            # Track fastest/slowest action for the performance-range summary.
            if avg_time < fastest_time:
                fastest_time = avg_time
                fastest_action = action_name
            if avg_time > slowest_time:
                slowest_time = avg_time
                slowest_action = action_name

        status_icon = "✅" if all_tests_passed else "❌"
        status_text = "ALL TESTS PASSED" if all_tests_passed else "SOME TESTS FAILED"
        print(f"\nOverall Result: {status_icon} {status_text}")
        print(f"Total Test Duration: {overall_test_time:.2f}s")

        if fastest_action and slowest_action:
            print(
                f"Performance Range: {fastest_action} ({fastest_time:.2f}s) → {slowest_action} ({slowest_time:.2f}s)"
            )

        if not all_tests_passed:
            print("\n💡 Tips for improving results:")
            print("   • Try a different model with --model")
            print("   • Try a different analysis model with --analysis-model")
            print("   • Use a smaller, faster model for analysis (e.g., gemma2:2b)")
            print("   • Increase iterations with -n for better statistics")
            print("   • Ensure Ollama server is running optimally")
            print("   • Check action descriptions and test phrases for clarity")

        return all_tests_passed

    def run_quick_test(self) -> bool:
        """Run a quick single-iteration test for fast feedback."""
        print("🚀 Running quick vibe test (1 iteration each)...")
        return self.run_all_tests(iterations=1)

    def run_statistical_test(self, iterations: int = 5) -> bool:
        """Run a statistical test with multiple iterations."""
        print(f"📊 Running statistical vibe test ({iterations} iterations each)...")
        return self.run_all_tests(iterations=iterations)
def run_vibe_tests(
    model: str = "gemma3:4b", iterations: int = 1, analysis_model: str = None
) -> bool:
    """Convenience wrapper: build a VibeTestRunner and run every vibe test.

    Args:
        model: The model to use for testing
        iterations: Number of iterations per test
        analysis_model: Optional separate model for action analysis (defaults to main model)

    Returns:
        True if all tests passed, False otherwise
    """
    return VibeTestRunner(model=model, analysis_model=analysis_model).run_all_tests(
        iterations=iterations
    )