diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 816ff856..0e2a6683 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -14,4 +14,5 @@
 - [Evandro Franco](https://github.com/evandrofranco)
 - [Sanghwa Na](https://github.com/didhd)
 - [Neelam Koshiya](https://github.com/neelamkoshiya)
-- [Asif Mithawala](https://github.com/asifma)
\ No newline at end of file
+- [Asif Mithawala](https://github.com/asifma)
+- [Madhu Nunna](https://github.com/madhununna)
\ No newline at end of file
diff --git a/python/06-evaluate/dataset-generation/03-dataset-generation.ipynb b/python/06-evaluate/dataset-generation/03-dataset-generation.ipynb
index 7a83da46..94225cd9 100644
--- a/python/06-evaluate/dataset-generation/03-dataset-generation.ipynb
+++ b/python/06-evaluate/dataset-generation/03-dataset-generation.ipynb
@@ -55,14 +55,14 @@
     "|:---------|:-------|:---------|\n",
     "| From Scratch | `from_scratch_async()` | New agents, broad coverage, exploratory testing |\n",
     "| From Context | `from_context_async()` | Testing specific tools, API integration scenarios |\n",
-    "| Update Existing | `update_current_dataset_async()` | Adding edge cases, iterative improvement |\n",
+    "| Update Existing | `update_current_experiment_async()` | Adding edge cases, iterative improvement |\n",
     "\n",
     "#### Dataset Persistence\n",
     "\n",
     "| Operation | Method | Use Case |\n",
     "|:----------|:-------|:---------|\n",
     "| Save | `dataset.to_file('name.json')` | Preserve for reuse, version control |\n",
-    "| Load | `Dataset.from_file('name.json')` | Consistent evaluation, team sharing |"
+    "| Load | `Experiment.from_file('name.json')` | Consistent evaluation, team sharing |"
    ]
   },
   {
@@ -112,8 +112,8 @@
     "from strands.multiagent import GraphBuilder\n",
     "\n",
     "# Strands Evals imports\n",
-    "from strands_evals import Dataset, Case\n",
-    "from strands_evals.generators import DatasetGenerator\n",
+    "from strands_evals import Experiment, Case\n",
+    "from strands_evals.generators import ExperimentGenerator\n",
     "from strands_evals.evaluators import OutputEvaluator\n",
     "\n",
     "# Display utilities\n",
@@ -179,7 +179,7 @@
    "outputs": [],
    "source": [
     "# Initialize dataset generator\n",
-    "generator = DatasetGenerator(\n",
+    "generator = ExperimentGenerator(\n",
     "    input_type=str,\n",
     "    output_type=str,\n",
     "    include_expected_output=True,\n",
@@ -350,7 +350,7 @@
    "source": [
     "### Strategy 3: Update Existing Dataset with Edge Cases\n",
     "\n",
-    "The `update_current_dataset_async()` method extends an existing dataset by adding new test cases. This is ideal for iteratively improving test coverage by adding edge cases, corner scenarios, or addressing gaps discovered in production.\n",
+    "The `update_current_experiment_async()` method extends an existing dataset by adding new test cases. This is ideal for iteratively improving test coverage by adding edge cases, corner scenarios, or addressing gaps discovered in production.\n",
     "\n",
     "#### Key Features\n",
     "- **Incremental improvement**: Add tests without starting from scratch\n",
@@ -375,7 +375,7 @@
    "outputs": [],
    "source": [
     "# Load existing dataset from JSON\n",
-    "loaded_dataset = Dataset.from_file('scratch_dataset.json')"
+    "loaded_dataset = Experiment.from_file('scratch_dataset.json')"
    ]
   },
   {
@@ -396,7 +396,7 @@
     "\"\"\"\n",
     "\n",
     "# Update dataset by adding edge cases\n",
-    "updated_dataset = await generator.update_current_dataset_async(\n",
+    "updated_dataset = await generator.update_current_experiment_async(\n",
     "    source_dataset=loaded_dataset,\n",
     "    task_description=\"Multi-agent decision system handling complex and edge case scenarios\",\n",
     "    num_cases=6,\n",
@@ -407,7 +407,7 @@
     "print(f\"\\nOriginal dataset: {len(loaded_dataset.cases)} cases\")\n",
     "print(f\"Updated dataset: {len(updated_dataset.cases)} cases\")\n",
     "print(f\"New cases added: {len(updated_dataset.cases) - len(loaded_dataset.cases)}\")\n",
-    "print(f\"\\nUpdated rubric: {updated_dataset.evaluator.rubric}\")"
+    "print(f\"\\nUpdated rubric: {updated_dataset.evaluators[0].rubric}\")"
    ]
   },
   {
@@ -483,9 +483,9 @@
    "outputs": [],
    "source": [
     "# Use first 3 cases for demonstration\n",
-    "eval_dataset = Dataset(\n",
+    "eval_dataset = Experiment(\n",
     "    cases=context_dataset.cases[:3],\n",
-    "    evaluator=context_dataset.evaluator\n",
+    "    evaluators=[context_dataset.evaluators[0]]\n",
     ")\n",
     "\n",
     "report = eval_dataset.run_evaluations(agent_task)"
@@ -544,7 +544,7 @@
     "- Use case: Context-aware testing\n",
     "\n",
     "**3. updated_dataset.json**\n",
-    "- Strategy: update_current_dataset_async()\n",
+    "- Strategy: update_current_experiment_async()\n",
     "- Source: scratch_dataset.json + edge cases\n",
     "- Test cases: 15 (original 9 + 6 new)\n",
     "- Use case: Iterative improvement with edge cases\n",
@@ -553,7 +553,7 @@
     "\n",
     "```python\n",
     "# Load any saved dataset\n",
-    "dataset = Dataset.from_file('dataset_name.json')\n",
+    "dataset = Experiment.from_file('dataset_name.json')\n",
     "\n",
     "# Run evaluation\n",
     "report = dataset.run_evaluations(agent_task)\n",
@@ -583,7 +583,7 @@
     "|:---------|:---------|\n",
     "| `from_scratch_async()` | Starting new project, need broad coverage, no detailed context yet |\n",
     "| `from_context_async()` | Have well-defined tools/APIs, need tests matching actual capabilities |\n",
-    "| `update_current_dataset_async()` | Improving existing dataset, discovered gaps, adding edge cases |\n",
+    "| `update_current_experiment_async()` | Improving existing dataset, discovered gaps, adding edge cases |\n",
     "\n",
     "### Key Recommendations\n",
     "\n",
@@ -605,9 +605,9 @@
     "\n",
     "- How to generate test cases from scratch using topics with `from_scratch_async()`\n",
     "- How to generate contextual test cases from agent capabilities with `from_context_async()`\n",
-    "- How to update existing datasets with edge cases using `update_current_dataset_async()`\n",
+    "- How to update existing datasets with edge cases using `update_current_experiment_async()`\n",
     "- How to save datasets to JSON files with `dataset.to_file()`\n",
-    "- How to load datasets from JSON files with `Dataset.from_file()`\n",
+    "- How to load datasets from JSON files with `Experiment.from_file()`\n",
     "- How to use auto-rubric generation for evaluators\n",
     "- How to apply topic planning for diverse test coverage\n",
     "- Best practices for choosing generation strategies\n",
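Reviewer note: the hunks above rename `Dataset`/`DatasetGenerator` to `Experiment`/`ExperimentGenerator`, rename `update_current_dataset_async()` to `update_current_experiment_async()`, and replace the singular `evaluator` attribute with an `evaluators` list. A minimal sketch of the renamed surface, built only from calls shown in this diff (top-level `await` assumes notebook-style execution; `scratch_dataset.json` is the artifact the notebook saves earlier):

```python
from strands_evals import Experiment
from strands_evals.generators import ExperimentGenerator

# Generator construction is unchanged apart from the class name.
generator = ExperimentGenerator(
    input_type=str,
    output_type=str,
    include_expected_output=True,
)

# Loading now goes through Experiment rather than Dataset.
loaded_dataset = Experiment.from_file('scratch_dataset.json')

# The update entry point gains the matching "experiment" name.
updated_dataset = await generator.update_current_experiment_async(
    source_dataset=loaded_dataset,
    task_description="Multi-agent decision system handling complex and edge case scenarios",
    num_cases=6,
)

# Evaluators are now a list, so rubric access is indexed.
print(updated_dataset.evaluators[0].rubric)
```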
diff --git a/python/06-evaluate/multi-agent-evaluation/06-multi-agent-evaluation.ipynb b/python/06-evaluate/multi-agent-evaluation/06-multi-agent-evaluation.ipynb
index 70381476..0b5f0d05 100644
--- a/python/06-evaluate/multi-agent-evaluation/06-multi-agent-evaluation.ipynb
+++ b/python/06-evaluate/multi-agent-evaluation/06-multi-agent-evaluation.ipynb
@@ -120,7 +120,7 @@
     "from strands import Agent, tool\n",
     "\n",
     "# Strands Evals imports\n",
-    "from strands_evals import Dataset, Case\n",
+    "from strands_evals import Experiment, Case\n",
     "from strands_evals.evaluators import OutputEvaluator, ToolSelectionAccuracyEvaluator, InteractionsEvaluator\n",
     "from strands_evals.extractors import tools_use_extractor\n",
     "from strands_evals.types import Interaction\n",
@@ -582,44 +582,44 @@
     "    return {\"output\": str(response), \"trajectory\": session}\n",
     "\n",
     "# Create 8 datasets (4 agents x 2 evaluators each)\n",
-    "tech_output_dataset = Dataset(\n",
+    "tech_output_dataset = Experiment(\n",
     "    cases=[individual_test_cases[0]],\n",
-    "    evaluator=output_evaluator\n",
+    "    evaluators=[output_evaluator]\n",
     ")\n",
     "\n",
-    "tech_tool_dataset = Dataset(\n",
+    "tech_tool_dataset = Experiment(\n",
     "    cases=[individual_test_cases[0]],\n",
-    "    evaluator=tool_selection_evaluator\n",
+    "    evaluators=[tool_selection_evaluator]\n",
     ")\n",
     "\n",
-    "billing_output_dataset = Dataset(\n",
+    "billing_output_dataset = Experiment(\n",
     "    cases=[individual_test_cases[1]],\n",
-    "    evaluator=output_evaluator\n",
+    "    evaluators=[output_evaluator]\n",
     ")\n",
     "\n",
-    "billing_tool_dataset = Dataset(\n",
+    "billing_tool_dataset = Experiment(\n",
     "    cases=[individual_test_cases[1]],\n",
-    "    evaluator=tool_selection_evaluator\n",
+    "    evaluators=[tool_selection_evaluator]\n",
     ")\n",
     "\n",
-    "product_output_dataset = Dataset(\n",
+    "product_output_dataset = Experiment(\n",
     "    cases=[individual_test_cases[2]],\n",
-    "    evaluator=output_evaluator\n",
+    "    evaluators=[output_evaluator]\n",
    ")\n",
     "\n",
-    "product_tool_dataset = Dataset(\n",
+    "product_tool_dataset = Experiment(\n",
     "    cases=[individual_test_cases[2]],\n",
-    "    evaluator=tool_selection_evaluator\n",
+    "    evaluators=[tool_selection_evaluator]\n",
     ")\n",
     "\n",
-    "returns_output_dataset = Dataset(\n",
+    "returns_output_dataset = Experiment(\n",
     "    cases=[individual_test_cases[3]],\n",
-    "    evaluator=output_evaluator\n",
+    "    evaluators=[output_evaluator]\n",
     ")\n",
     "\n",
-    "returns_tool_dataset = Dataset(\n",
+    "returns_tool_dataset = Experiment(\n",
     "    cases=[individual_test_cases[3]],\n",
-    "    evaluator=tool_selection_evaluator\n",
+    "    evaluators=[tool_selection_evaluator]\n",
     ")\n",
     "\n",
     "print(\"Evaluating individual agent performance...\\n\")\n",
@@ -779,9 +779,9 @@
     "    return str(response)\n",
     "\n",
     "# Create dataset for system evaluation\n",
-    "system_dataset = Dataset(\n",
+    "system_dataset = Experiment(\n",
     "    cases=system_test_cases,\n",
-    "    evaluator=system_output_evaluator\n",
+    "    evaluators=[system_output_evaluator]\n",
     ")\n",
     "\n",
     "print(\"Evaluating complete system performance...\\n\")\n",
@@ -954,9 +954,9 @@
    "outputs": [],
    "source": [
     "# Create dataset for coordination evaluation\n",
-    "coordination_dataset = Dataset(\n",
+    "coordination_dataset = Experiment(\n",
     "    cases=system_test_cases,\n",
-    "    evaluator=interaction_evaluator\n",
+    "    evaluators=[interaction_evaluator]\n",
     ")\n",
     "\n",
     "print(\"Evaluating agent coordination quality...\\n\")\n",
@@ -1200,4 +1200,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
\ No newline at end of file
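Reviewer note: the eight single-case `Experiment` blocks in the hunk above are mechanical repetitions of one pattern, which is why this rename touches so many lines. Purely as a possible follow-up, not part of this change, they could be collapsed into a comprehension; a sketch assuming the `individual_test_cases`, `output_evaluator`, and `tool_selection_evaluator` objects defined in earlier notebook cells (`agent_names`, `evaluator_map`, and `experiments` are hypothetical names):

```python
from strands_evals import Experiment

# Hypothetical refactor: build the 4 agents x 2 evaluators grid in one pass
# instead of eight literal assignments.
agent_names = ["tech", "billing", "product", "returns"]
evaluator_map = {"output": output_evaluator, "tool": tool_selection_evaluator}

experiments = {
    f"{agent}_{kind}_dataset": Experiment(
        cases=[individual_test_cases[i]],
        evaluators=[evaluator],
    )
    for i, agent in enumerate(agent_names)
    for kind, evaluator in evaluator_map.items()
}
```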
diff --git a/python/06-evaluate/multi-turn-actor-simulator/05-multi-turn-actor-simulator.ipynb b/python/06-evaluate/multi-turn-actor-simulator/05-multi-turn-actor-simulator.ipynb
index 0cadb327..f1c33a71 100644
--- a/python/06-evaluate/multi-turn-actor-simulator/05-multi-turn-actor-simulator.ipynb
+++ b/python/06-evaluate/multi-turn-actor-simulator/05-multi-turn-actor-simulator.ipynb
@@ -92,7 +92,7 @@
     "from strands.models import BedrockModel\n",
     "\n",
     "# Strands Evals imports\n",
-    "from strands_evals import Dataset, Case\n",
+    "from strands_evals import Experiment, Case\n",
     "from strands_evals.simulation import ActorSimulator\n",
     "\n",
     "# Display utilities\n",
@@ -677,7 +677,7 @@
     "    )\n",
     "]\n",
     "\n",
-    "dataset = Dataset(\n",
+    "dataset = Experiment(\n",
     "    cases=evaluation_cases\n",
     ")"
    ]
diff --git a/python/07-ux-demos/streamlit-template/docker_app/app_streaming.py b/python/07-ux-demos/streamlit-template/docker_app/app_streaming.py
index b6cd1fc3..39e5f311 100644
--- a/python/07-ux-demos/streamlit-template/docker_app/app_streaming.py
+++ b/python/07-ux-demos/streamlit-template/docker_app/app_streaming.py
@@ -15,6 +15,10 @@
 
 if "messages" not in st.session_state:
     st.session_state.messages = []
 
+# Initialize details placeholder
+if "details_placeholder" not in st.session_state:
+    st.session_state.details_placeholder = st.empty()
+
 # ID of Secrets Manager containing cognito parameters
 secrets_manager_id = Config.SECRETS_MANAGER_ID
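Reviewer note on the `app_streaming.py` hunk: `st.empty()` reserves a single layout slot whose content later writes replace rather than append, which suits streaming agent status output. A standalone sketch of that placeholder pattern (hypothetical labels and timings, not taken from the template):

```python
import time

import streamlit as st

# Reserve one slot; each write below overwrites the previous content in place.
placeholder = st.empty()

for step in ["calling agent...", "running tools...", "done"]:
    placeholder.markdown(f"**Status:** {step}")
    time.sleep(0.5)

# The slot can also be cleared outright.
placeholder.empty()
```

One caveat worth a quick manual check: Streamlit rebuilds the page layout on every rerun, so a placeholder cached in `st.session_state` can end up referring to a slot from a previous run; re-creating the placeholder on each run is the more common pattern, and verifying the guarded initialization above behaves as intended across reruns would be prudent.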