|
| 1 | +import asyncio |
| 2 | +import json |
| 3 | +import re |
| 4 | +import traceback |
| 5 | +import warnings |
| 6 | +from typing import Any, List |
| 7 | + |
| 8 | +from transformers import AutoTokenizer |
| 9 | + |
| 10 | +from lagent.actions import AsyncActionMixin, BaseAction |
| 11 | +from lagent.schema import ActionStatusCode, ActionValidCode, AgentMessage |
| 12 | +from lagent.utils import create_object |
| 13 | + |
| 14 | + |
| 15 | +def extract_last_json(text: str) -> dict | None: |
| 16 | + """ |
| 17 | + Extracts the last valid JSON object from a string. |
| 18 | + Handles Markdown code blocks (```json ... ```) and raw JSON strings. |
| 19 | + """ |
| 20 | + try: |
| 21 | + # 1. Try to find JSON within Markdown code blocks first |
| 22 | + # Look for ```json ... ``` or just ``` ... ``` |
| 23 | + code_block_pattern = re.compile(r'```(?:json)?\s*(\{.*?\})\s*```', re.DOTALL) |
| 24 | + matches = code_block_pattern.findall(text) |
| 25 | + if matches: |
| 26 | + return json.loads(matches[-1]) |
| 27 | + |
| 28 | + # 2. If no code blocks, try to find the last outermost pair of braces |
| 29 | + # This regex looks for { ... } lazily but we want the last one. |
| 30 | + # A simple approach for nested JSON is tricky with regex, |
| 31 | + # so we scan from right to left for the last '}' and find its matching '{'. |
| 32 | + |
| 33 | + stack, end_idx = 0, -1 |
| 34 | + # Reverse search to find the last valid JSON structure |
| 35 | + for i in range(len(text) - 1, -1, -1): |
| 36 | + char = text[i] |
| 37 | + if char == '}': |
| 38 | + if stack == 0: |
| 39 | + end_idx = i |
| 40 | + stack += 1 |
| 41 | + elif char == '{': |
| 42 | + if stack > 0: |
| 43 | + stack -= 1 |
| 44 | + if stack == 0 and end_idx != -1: |
| 45 | + # Found a potential outermost JSON object |
| 46 | + candidate = text[i : end_idx + 1] |
| 47 | + try: |
| 48 | + return json.loads(candidate) |
| 49 | + except json.JSONDecodeError: |
| 50 | + # If this chunk isn't valid, reset and keep searching backwards |
| 51 | + # (or you might decide to stop here depending on strictness) |
| 52 | + stack, end_idx = 0, -1 |
| 53 | + return None |
| 54 | + except Exception: |
| 55 | + return None |
| 56 | + |
| 57 | + |
class WebVisitor(AsyncActionMixin, BaseAction):
    """Visit webpage(s) and return an LLM-generated summary of their content.

    Workflow per URL:
      1. Fetch the raw page with ``browse_tool`` (up to ``max_browse_attempts``
         tries, sleeping ``sleep_interval`` seconds between failures).
      2. Optionally truncate the fetched text (token-level when a tokenizer is
         configured, otherwise character-level).
      3. Ask ``llm`` to extract a JSON object with ``rational``, ``evidence``
         and ``summary`` fields (up to ``max_extract_attempts`` tries).
      4. Render the evidence and summary into a plain-text report.
    """

    # Prompt template for step 3; {webpage_content} and {goal} are filled in
    # with str.format before every extraction attempt. Braces inside the
    # substituted VALUES are safe -- str.format only parses the template.
    EXTRACTION_PROMPT = """Please process the following webpage content and user goal to extract relevant information:

## **Webpage Content**
{webpage_content}

## **User Goal**
{goal}

## **Task Guidelines**
1. **Content Scanning for Rationale**: Locate the **specific sections/data** directly related to the user's goal within the webpage content
2. **Key Extraction for Evidence**: Identify and extract the **most relevant information** from the content, you never miss any important information, output the **full original context** of the content as far as possible, it can be more than three paragraphs.
3. **Summary Output for Summary**: Organize into a concise paragraph with logical flow, prioritizing clarity and judge the contribution of the information to the goal.

**Final Output Format using JSON format has "rational", "evidence", "summary" fields**
"""

    def __init__(
        self,
        browse_tool: BaseAction | dict,
        llm: Any,
        max_browse_attempts: int = 3,
        max_extract_attempts: int = 3,
        sleep_interval: int = 3,
        truncate_browse_response_length: int | None = None,
        tokenizer_path: str | None = None,
        name: str = 'visit',
    ):
        """Configure the visit action.

        Args:
            browse_tool: a single-tool action (or its config dict) whose only
                required argument is ``url``; used to fetch raw page content.
            llm: chat model instance or config used for information extraction.
            max_browse_attempts: max fetch retries per URL.
            max_extract_attempts: max LLM extraction retries per page.
            sleep_interval: seconds to sleep between retries.
            truncate_browse_response_length: cap on the fetched content length
                (tokens when a tokenizer is available, else characters); ``None``
                disables truncation.
            tokenizer_path: HuggingFace tokenizer used for token-level
                truncation; ``None`` falls back to character truncation.
            name: tool name exposed in the action description.

        Raises:
            ValueError: if ``browse_tool`` is a toolkit or requires arguments
                other than ``url``.
        """
        super().__init__(
            description={
                'name': name,
                'description': 'Visit webpage(s) and return the summary of the content.',
                'parameters': [
                    {
                        'name': 'url',
                        'type': ['STRING', 'ARRAY'],
                        "items": {"type": "string"},
                        "minItems": 1,
                        'description': 'The URL(s) of the webpage(s) to visit. Can be a single URL or an array of URLs.',
                    },
                    {'name': 'goal', 'type': 'STRING', 'description': 'The goal of the visit for webpage(s).'},
                ],
                'required': ['url', 'goal'],
            }
        )
        browse_tool = create_object(browse_tool)
        # Explicit raise instead of `assert` so the validation survives `-O`.
        if browse_tool.is_toolkit or browse_tool.description['required'] != ['url']:
            raise ValueError("browse_tool must be a single-tool action with only 'url' as required argument.")
        self.browse_tool = browse_tool
        self.llm = create_object(llm)
        self.max_browse_attempts = max_browse_attempts
        self.max_extract_attempts = max_extract_attempts
        self.sleep_interval = sleep_interval
        self.truncate_browse_response_length = truncate_browse_response_length
        self.tokenizer = (
            AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True) if tokenizer_path else None
        )
        if self.truncate_browse_response_length is not None and self.tokenizer is None:
            warnings.warn(
                'truncate_browse_response_length is set but tokenizer_path is not provided. '
                'The raw webpage content will be truncated by characters instead of tokens.'
            )

    async def run(self, url: str | List[str], goal: str) -> str:
        """Visit ``url`` (a single URL or a list) concurrently; join the reports.

        Per-URL failures are reported inline so one bad URL never fails the
        whole batch.
        """
        if isinstance(url, str):
            url = [url]

        async def _inner_call(single_url: str) -> str:
            try:
                return await self._read_webpage(single_url, goal)
            except Exception as e:
                # Best-effort: surface the error as text for this URL only.
                return f"Error fetching {single_url}: {str(e)}"

        response = await asyncio.gather(*[_inner_call(single_url) for single_url in url])
        return "\n=======\n".join(response).strip()

    async def _read_webpage(self, url: str, goal: str) -> str:
        """Fetch one page and return the rendered evidence/summary report.

        Falls back to explanatory placeholder text when the page cannot be
        fetched or no valid JSON is extracted within ``max_extract_attempts``.
        """

        def render(evidence: str, summary: str) -> str:
            # Build the report with f-strings only. The previous
            # template-then-str.format approach broke whenever `goal` or
            # `url` contained '{' or '}' -- the interpolated braces became
            # format fields and `.format()` raised KeyError/ValueError.
            return (
                f"The useful information in {url} for user goal {goal} as follows: \n\n"
                f"Evidence in page: \n{evidence}\n\nSummary: \n{summary}\n\n"
            )

        tool_response = compressed = None
        # --- Fetch with retries ---------------------------------------------
        for _ in range(self.max_browse_attempts):
            resp = await self.browse_tool({'url': url})
            if resp.valid == ActionValidCode.OPEN and resp.state == ActionStatusCode.SUCCESS:
                tool_response = resp.format_result()
                break
            await asyncio.sleep(self.sleep_interval)
        else:  # every attempt failed
            return render(
                evidence="The provided webpage content could not be accessed. Please check the URL or file format.",
                summary="The webpage content could not be processed, and therefore, no information is available.",
            )

        # --- Optional truncation ----------------------------------------------
        if self.truncate_browse_response_length is not None:
            tool_response = (
                self.tokenizer.decode(
                    self.tokenizer.encode(
                        tool_response,
                        max_length=self.truncate_browse_response_length,
                        truncation=True,
                        add_special_tokens=False,
                    )
                )
                if self.tokenizer is not None
                # Character-level fallback (a warning was emitted in __init__).
                else tool_response[: self.truncate_browse_response_length]
            )

        # --- LLM extraction with retries ---------------------------------------
        for _ in range(self.max_extract_attempts):
            try:
                prompt = self.EXTRACTION_PROMPT.format(webpage_content=tool_response, goal=goal)
                llm_response = await self.llm.chat([{'role': 'user', 'content': prompt}])
                if llm_response and not isinstance(llm_response, str):
                    # Unwrap AgentMessage or OpenAI-style response objects.
                    llm_response = (
                        llm_response.content
                        if isinstance(llm_response, AgentMessage)
                        else llm_response.choices[0].message.content
                    )
                if not llm_response or len(llm_response) < 10:
                    # Suspiciously short reply: shrink the input and retry,
                    # in case the page overflowed the model's context window.
                    tool_response = tool_response[: int(len(tool_response) * 0.7)]
                    continue
                compressed = extract_last_json(llm_response)
                if isinstance(compressed, dict) and all(
                    key in compressed for key in ['rational', 'evidence', 'summary']
                ):
                    break
            except Exception:
                print(f"Error in extracting information: {traceback.format_exc()}")
            await asyncio.sleep(self.sleep_interval)
        else:  # no valid JSON after all attempts
            return render(
                evidence="Failed to extract relevant information from the webpage content.",
                summary="The webpage content could not be processed, and therefore, no information is available.",
            )
        return render(evidence=compressed['evidence'], summary=compressed['summary'])
0 commit comments