diff --git a/docs/examples/japanese_extraction.md b/docs/examples/japanese_extraction.md index fd4d26e6..a92a9c3a 100644 --- a/docs/examples/japanese_extraction.md +++ b/docs/examples/japanese_extraction.md @@ -51,7 +51,7 @@ for entity in result.extractions: if entity.char_interval: start, end = entity.char_interval.start_pos, entity.char_interval.end_pos position_info = f" (pos: {start}-{end})" - + print(f"• {entity.extraction_class}: {entity.extraction_text}{position_info}") # Expected Output: diff --git a/langextract/core/format_handler.py b/langextract/core/format_handler.py index 40993c24..477356e7 100644 --- a/langextract/core/format_handler.py +++ b/langextract/core/format_handler.py @@ -288,8 +288,31 @@ def _extract_content(self, text: str) -> str: FormatParseError: When fences required but not found or multiple blocks found. """ + strip_text = text.strip() if not self.use_fences: - return text.strip() + matches = list(_FENCE_RE.finditer(text)) + + if matches: + valid_tags = { + data.FormatType.YAML: {_YAML_FORMAT, _YML_FORMAT}, + data.FormatType.JSON: {_JSON_FORMAT}, + } + candidates = [ + m + for m in matches + if self._is_valid_language_tag(m.group("lang"), valid_tags) + ] + + if len(candidates) == 1: + return candidates[0].group("body").strip() + if len(candidates) > 1: + raise exceptions.FormatParseError( + "Multiple fenced blocks found. Expected exactly one." + ) + if not self.strict_fences and len(matches) == 1: + return matches[0].group("body").strip() + + return strip_text matches = list(_FENCE_RE.finditer(text)) @@ -330,7 +353,7 @@ def _extract_content(self, text: str) -> str: f"No {self.format_type.value} code block found." ) - return text.strip() + return strip_text # ---- Backward compatibility methods (to be removed in v2.0.0) ---- diff --git a/tests/format_handler_test.py b/tests/format_handler_test.py index 01e42508..537adce0 100644 --- a/tests/format_handler_test.py +++ b/tests/format_handler_test.py @@ -273,6 +273,24 @@ def test_think_tags_stripped_before_parsing(self): self.assertLen(parsed, 1) self.assertEqual(parsed[0]["person"], "Alice") + def test_fenced_json_accepted_when_fences_disabled(self): + # Some OpenAI-compatible backends may return fenced JSON even when raw + # JSON mode is expected. + handler = format_handler.FormatHandler( + format_type=data.FormatType.JSON, + use_wrapper=True, + wrapper_key="extractions", + use_fences=False, + ) + fenced_json = textwrap.dedent(""" + ```json + {"extractions": [{"person": "Alice"}]} + ``` + """).strip() + parsed = handler.parse_output(fenced_json) + self.assertLen(parsed, 1) + self.assertEqual(parsed[0]["person"], "Alice") + def test_top_level_list_accepted_as_fallback(self): # Some models return [...] instead of {"extractions": [...]} handler = format_handler.FormatHandler(