From ec9c13911d3a208d39bf60c9bfb9ba8c852e92b0 Mon Sep 17 00:00:00 2001 From: Samay10 Date: Wed, 15 Apr 2026 23:09:05 +0530 Subject: [PATCH 1/3] fix: accept fenced JSON output in raw JSON mode for OpenAI-compatible backends --- langextract/core/format_handler.py | 22 ++++++++++++++++++++++ tests/format_handler_test.py | 18 ++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/langextract/core/format_handler.py b/langextract/core/format_handler.py index 40993c24..77ce47cd 100644 --- a/langextract/core/format_handler.py +++ b/langextract/core/format_handler.py @@ -289,6 +289,28 @@ def _extract_content(self, text: str) -> str: blocks found. """ if not self.use_fences: + matches = list(_FENCE_RE.finditer(text)) + + if matches: + valid_tags = { + data.FormatType.YAML: {_YAML_FORMAT, _YML_FORMAT}, + data.FormatType.JSON: {_JSON_FORMAT}, + } + candidates = [ + m + for m in matches + if self._is_valid_language_tag(m.group('lang'), valid_tags) + ] + + if len(candidates) == 1: + return candidates[0].group('body').strip() + if len(candidates) > 1: + raise exceptions.FormatParseError( + 'Multiple fenced blocks found. Expected exactly one.' + ) + if not self.strict_fences and len(matches) == 1: + return matches[0].group('body').strip() + return text.strip() matches = list(_FENCE_RE.finditer(text)) diff --git a/tests/format_handler_test.py b/tests/format_handler_test.py index 01e42508..537adce0 100644 --- a/tests/format_handler_test.py +++ b/tests/format_handler_test.py @@ -273,6 +273,24 @@ def test_think_tags_stripped_before_parsing(self): self.assertLen(parsed, 1) self.assertEqual(parsed[0]["person"], "Alice") + def test_fenced_json_accepted_when_fences_disabled(self): + # Some OpenAI-compatible backends may return fenced JSON even when raw + # JSON mode is expected. + handler = format_handler.FormatHandler( + format_type=data.FormatType.JSON, + use_wrapper=True, + wrapper_key="extractions", + use_fences=False, + ) + fenced_json = textwrap.dedent(""" + ```json + {"extractions": [{"person": "Alice"}]} + ``` + """).strip() + parsed = handler.parse_output(fenced_json) + self.assertLen(parsed, 1) + self.assertEqual(parsed[0]["person"], "Alice") + def test_top_level_list_accepted_as_fallback(self): # Some models return [...] instead of {"extractions": [...]} handler = format_handler.FormatHandler( From b5d6af7136c22e4cc369bb77af7cf701e4eb964e Mon Sep 17 00:00:00 2001 From: Samay10 Date: Wed, 15 Apr 2026 23:19:00 +0530 Subject: [PATCH 2/3] chore: apply formatting fixes to match CI checks --- docs/examples/japanese_extraction.md | 2 +- langextract/core/format_handler.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/examples/japanese_extraction.md b/docs/examples/japanese_extraction.md index fd4d26e6..a92a9c3a 100644 --- a/docs/examples/japanese_extraction.md +++ b/docs/examples/japanese_extraction.md @@ -51,7 +51,7 @@ for entity in result.extractions: if entity.char_interval: start, end = entity.char_interval.start_pos, entity.char_interval.end_pos position_info = f" (pos: {start}-{end})" - + print(f"• {entity.extraction_class}: {entity.extraction_text}{position_info}") # Expected Output: diff --git a/langextract/core/format_handler.py b/langextract/core/format_handler.py index 77ce47cd..d41c7482 100644 --- a/langextract/core/format_handler.py +++ b/langextract/core/format_handler.py @@ -299,17 +299,17 @@ def _extract_content(self, text: str) -> str: candidates = [ m for m in matches - if self._is_valid_language_tag(m.group('lang'), valid_tags) + if self._is_valid_language_tag(m.group("lang"), valid_tags) ] if len(candidates) == 1: - return candidates[0].group('body').strip() + return candidates[0].group("body").strip() if len(candidates) > 1: raise exceptions.FormatParseError( - 'Multiple fenced blocks found. Expected exactly one.' + "Multiple fenced blocks found. Expected exactly one." ) if not self.strict_fences and len(matches) == 1: - return matches[0].group('body').strip() + return matches[0].group("body").strip() return text.strip() From ace8d09840b4a282923fa86ac1a6d5292caa7861 Mon Sep 17 00:00:00 2001 From: Samay10 Date: Wed, 15 Apr 2026 23:30:58 +0530 Subject: [PATCH 3/3] fix: reduce return statements in _extract_content to pass pylint R0911 - Precompute strip_text to consolidate duplicate returns (7->6) - Fixes lint-src CI failure for #414 --- langextract/core/format_handler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/langextract/core/format_handler.py b/langextract/core/format_handler.py index d41c7482..477356e7 100644 --- a/langextract/core/format_handler.py +++ b/langextract/core/format_handler.py @@ -288,6 +288,7 @@ def _extract_content(self, text: str) -> str: FormatParseError: When fences required but not found or multiple blocks found. """ + strip_text = text.strip() if not self.use_fences: matches = list(_FENCE_RE.finditer(text)) @@ -311,7 +312,7 @@ def _extract_content(self, text: str) -> str: if not self.strict_fences and len(matches) == 1: return matches[0].group("body").strip() - return text.strip() + return strip_text matches = list(_FENCE_RE.finditer(text)) @@ -352,7 +353,7 @@ def _extract_content(self, text: str) -> str: f"No {self.format_type.value} code block found." ) - return text.strip() + return strip_text # ---- Backward compatibility methods (to be removed in v2.0.0) ----