Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/examples/japanese_extraction.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ for entity in result.extractions:
if entity.char_interval:
start, end = entity.char_interval.start_pos, entity.char_interval.end_pos
position_info = f" (pos: {start}-{end})"

print(f"• {entity.extraction_class}: {entity.extraction_text}{position_info}")

# Expected Output:
Expand Down
1 change: 1 addition & 0 deletions langextract/_compat/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

"""Compatibility shim for langextract.exceptions imports."""

# pylint: disable=duplicate-code

from __future__ import annotations
Expand Down
1 change: 1 addition & 0 deletions langextract/_compat/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

"""Compatibility shim for langextract.registry imports."""

# pylint: disable=duplicate-code

from __future__ import annotations
Expand Down
1 change: 1 addition & 0 deletions langextract/_compat/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

"""Compatibility shim for langextract.schema imports."""

# pylint: disable=duplicate-code

from __future__ import annotations
Expand Down
1 change: 1 addition & 0 deletions langextract/core/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

"""Base interfaces for language models."""

from __future__ import annotations

import abc
Expand Down
1 change: 1 addition & 0 deletions langextract/core/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

"""Classes used to represent core data types of annotation pipeline."""

from __future__ import annotations

import dataclasses
Expand Down
1 change: 1 addition & 0 deletions langextract/core/debug_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

"""Debug utilities for LangExtract."""

from __future__ import annotations

import functools
Expand Down
1 change: 1 addition & 0 deletions langextract/core/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

"""Core schema abstractions for LangExtract."""

from __future__ import annotations

import abc
Expand Down
1 change: 1 addition & 0 deletions langextract/core/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

"""Core data types for LangExtract."""

from __future__ import annotations

import dataclasses
Expand Down
1 change: 1 addition & 0 deletions langextract/data_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

"""Library for data conversion between AnnotatedDocument and JSON."""

from __future__ import annotations

import dataclasses
Expand Down
1 change: 1 addition & 0 deletions langextract/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
This module re-exports exceptions from core.exceptions for backward compatibility.
All new code should import directly from langextract.core.exceptions.
"""

# pylint: disable=duplicate-code

from __future__ import annotations
Expand Down
1 change: 1 addition & 0 deletions langextract/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

"""Supports Input and Output Operations for Data Annotations."""

from __future__ import annotations

import abc
Expand Down
1 change: 1 addition & 0 deletions langextract/plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
This module provides centralized provider discovery without circular imports.
It supports both built-in providers and third-party providers via entry points.
"""

from __future__ import annotations

import functools
Expand Down
1 change: 1 addition & 0 deletions langextract/progress.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

"""Progress and visualization utilities for LangExtract."""

from __future__ import annotations

from typing import Any
Expand Down
1 change: 1 addition & 0 deletions langextract/prompting.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

"""Library for building prompts."""

from __future__ import annotations

import dataclasses
Expand Down
5 changes: 5 additions & 0 deletions langextract/providers/builtin_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,9 @@ class ProviderConfig(TypedDict):
'target': 'langextract.providers.openai:OpenAILanguageModel',
'priority': patterns.OPENAI_PRIORITY,
},
{
'patterns': patterns.MINIMAX_PATTERNS,
'target': 'langextract.providers.minimax:MiniMaxLanguageModel',
'priority': patterns.MINIMAX_PRIORITY,
},
]
1 change: 1 addition & 0 deletions langextract/providers/gemini.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

"""Gemini provider for LangExtract."""

# pylint: disable=duplicate-code

from __future__ import annotations
Expand Down
228 changes: 228 additions & 0 deletions langextract/providers/minimax.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
# Copyright 2025 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""MiniMax provider for LangExtract.

This provider uses MiniMax's OpenAI-compatible API to extract structured
information from text.

Usage:
# Using factory
import langextract as lx
from langextract.factory import ModelConfig, create_model

config = ModelConfig(
model_id="MiniMax-M2.5",
provider="MiniMaxLanguageModel",
provider_kwargs={
"api_key": "your-minimax-api-key"
}
)
model = create_model(config)

result = lx.extract(
text_or_documents=text,
prompt_description=instructions,
model=model
)
"""

from __future__ import annotations

import asyncio
from collections.abc import Sequence
import concurrent.futures
import dataclasses
import json
from typing import Any

from langextract.core import base_model
from langextract.core import data
from langextract.providers import patterns
from langextract.providers import router

# Default value of ``MiniMaxLanguageModel.model_id``.
_DEFAULT_MODEL_ID = "MiniMax-M2.5"
# Default value of ``MiniMaxLanguageModel.base_url`` (MiniMax's
# OpenAI-compatible API endpoint).
_DEFAULT_BASE_URL = "https://api.minimax.io/v1"


@router.register(
    *patterns.MINIMAX_PATTERNS,
    priority=patterns.MINIMAX_PRIORITY,
)
@dataclasses.dataclass(init=False)
class MiniMaxLanguageModel(base_model.BaseLanguageModel):
    """Language model inference using MiniMax's OpenAI-compatible API.

    Each input text is sent as one chat-completion request through an
    ``openai.AsyncOpenAI`` client pointed at ``base_url``. The response
    content is parsed as JSON when possible and converted into
    ``base_model.ExtractionCandidate`` objects.

    NOTE(review): the class is declared with ``dataclass(init=False)``, so
    the dataclass machinery does not generate ``__init__`` and therefore
    never calls ``__post_init__`` automatically. The client is created
    lazily on first use to cover that; confirm how instances are expected
    to be constructed (presumably via the base class ``__init__``).
    """

    # Model identifier sent as ``model`` in every request.
    model_id: str = _DEFAULT_MODEL_ID
    # API key handed to the ``AsyncOpenAI`` client.
    api_key: str | None = None
    # Endpoint of the OpenAI-compatible API.
    base_url: str = _DEFAULT_BASE_URL
    # Optional organization handed to the ``AsyncOpenAI`` client.
    organization: str | None = None
    # Output format; JSON enables ``response_format={"type": "json_object"}``.
    format_type: data.FormatType = data.FormatType.JSON
    # Sampling temperature; omitted from the request when ``None``.
    temperature: float | None = None
    # Upper bound on the number of requests in flight at once.
    max_workers: int = 10
    # Lazily created ``AsyncOpenAI`` client (or a pre-injected substitute).
    _client: Any = dataclasses.field(default=None, repr=False, compare=False)
    # Extra keyword arguments forwarded to the ``AsyncOpenAI`` constructor.
    _extra_kwargs: dict[str, Any] = dataclasses.field(
        default_factory=dict, repr=False, compare=False
    )

    @property
    def requires_fence_output(self) -> bool:
        """Return ``False`` for JSON output, else defer to the base class."""
        if self.format_type == data.FormatType.JSON:
            return False
        return super().requires_fence_output

    def __post_init__(self):
        """Create the ``AsyncOpenAI`` client unless one is already set.

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        try:
            # Optional dependency: imported here so the module itself can be
            # imported without ``openai`` installed.
            from openai import AsyncOpenAI
        except ImportError as e:
            raise ImportError(
                "OpenAI package is required for MiniMax provider. "
                "Install with: pip install langextract[openai]"
            ) from e

        if self._client is None:
            # ``_extra_kwargs`` uses ``default_factory``, which leaves no
            # class-level fallback when no generated ``__init__`` has run.
            client_kwargs = getattr(self, "_extra_kwargs", None) or {}
            self._client = AsyncOpenAI(
                api_key=self.api_key,
                base_url=self.base_url,
                organization=self.organization,
                **client_kwargs,
            )

    @staticmethod
    def _parse_candidates(
        content: str,
    ) -> list[base_model.ExtractionCandidate]:
        """Convert raw response text into extraction candidates.

        A JSON list of objects yields one candidate per item (using its
        ``"text"`` and ``"class"`` keys); a JSON object yields one candidate
        per key/value pair. Anything else, including invalid JSON, falls
        back to a single candidate holding the raw text.
        """
        try:
            payload = json.loads(content)
        except json.JSONDecodeError:
            payload = None

        if isinstance(payload, list) and all(
            isinstance(item, dict) for item in payload
        ):
            return [
                base_model.ExtractionCandidate(
                    extraction_text=item.get("text", str(item)),
                    extraction_class=item.get("class", "unknown"),
                    extraction_index=index,
                )
                for index, item in enumerate(payload)
            ]

        if isinstance(payload, dict):
            return [
                base_model.ExtractionCandidate(
                    extraction_text=str(value),
                    extraction_class=key,
                    extraction_index=index,
                )
                for index, (key, value) in enumerate(payload.items())
            ]

        # Deliberate best-effort fallback: keep the raw content rather than
        # dropping a response that is not in a recognised shape.
        return [
            base_model.ExtractionCandidate(
                extraction_text=content,
                extraction_class="extracted",
                extraction_index=0,
            )
        ]

    async def _generate(
        self,
        texts: list[str],
        prompt_description: str,
        extra_params: dict[str, Any] | None = None,
    ) -> list[list[base_model.ExtractionCandidate]]:
        """Generate extractions for the given texts.

        Args:
            texts: Input texts; one request is issued per text.
            prompt_description: Instructions prepended to each text.
            extra_params: Extra keyword arguments for the chat-completion
                call. They override the ``temperature`` and
                ``response_format`` values derived from this instance.

        Returns:
            One list of candidates per input text, in input order.
        """
        extra_params = extra_params or {}

        if self._client is None:
            # ``__post_init__`` is not run automatically with ``init=False``.
            self.__post_init__()

        # Build the optional arguments once. Merging through a dict avoids a
        # duplicate-keyword TypeError when ``extra_params`` overrides them.
        request_options: dict[str, Any] = {}
        if self.format_type == data.FormatType.JSON:
            request_options["response_format"] = {"type": "json_object"}
        if self.temperature is not None:
            request_options["temperature"] = self.temperature
        request_options.update(extra_params)

        # Honour ``max_workers`` as a cap on concurrent requests.
        limiter = asyncio.Semaphore(max(1, self.max_workers))

        async def process_single(
            text: str,
        ) -> list[base_model.ExtractionCandidate]:
            async with limiter:
                response = await self._client.chat.completions.create(
                    model=self.model_id,
                    messages=[
                        {
                            "role": "system",
                            "content": (
                                "You are a helpful assistant that extracts"
                                " structured information from text."
                            ),
                        },
                        {
                            "role": "user",
                            "content": f"{prompt_description}\n\nText: {text}",
                        },
                    ],
                    **request_options,
                )

            content = response.choices[0].message.content
            if not content:
                return []
            return self._parse_candidates(content)

        # ``gather`` preserves the order of its arguments.
        return list(
            await asyncio.gather(*(process_single(text) for text in texts))
        )

    def _generate_sync(
        self,
        texts: list[str],
        prompt_description: str,
        extra_params: dict[str, Any] | None = None,
    ) -> list[list[base_model.ExtractionCandidate]]:
        """Run ``_generate`` to completion from synchronous code.

        When no event loop is running in the current thread the coroutine
        is executed with ``asyncio.run``. When a loop is already running,
        the coroutine is executed with ``asyncio.run`` on a worker thread
        and this call blocks until it finishes.

        NOTE(review): every call uses a fresh event loop while the client
        is reused; confirm the underlying HTTP client tolerates that.
        """

        def run_to_completion():
            return asyncio.run(
                self._generate(texts, prompt_description, extra_params)
            )

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running in this thread, so run directly.
            return run_to_completion()

        # ``asyncio.run`` cannot be nested inside a running loop, so hand
        # the work to a separate thread that owns its own loop.
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
            return executor.submit(run_to_completion).result()

    def __call__(
        self,
        texts: Sequence[str],
        prompt_description: str,
        extra_params: dict[str, Any] | None = None,
    ) -> list[list[base_model.ExtractionCandidate]]:
        """Synchronous interface for the model; see ``_generate``."""
        return self._generate_sync(
            list(texts), prompt_description, extra_params
        )

    async def _call_async(
        self,
        texts: Sequence[str],
        prompt_description: str,
        extra_params: dict[str, Any] | None = None,
    ) -> list[list[base_model.ExtractionCandidate]]:
        """Asynchronous interface for the model; see ``_generate``."""
        return await self._generate(
            list(texts), prompt_description, extra_params
        )

    def close(self):
        """Release client resources (currently a no-op)."""
        # The client is not closed explicitly here.
1 change: 1 addition & 0 deletions langextract/providers/ollama.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
2. Pull the model: ollama pull gemma2:2b
3. Ollama server will start automatically when you use extract()
"""

# pylint: disable=duplicate-code

from __future__ import annotations
Expand Down
1 change: 1 addition & 0 deletions langextract/providers/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

"""OpenAI provider for LangExtract."""

# pylint: disable=duplicate-code

from __future__ import annotations
Expand Down
Loading
Loading