johnford2002 · johnford2002 · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026
diff --git a/docs/docs/03-configuration/02-different-ai-providers.md b/docs/docs/03-configuration/02-different-ai-providers.md
@@ -14,6 +14,31 @@ OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
 # INFERENCE_IMAGE_MODEL=gpt-4o-mini
 ```
 
+## Anthropic (Claude)
+
+Karakeep has a native Anthropic provider (it does **not** go through Anthropic's
+OpenAI-compatibility endpoint), so the API itself enforces the tagging JSON schema
+(Structured Outputs) unless the inference output schema mode is set to `plain`.
+
+```
+ANTHROPIC_API_KEY=sk-ant-xxxxxxxxxxxxxxxxxxxxxxxx
+# Optional: point at a gateway/proxy instead of api.anthropic.com
+# ANTHROPIC_BASE_URL=https://your-gateway.example.com
+# Optional: override the default model (claude-haiku-4-5)
+# INFERENCE_TEXT_MODEL=claude-haiku-4-5
+# INFERENCE_IMAGE_MODEL=claude-haiku-4-5
+```
+
+Notes:
+
+- The Anthropic provider is used only when `ANTHROPIC_API_KEY` is set and
+  `OPENAI_API_KEY` is not. If both are set, OpenAI takes precedence — unset it to use Claude.
+- The default model is `claude-haiku-4-5` (cheap and fast, suited to
+  high-volume auto-tagging). Set `INFERENCE_TEXT_MODEL` / `INFERENCE_IMAGE_MODEL`
+  to any Claude model (e.g. `claude-sonnet-4-6`) to override.
+- **Embeddings are not supported** by Anthropic. Semantic search needs a
+  separate embedding provider (OpenAI or Ollama).
+
 ## Ollama
 
 Ollama is a local LLM provider that you can use to run your own LLM server. You'll need to pass ollama's address to karakeep and you need to ensure that it's accessible from within the karakeep container (e.g. no localhost addresses).

diff --git a/packages/shared/config.ts b/packages/shared/config.ts
@@ -58,6 +58,8 @@ const allEnv = z.object({
   TURNSTILE_SITE_KEY: z.string().optional(),
   TURNSTILE_SECRET_KEY: z.string().optional(),
   OPENAI_API_KEY: z.string().optional(),
+  ANTHROPIC_API_KEY: z.string().optional(),
+  ANTHROPIC_BASE_URL: z.string().url().optional(),
   OPENAI_BASE_URL: z.string().url().optional(),
   OPENAI_PROXY_URL: z.string().url().optional(),
   OPENAI_TIMEOUT_SEC: z.coerce.number().positive().optional(),
@@ -316,11 +318,16 @@ const serverConfigSchema = allEnv.transform((val, ctx) => {
         : undefined,
     },
     inference: {
-      isConfigured: !!val.OPENAI_API_KEY || !!val.OLLAMA_BASE_URL,
+      isConfigured:
+        !!val.OPENAI_API_KEY ||
+        !!val.ANTHROPIC_API_KEY ||
+        !!val.OLLAMA_BASE_URL,
       numWorkers: val.INFERENCE_NUM_WORKERS,
       jobTimeoutSec: val.INFERENCE_JOB_TIMEOUT_SEC,
       fetchTimeoutSec: val.INFERENCE_FETCH_TIMEOUT_SEC,
       openAIApiKey: val.OPENAI_API_KEY,
+      anthropicApiKey: val.ANTHROPIC_API_KEY,
+      anthropicBaseUrl: val.ANTHROPIC_BASE_URL,
       openAIBaseUrl: val.OPENAI_BASE_URL,
       openAIProxyUrl: val.OPENAI_PROXY_URL,
       openAITimeoutSec: val.OPENAI_TIMEOUT_SEC,

diff --git a/packages/shared/inference.test.ts b/packages/shared/inference.test.ts
@@ -0,0 +1,108 @@
+import { beforeEach, describe, expect, it, vi } from "vitest";
+import { z } from "zod";
+
+// Stub the Anthropic SDK: its default export becomes a class whose messages.create is the shared createMock spy.
+const createMock = vi.fn();
+vi.mock("@anthropic-ai/sdk", () => ({
+  default: class {
+    messages = { create: createMock }; // every client instance points at the same spy
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    constructor(public opts: any) {} // keeps the constructor options on the instance as `opts`
+  },
+}));
+
+import { AnthropicInferenceClient } from "./inference";
+
+// Test factory: a client configured with the OpenAI default model ids and
+// structured output, unless `overrides` replaces any of those settings.
+function makeClient(overrides = {}) {
+  const models = { textModel: "gpt-4.1-mini", imageModel: "gpt-4o-mini" };
+  const limits = { apiKey: "test-key", maxOutputTokens: 100 };
+  const outputSchema = "structured" as const;
+  const merged = { ...limits, ...models, outputSchema, ...overrides };
+
+  return new AnthropicInferenceClient(merged);
+}
+
+// Each test starts with a clean spy that resolves to one text block plus usage counts.
+beforeEach(() => {
+  createMock.mockReset().mockResolvedValue({
+    content: [{ type: "text", text: '{"tags":["a"]}' }],
+    usage: { input_tokens: 10, output_tokens: 5 },
+  });
+});
+
+// Text inference: checks model selection, the request body sent to the SDK,
+// and when output_config (Structured Outputs) is attached or left out.
+describe("AnthropicInferenceClient text inference", () => {
+  // Request body handed to the first messages.create call of the current test.
+  const sentBody = () => createMock.mock.calls[0][0];
+  const tagSchema = z.object({ tags: z.array(z.string()) });
+
+  it("substitutes the Claude default when the model is the OpenAI default", async () => {
+    // makeClient() leaves the text model at the OpenAI default id.
+    await makeClient().inferFromText("hi", { schema: null });
+    expect(sentBody().model).toBe("claude-haiku-4-5");
+  });
+
+  it("preserves an explicitly configured Claude model", async () => {
+    const sonnetClient = makeClient({ textModel: "claude-sonnet-4-6" });
+    await sonnetClient.inferFromText("hi", { schema: null });
+    expect(sentBody().model).toBe("claude-sonnet-4-6");
+  });
+
+  it("sends max_tokens and the user message, and returns text + summed tokens", async () => {
+    const result = await makeClient().inferFromText("hello", { schema: null });
+    const { max_tokens, messages } = sentBody();
+    expect(max_tokens).toBe(100);
+    expect(messages).toEqual([{ role: "user", content: "hello" }]);
+    expect(result.response).toBe('{"tags":["a"]}');
+    // 10 input + 5 output tokens from the mocked usage.
+    expect(result.totalTokens).toBe(15);
+  });
+
+  it("attaches output_config json_schema in structured mode when a schema is given", async () => {
+    await makeClient().inferFromText("hi", { schema: tagSchema });
+    const { format } = sentBody().output_config;
+    expect(format.type).toBe("json_schema");
+    expect(format.schema).toBeTypeOf("object");
+  });
+
+  it("omits output_config in plain mode", async () => {
+    const plainClient = makeClient({ outputSchema: "plain" });
+    await plainClient.inferFromText("hi", { schema: tagSchema });
+    expect(sentBody().output_config).toBeUndefined();
+  });
+
+  it("omits output_config when structured mode has no schema (e.g. summarization)", async () => {
+    await makeClient().inferFromText("summarize", { schema: null });
+    expect(sentBody().output_config).toBeUndefined();
+  });
+});
+
+// Image inference: the prompt and a base64 image block go out in one user message.
+describe("AnthropicInferenceClient image inference", () => {
+  it("builds a base64 image content block with the given media type", async () => {
+    const mediaType = "image/png";
+    const plainClient = makeClient({ outputSchema: "plain" });
+    await plainClient.inferFromImage("describe", mediaType, "BASE64DATA", {
+      schema: null,
+    });
+    const { model, messages } = createMock.mock.calls[0][0];
+    expect(model).toBe("claude-haiku-4-5");
+    const imageSource = { type: "base64", media_type: mediaType, data: "BASE64DATA" };
+    expect(messages[0].content).toEqual([
+      { type: "text", text: "describe" },
+      { type: "image", source: imageSource },
+    ]);
+  });
+});
+
+// Embeddings are unsupported: the call must reject with the explanatory message.
+describe("AnthropicInferenceClient embeddings", () => {
+  it("rejects with a clear unsupported error", async () => {
+    const attempt = makeClient().generateEmbeddingFromText(["x"]);
+    const expectedMessage = /does not provide an embeddings API/;
+    await expect(attempt).rejects.toThrow(expectedMessage);
+  });
+});
diff --git a/packages/shared/inference.ts b/packages/shared/inference.ts
@@ -1,3 +1,4 @@
+import Anthropic from "@anthropic-ai/sdk";
 import { Ollama } from "ollama";
 import OpenAI from "openai";
 import { zodResponseFormat } from "openai/helpers/zod";
@@ -157,13 +158,63 @@ export class InferenceClientFactory {
       return OpenAIInferenceClient.fromConfig();
     }
 
+    if (serverConfig.inference.anthropicApiKey) {
+      return AnthropicInferenceClient.fromConfig();
+    }
+
     if (serverConfig.inference.ollamaBaseUrl) {
       return OllamaInferenceClient.fromConfig();
     }
     return null;
   }
 }
 
+const ANTHROPIC_DEFAULT_MODEL = "claude-haiku-4-5"; // Claude model substituted when no Claude model is configured
+const OPENAI_DEFAULT_TEXT_MODEL = "gpt-4.1-mini"; // sentinel: a text model equal to this is treated as "left at the default"
+const OPENAI_DEFAULT_IMAGE_MODEL = "gpt-4o-mini"; // sentinel for the image model; NOTE(review): presumably mirrors the config default — keep in sync
+
+// Picks the model id to send to Anthropic. A value equal to `openAIDefault` is
+// read as "no Claude model configured" and replaced by ANTHROPIC_DEFAULT_MODEL
+// (with an info log); every other id, whatever it is, is passed through as-is.
+function resolveAnthropicModel(model: string, openAIDefault: string): string {
+  if (model !== openAIDefault) {
+    return model;
+  }
+  logger.info(
+    `[inference] No Claude model set for the Anthropic provider; defaulting to ${ANTHROPIC_DEFAULT_MODEL}. Set INFERENCE_TEXT_MODEL/INFERENCE_IMAGE_MODEL to override.`,
+  );
+  return ANTHROPIC_DEFAULT_MODEL;
+}
+
+// Translates Karakeep's output mode into Anthropic's `output_config`. Anthropic
+// has no json_object mode, so "structured" and "json" both use native Structured
+// Outputs; "plain" mode or a missing schema produces no config at all.
+function buildAnthropicOutputConfig(
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  schema: z.ZodSchema<any> | null,
+  outputSchema: "structured" | "json" | "plain",
+) {
+  if (schema && outputSchema !== "plain") {
+    const format = {
+      type: "json_schema" as const,
+      schema: z.toJSONSchema(schema),
+    };
+    return { format };
+  }
+  return undefined;
+}
+
+// Concatenates, in order, the text of every `text` content block in the reply
+// and throws when the result is empty (no text blocks, or only empty ones).
+function extractAnthropicText(message: Anthropic.Message): string {
+  const parts = message.content.flatMap((b) => (b.type === "text" ? b.text : []));
+  const combined = parts.join("");
+  if (combined === "") {
+    throw new Error(`Got no text content from Anthropic`);
+  }
+  return combined;
+}
+
 export class OpenAIInferenceClient implements InferenceClient {
   openAI: OpenAI;
   private config: OpenAIInferenceConfig;
@@ -316,6 +367,134 @@ export class OpenAIInferenceClient implements InferenceClient {
   }
 }
 
+export interface AnthropicInferenceConfig { // constructor options for AnthropicInferenceClient
+  apiKey: string; // handed to the Anthropic SDK client as `apiKey`
+  baseURL?: string; // optional SDK `baseURL` override; passed through as undefined when unset
+  textModel: string; // model for inferFromText, after resolveAnthropicModel substitution
+  imageModel: string; // model for inferFromImage, after resolveAnthropicModel substitution
+  maxOutputTokens: number; // sent as `max_tokens` on every request
+  outputSchema: "structured" | "json" | "plain"; // "plain" never sends output_config; the other two send it when a schema is given
+}
+
+/** InferenceClient backed by Anthropic's native Messages API; embeddings are unsupported. */
+export class AnthropicInferenceClient implements InferenceClient {
+  // Media types allowed in a base64 image block; anything else is rejected up front.
+  private static readonly IMAGE_MEDIA_TYPES = [
+    "image/jpeg",
+    "image/png",
+    "image/gif",
+    "image/webp",
+  ] as const;
+
+  private readonly anthropic: Anthropic;
+  private readonly config: AnthropicInferenceConfig;
+  private readonly textModel: string;
+  private readonly imageModel: string;
+
+  constructor(config: AnthropicInferenceConfig) {
+    this.config = config;
+    this.textModel = resolveAnthropicModel(
+      config.textModel,
+      OPENAI_DEFAULT_TEXT_MODEL,
+    );
+    this.imageModel = resolveAnthropicModel(
+      config.imageModel,
+      OPENAI_DEFAULT_IMAGE_MODEL,
+    );
+    const { apiKey, baseURL } = config;
+    this.anthropic = new Anthropic({ apiKey, baseURL });
+  }
+
+  /** Builds a client from serverConfig; throws when no Anthropic API key is set. */
+  static fromConfig(): AnthropicInferenceClient {
+    const inference = serverConfig.inference;
+    if (!inference.anthropicApiKey) {
+      throw new Error("ANTHROPIC_API_KEY is not configured");
+    }
+    return new AnthropicInferenceClient({
+      apiKey: inference.anthropicApiKey,
+      baseURL: inference.anthropicBaseUrl,
+      textModel: inference.textModel,
+      imageModel: inference.imageModel,
+      maxOutputTokens: inference.maxOutputTokens,
+      outputSchema: inference.outputSchema,
+    });
+  }
+
+  /** Sends a text-only prompt to the text model. */
+  async inferFromText(
+    prompt: string,
+    _opts: Partial<InferenceOptions>,
+  ): Promise<InferenceResponse> {
+    return this.complete(this.textModel, prompt, _opts);
+  }
+
+  /**
+   * Sends a prompt plus one base64-encoded image to the image model.
+   * @throws Error when `contentType` is not one of IMAGE_MEDIA_TYPES.
+   */
+  async inferFromImage(
+    prompt: string,
+    contentType: string,
+    image: string,
+    _opts: Partial<InferenceOptions>,
+  ): Promise<InferenceResponse> {
+    const mediaType = AnthropicInferenceClient.IMAGE_MEDIA_TYPES.find(
+      (supported) => supported === contentType,
+    );
+    if (!mediaType) {
+      throw new Error(`Unsupported image type for Anthropic: ${contentType}`);
+    }
+    return this.complete(
+      this.imageModel,
+      [
+        { type: "text", text: prompt },
+        {
+          type: "image",
+          source: { type: "base64", media_type: mediaType, data: image },
+        },
+      ],
+      _opts,
+    );
+  }
+
+  /** Always rejects: Anthropic provides no embeddings API (see the error text). */
+  generateEmbeddingFromText(_inputs: string[]): Promise<EmbeddingResponse> {
+    return Promise.reject(
+      new Error(
+        "Anthropic does not provide an embeddings API. Configure a separate embedding provider (e.g. OpenAI or Ollama) for semantic search.",
+      ),
+    );
+  }
+
+  /** Sends one single-turn user message and maps the reply to an InferenceResponse. */
+  private async complete(
+    model: string,
+    content: Anthropic.MessageParam["content"],
+    opts: Partial<InferenceOptions>,
+  ): Promise<InferenceResponse> {
+    const merged: InferenceOptions = { ...defaultInferenceOptions, ...opts };
+    const outputConfig = buildAnthropicOutputConfig(
+      merged.schema,
+      this.config.outputSchema,
+    );
+    const message = await this.anthropic.messages.create(
+      {
+        model,
+        max_tokens: this.config.maxOutputTokens,
+        messages: [{ role: "user", content }],
+        ...(outputConfig ? { output_config: outputConfig } : {}),
+      },
+      { signal: merged.abortSignal },
+    );
+    const { input_tokens, output_tokens } = message.usage;
+    return {
+      response: extractAnthropicText(message),
+      totalTokens: (input_tokens ?? 0) + (output_tokens ?? 0),
+    };
+  }
+}
+
 export interface OllamaInferenceConfig {
   baseUrl: string;
   textModel: string;

diff --git a/packages/shared/package.json b/packages/shared/package.json
@@ -5,6 +5,7 @@
   "private": true,
   "type": "module",
   "dependencies": {
+    "@anthropic-ai/sdk": "^0.104.1",
     "@aws-sdk/client-s3": "^3.1014.0",
     "glob": "^11.0.0",
     "html-to-text": "^9.0.5",