diff --git a/docs/cloud/features/02_integrations/02_data_catalogs.md b/docs/cloud/features/02_integrations/02_data_catalogs.md index 65b868f4a12..f65309be252 100644 --- a/docs/cloud/features/02_integrations/02_data_catalogs.md +++ b/docs/cloud/features/02_integrations/02_data_catalogs.md @@ -18,20 +18,25 @@ Through the integration, your catalog's tables will appear as queryable database Setup is available both via SQL command ([DataLakeCatalog](/engines/database-engines/datalakecatalog)) and via the ClickHouse Cloud UI on the Data Sources tab. Using the UI: + - Simplifies setup with a form using fields consistent with your Data Catalog objects - Provides a single interface for active data catalog integrations - Tests connections and credentials when saving ClickHouse Cloud UI with data catalog integrations -| Name | Open Table Format Supported | Auth Method | Support | Version | -|------|-----------------------------|----------------------------------------|---------|---------| -| AWS Glue Catalog | Iceberg | IAM/Access keys | Cloud & [Core](/use-cases/data-lake/glue-catalog) | 25.10+ | -| Lakekeeper | Iceberg | OAuth client credentials | [Core](/use-cases/data-lake/lakekeeper-catalog) | 25.10+ | -| Microsoft OneLake | Iceberg | Azure Active Directory (AAD) | Cloud & [Core](/use-cases/data-lake/onelake-catalog) | 25.12+ | -| Nessie | Iceberg | OAuth client credentials | [Core](/use-cases/data-lake/nessie-catalog) | 25.10+ | -| Polaris/Open Catalog | Iceberg | OAuth client credentials | [Core](/use-cases/data-lake/polaris-catalog) | 26.1+ | -| REST catalog | Iceberg | OAuth client credentials, Bearer token | Cloud & [Core](/use-cases/data-lake/rest-catalog) | 25.10+ | -| Unity Catalog | Iceberg (UniForm-enabled and managed), Delta | OAuth client credentials | Cloud (Iceberg only) & [Core](/use-cases/data-lake/unity-catalog) | 25.10+ | - -We have more catalogs planned, including Horizon and S3 tables REST endpoint. +For a step-by-step Cloud UI walkthrough, see [Connect a data catalog in ClickHouse Cloud](/integrations/data-catalogs). + +| Name | Open table format | Auth method | Cloud | Core | Version | +| ----------------- | ----------------- | -------------------------------------- | ---------------------------------------------------------- | ------------------------------------------------ | ------- | +| AWS Glue Catalog | Iceberg | IAM role (26.2+), Access keys | [Guide](/integrations/data-catalogs?catalog=aws-glue#add-your-catalog-connection) | [Guide](/use-cases/data-lake/glue-catalog) | 25.10+ | +| BigLake Metastore | Iceberg | Google ADC (OAuth) | [Guide](/integrations/data-catalogs?catalog=biglake#add-your-catalog-connection) | [Guide](/use-cases/data-lake/biglake-catalog) | 26.2+ | +| Lakekeeper | Iceberg | OAuth client credentials | — | [Guide](/use-cases/data-lake/lakekeeper-catalog) | 25.10+ | +| Microsoft OneLake | Iceberg | Azure Active Directory (AAD) | [Guide](/integrations/data-catalogs?catalog=onelake#add-your-catalog-connection) | [Guide](/use-cases/data-lake/onelake-catalog) | 25.12+ | +| Nessie | Iceberg | OAuth client credentials | — | [Guide](/use-cases/data-lake/nessie-catalog) | 25.10+ | +| Polaris | Iceberg | OAuth client credentials | [Guide](/integrations/data-catalogs?catalog=polaris#add-your-catalog-connection) | [Guide](/use-cases/data-lake/polaris-catalog) | 26.1+ | +| REST catalog | Iceberg | OAuth client credentials, Bearer token | [Guide](/integrations/data-catalogs?catalog=rest#add-your-catalog-connection) | [Guide](/use-cases/data-lake/rest-catalog) | 25.10+ | +| Unity Catalog | Iceberg | OAuth client credentials | [Guide](/integrations/data-catalogs?catalog=unity-iceberg#add-your-catalog-connection) | [Guide](/use-cases/data-lake/unity-catalog) | 25.10+ | +| Unity Catalog | Delta | Personal Access Token (PAT) | [Guide](/integrations/data-catalogs?catalog=unity-delta#add-your-catalog-connection) | [Guide](/use-cases/data-lake/unity-catalog) | 25.10+ | + +We have more catalogs planned, including Horizon and S3 tables REST endpoint. \ No newline at end of file diff --git a/docs/integrations/data-catalogs/index.md b/docs/integrations/data-catalogs/index.md new file mode 100644 index 00000000000..dfdffba0b1d --- /dev/null +++ b/docs/integrations/data-catalogs/index.md @@ -0,0 +1,257 @@ +--- +sidebar_label: 'Data catalogs' +description: 'How to connect a data catalog for Iceberg or Delta tables in ClickHouse Cloud via the Data sources UI.' +slug: /integrations/data-catalogs +title: 'Connect a data catalog in ClickHouse Cloud' +doc_type: 'guide' +keywords: ['data catalogs', 'data lake', 'iceberg', 'unity catalog', 'glue', 'delta lake', 'onelake', 'polaris', 'biglake', 'clickhouse cloud'] +integration: + - support_level: 'core' + - category: 'data_lake' +--- + +import Image from '@theme/IdealImage'; +import BetaBadge from '@theme/badges/BetaBadge'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import catalog_flyout_select from '@site/static/images/integrations/data-catalogs/catalog-flyout-select.png'; +import linked_catalogs_table from '@site/static/images/integrations/data-catalogs/linked-catalogs-table.png'; +import catalog_tables_browser from '@site/static/images/integrations/data-catalogs/catalog-tables-browser.png'; +import catalog_sql_query from '@site/static/images/integrations/data-catalogs/catalog-sql-query.png'; +import data_catalogs_ui from '@site/static/images/cloud/features/data-catalogs-ui.png'; + + + +# Connect a data catalog in ClickHouse Cloud + +Connect ClickHouse Cloud to your data catalogs to access your open table format tables. You can set up connections in the **Data sources** UI. For setup via SQL, use the [`DataLakeCatalog`](/engines/database-engines/datalakecatalog) database engine in your SQL editor of choice. + +ClickHouse Cloud UI with data catalog integrations + +Once connected, catalog tables show up in the SQL console under the database name you choose. You can query them with standard ClickHouse SQL, join them with [MergeTree](/engines/table-engines/mergetree-family/mergetree) tables, and use them as sources for [materialized views](/materialized-views). + +## Prerequisites {#prerequisites} + +Before you connect a catalog, confirm the following: + +- **Service permissions.** You need the `control-plane:service:manage` permission to access the **Data sources** page and add catalogs. +- **Running service.** If the service is idle, wake it from the **Data sources** or the service settings pages before connecting or viewing linked catalogs. +- **Catalog credentials.** Gather connection details for your catalog type before opening the flyout. Each catalog uses different fields and authentication — see [Add your catalog connection](#add-your-catalog-connection) below. + +## Connect your catalog {#connect-your-catalog} + + + +### Open the data catalog flyout {#open-flyout} + +1. Select **Data sources** in the left navigation. +2. Click **+ Add catalog** if you haven't set up any data sources. Otherwise, click **Add data source** > **Add data lake catalog**. +3. In the **Connect your data catalog** flyout, select your catalog from the **Select catalog** dropdown. If the catalog supports multiple open table formats, choose the format in **Open table format**. + +Connect your data catalog flyout with Select catalog dropdown showing AWS Glue, BigLake Metastore, Microsoft OneLake, Polaris, REST Catalog, and Unity Catalog + +### Add your catalog connection {#add-your-catalog-connection} + +Select your catalog below for guidance and prerequisites. Then fill in the connection parameters, along with a **Database name** — the ClickHouse database that exposes your catalog tables in the SQL console. + + + + +[AWS Glue Catalog](/use-cases/data-lake/glue-catalog) exposes [Iceberg](/engines/table-engines/integrations/iceberg) tables registered in the Glue Data Catalog. + +Before you connect, confirm: + +- ClickHouse version 25.12+. +- Iceberg tables are registered in AWS Glue Data Catalog in your target region. +- For access key authentication, you have an IAM user access key with permissions to read Glue metadata and the underlying S3 objects. +- For IAM role authentication (26.2+), you have an IAM role that trusts your ClickHouse service role. Include the service role ARN from **Settings → Network security information** in the role trust policy. See [Accessing Iceberg data securely](/cloud/data-sources/secure-iceberg) for IAM policy examples. + +In the flyout, enter your AWS **Region** (e.g. `us-west-2`), then choose an authentication method: + +**Access key authentication** + +1. Select **AWS Access Key** as the **Authentication method**. +2. Enter your **Access Key ID** and **Secret Access Key**. +3. Enter a **Database name** for the ClickHouse database that exposes your Glue tables. + +**IAM role authentication (26.2+)** + +1. Select **AWS IAM Role** as the **Authentication method**. +2. Copy the **Service role ID (IAM)** from the flyout panel and add it to your IAM role trust policy. +3. Enter your **AWS Role ARN** and an optional **AWS Role Session Name**. +4. Enter a **Database name** for the ClickHouse database that exposes your Glue tables. + +:::note +Glue supports multiple table formats, but ClickHouse only reads **Iceberg** tables from Glue. +::: + +
+
+ + +Query Unity Catalog managed [Iceberg](/engines/table-engines/integrations/iceberg) tables using OAuth client credentials from a Databricks service principal. See the [Unity Catalog guide](/use-cases/data-lake/unity-catalog#read-iceberg) for full setup. + +Before you connect, confirm: + +- ClickHouse version 25.12+. +- [Unity Catalog is configured for external data access](https://docs.databricks.com/aws/en/external-access/admin). +- Databricks [service principal](https://docs.databricks.com/aws/en/dev-tools/auth/oauth-m2m) with OAuth client ID and secret. The service principal has `USE CATALOG`, `USE SCHEMA`, `USE EXTERNAL SCHEMA` and `SELECT` privileges on the tables you want to query. + +In the flyout: + +1. Enter your Databricks **Workspace URL** (e.g. `dbc-1234567a-cbde`). +2. Enter the **Databricks catalog name** to connect (e.g. `icebench`). +3. Enter the OAuth **Client ID** and **Client secret** for your service principal. +4. Enter a **Database name** for the ClickHouse database that exposes your Unity Catalog tables. +
+
+ + +Query Unity Catalog [Delta Lake](/engines/table-engines/integrations/deltalake) tables using a Databricks Personal Access Token (PAT). See the [Unity Catalog guide](/use-cases/data-lake/unity-catalog#read-delta) for full setup. + +Before you connect, confirm: + +- ClickHouse version 25.12+. +- [Unity Catalog is configured for external data access](https://docs.databricks.com/aws/en/external-access/admin). +- Databricks [Personal Access Token](https://docs.databricks.com/aws/en/dev-tools/auth/pat) with at least `EXTERNAL USE SCHEMA`, `USE CATALOG`, `USE SCHEMA`, and `SELECT` on the target tables. + +In the flyout: + +1. Enter your Databricks **Workspace URL** (e.g. `dbc-1234567a-cbde.azuredatabricks.net`). +2. Enter the **Databricks catalog name** to connect. +3. Enter your **Personal Access Token**. +4. Enter a **Database name** for the ClickHouse database that exposes your Delta tables. + +:::note +Iceberg and Delta use different authentication in the UI. This will require two separate ClickHouse databases to access both types of tables. +::: +
+
+ + +Connect to any catalog that implements the [Iceberg REST Catalog](https://github.com/apache/iceberg/blob/main/open-api/rest-catalog-open-api.yaml) specification. See the [REST catalog guide](/use-cases/data-lake/rest-catalog) for full setup. + +Before you connect, confirm: + +- ClickHouse version 25.12+. +- Your REST catalog endpoint is reachable from ClickHouse Cloud. +- You have OAuth client credentials or a bearer token, depending on your catalog configuration. +- You have an S3 or compatible **Storage Endpoint** URI for table data (e.g. `s3://my-bucket/path`). + +In the flyout: + +1. Enter the **Catalog URL** (e.g. `https://catalog.example.com/v1`). +2. Enter the **Warehouse** or catalog namespace (e.g. `demo`). +3. Enter the **Storage Endpoint** URI prefix for table storage. +4. Select an **Authentication method**: **OAuth Client Credentials** or **Bearer Token**, then enter the matching credentials. +5. Enter a **Database name** for the ClickHouse database that exposes your REST catalog tables. +
+
+ + +Query [Iceberg](/engines/table-engines/integrations/iceberg) tables in Microsoft Fabric OneLake using Azure AD application credentials. See the [Fabric OneLake guide](/use-cases/data-lake/onelake-catalog) for full setup. + +Before you connect, confirm: + +- ClickHouse version 25.12+. +- Iceberg tables exist in a Fabric workspace. +- You have an Entra ID (Azure AD) application with client ID and secret. +- You have your tenant ID, workspace ID, and a data item ID. Use your **Lakehouse ID** from the Lakehouse page URL. See [Microsoft OneLake prerequisites](https://learn.microsoft.com/en-us/fabric/onelake/table-apis/table-apis-overview#prerequisites) for help locating these values. + +In the flyout: + +1. Enter your Fabric **Workspace ID**. +2. Enter the **Data Item ID** — use your Lakehouse ID. Warehouse IDs aren't supported. +3. Enter your Entra ID **Tenant ID**, **Application (client) ID**, and **Client secret**. +4. Enter a **Database name** for the ClickHouse database that exposes your OneLake tables. +
+
+ + +Connect to a Snowflake Open Catalog (Polaris) deployment for [Iceberg](/engines/table-engines/integrations/iceberg) tables. See the [Polaris catalog guide](/use-cases/data-lake/polaris-catalog) for full setup. + +Before you connect, confirm: + +- ClickHouse version 26.2+. +- You have a Polaris catalog with OAuth client credentials. +- You have a storage endpoint URI for Iceberg table data (e.g. `s3://company-iceberg-prod/warehouse/`). + +In the flyout: + +1. Enter the **Catalog Account Identifier** (e.g. `ab12345.snowflakecomputing.com`). +2. Enter the **Catalog Name** (e.g. `snowflake_open_catalog`). +3. Enter the OAuth **Client ID** and **Client Secret**. +4. Enter the **Storage Endpoint** URI prefix for table storage. +5. Enter a **Database name** for the ClickHouse database that exposes your Polaris tables. +
+
+ + +Connect to Google Cloud BigLake Metastore (aka Lakehouse runtime catalog) for [Iceberg](/engines/table-engines/integrations/iceberg) tables in GCS. See the [BigLake Metastore guide](/use-cases/data-lake/biglake-catalog) for full setup. + +Before you connect, confirm: + +- ClickHouse version 26.2+. +- You have a BigLake Metastore instance with Iceberg tables in GCS. +- You have Google Application Default Credentials (ADC) with client ID, client secret, refresh token, and quota project ID. + +In the flyout: + +1. Enter your **Google ADC Client ID**, **Client Secret**, **Refresh Token**, and **Quota Project ID**. +2. Enter the **Cloud Storage Bucket** URI for table data (e.g. `gs://biglake-public-nyc-taxi-iceberg`). +3. Enter a **Database name** for the ClickHouse database that exposes your BigLake tables. +
+
+ +
+
+ +### Save the connection {#save-connection} + +After filling in the fields: + +1. Click **Add catalog**. ClickHouse validates the connection and credentials when saving. +2. On success, a confirmation toast appears with a **View in SQL console** link. Your catalog is listed in the **Linked catalogs** table with its connection status and table count. + +From the **Actions** menu on a linked catalog row, you can drop the catalog connection. Dropping removes the catalog's database from ClickHouse — it doesn't delete any data in your external catalog. + +
+ +## Query your data {#query-data} + + + +### View your catalog tables {#view-tables} + +On the **Data sources** page, find your catalog in the **Linked catalogs** table and click **View tables**. + +Linked catalogs table with View tables action + +ClickHouse opens the SQL console with your catalog database selected and lists the available tables. + +:::note +Tables may not appear immediately after you connect a catalog. Depending on the catalog, it can take up to 3 minutes for all tables to be listed. +::: + +SQL console table browser showing catalog tables + +### Run a query {#run-a-query} + +Write a query in the SQL editor and click **Run**. Wrap the full table name in backticks: + +```sql +SELECT * FROM `identity_profiles.identity_profiles_iceberg` +``` + +SQL query with results using backticks for dotted table name + + + +## Troubleshooting {#troubleshooting} + +- If you don't see your tables in the SQL console: verify credentials, network access, and table types in the catalog. Make sure the tables you expect to see are in supported file and table formats. + +## See also {#see-also} + +- [Accelerating analytics on lakehouse data](/use-cases/data-lake/getting-started/accelerating-analytics) — load catalog tables into MergeTree for repeated queries +- [Accessing Iceberg data securely](/cloud/data-sources/secure-iceberg) — IAM role setup for AWS Iceberg and Glue access diff --git a/i18n/en/code.json b/i18n/en/code.json index 97c5b080f5b..79aa8d66953 100644 --- a/i18n/en/code.json +++ b/i18n/en/code.json @@ -1533,6 +1533,12 @@ "sidebar.dropdownCategories.category.Integrations.Data sources.description": { "message": "Load data into ClickHouse from your prefered source" }, + "sidebar.dropdownCategories.category.Integrations.Data lakes": { + "message": "Data lakes" + }, + "sidebar.dropdownCategories.category.Integrations.Data lakes.description": { + "message": "Connect a data catalog for Iceberg or Delta tables" + }, "sidebar.dropdownCategories.category.Integrations.Data visualization": { "message": "Data visualization" }, diff --git a/sidebars.js b/sidebars.js index 290a020fb6a..2c7a0020ef4 100644 --- a/sidebars.js +++ b/sidebars.js @@ -1035,6 +1035,15 @@ const sidebars = { }, ], }, + { + type: 'category', + label: 'Data lakes', + collapsed: true, + collapsible: true, + items: [ + 'integrations/data-catalogs/index', + ], + }, { type: 'category', label: 'Data visualization', diff --git a/src/theme/Navbar/Content/MenuData.js b/src/theme/Navbar/Content/MenuData.js index 10aa4c91490..6046df1e25d 100644 --- a/src/theme/Navbar/Content/MenuData.js +++ b/src/theme/Navbar/Content/MenuData.js @@ -252,6 +252,12 @@ export const dropdownCategories = [ description: Load data into ClickHouse from your prefered source, href: "/integrations/data-sources/index" }, + { + type: "link", + label: Data lakes, + description: Connect a data catalog for Iceberg or Delta tables, + href: "/integrations/data-catalogs" + }, { type: "link", label: Data visualization, diff --git a/static/images/cloud/features/data-catalogs-ui.png b/static/images/cloud/features/data-catalogs-ui.png index 2865c4fbcf3..83798cdf101 100644 Binary files a/static/images/cloud/features/data-catalogs-ui.png and b/static/images/cloud/features/data-catalogs-ui.png differ diff --git a/static/images/integrations/data-catalogs/catalog-flyout-select.png b/static/images/integrations/data-catalogs/catalog-flyout-select.png new file mode 100644 index 00000000000..cd5833f8235 Binary files /dev/null and b/static/images/integrations/data-catalogs/catalog-flyout-select.png differ diff --git a/static/images/integrations/data-catalogs/catalog-sql-query.png b/static/images/integrations/data-catalogs/catalog-sql-query.png new file mode 100644 index 00000000000..c14e56351cc Binary files /dev/null and b/static/images/integrations/data-catalogs/catalog-sql-query.png differ diff --git a/static/images/integrations/data-catalogs/catalog-tables-browser.png b/static/images/integrations/data-catalogs/catalog-tables-browser.png new file mode 100644 index 00000000000..de2c290bbdc Binary files /dev/null and b/static/images/integrations/data-catalogs/catalog-tables-browser.png differ diff --git a/static/images/integrations/data-catalogs/linked-catalogs-table.png b/static/images/integrations/data-catalogs/linked-catalogs-table.png new file mode 100644 index 00000000000..fcbe86ef1da Binary files /dev/null and b/static/images/integrations/data-catalogs/linked-catalogs-table.png differ