From b19f17f8b3f0461d342483471c5f0d517760aae3 Mon Sep 17 00:00:00 2001 From: Pierre Zemb Date: Wed, 25 Jun 2025 13:55:28 +0200 Subject: [PATCH 1/4] add flake --- .envrc | 1 + .gitignore | 1 + flake.lock | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ flake.nix | 26 +++++++++++++++++++++++ 4 files changed, 89 insertions(+) create mode 100644 .envrc create mode 100644 flake.lock create mode 100644 flake.nix diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..3550a30 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +use flake diff --git a/.gitignore b/.gitignore index 7585238..63d69d5 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ book +.direnv \ No newline at end of file diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..5c7fe9a --- /dev/null +++ b/flake.lock @@ -0,0 +1,61 @@ +{ + "nodes": { + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1731533236, + "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1750741721, + "narHash": "sha256-Z0djmTa1YmnGMfE9jEe05oO4zggjDmxOGKwt844bUhE=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "4b1164c3215f018c4442463a27689d973cffd750", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..112c177 --- /dev/null +++ b/flake.nix @@ -0,0 +1,26 @@ +{ + description = "A dev environment for the fdb-book"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = { self, nixpkgs, flake-utils }: + flake-utils.lib.eachDefaultSystem (system: + let + pkgs = import nixpkgs { + inherit system; + }; + in + { + devShells.default = pkgs.mkShell { + buildInputs = with pkgs; [ + mdbook + mdbook-toc + mdbook-mermaid + mdbook-linkcheck + ]; + }; + }); +} From bb6c559f9165f0b9751769a9de0c58bb263f3031 Mon Sep 17 00:00:00 2001 From: Pierre Zemb Date: Wed, 25 Jun 2025 14:23:13 +0200 Subject: [PATCH 2/4] ai improvements --- src/SUMMARY.md | 24 +- src/develop_layer/crafting-row-keys.md | 261 +++++-------------- src/develop_layer/studiable-layers.md | 53 ++-- src/develop_layer/tips.md | 87 +++---- src/develop_layer/transactions.md | 63 ++++- src/getting_started/fdbcli.md | 246 ++++------------- src/getting_started/installation.md | 46 ++++ src/getting_started/installing-fdb.md | 52 ---- src/internals/roles.md | 46 ++++ src/internals/the-read-path.md | 34 +++ src/internals/the-write-path.md | 47 ++++ src/meet_fdb/another_db.md | 170 ++++++------ src/meet_fdb/correctess.md | 187 ------------- src/meet_fdb/correctness.md | 57 ++++ src/meet_fdb/enter_fdb.md | 96 ++++--- src/meet_fdb/everything_is_kv.md | 101 ++----- src/the-record-layer/quick.md | 1 + src/the-record-layer/what-is-record-layer.md | 1 + src/welcome.md | 21 +- 19 files changed, 653 insertions(+), 940 deletions(-) create mode 100644 src/getting_started/installation.md delete mode 100644 src/getting_started/installing-fdb.md create mode 100644 src/internals/roles.md create mode 100644 src/internals/the-read-path.md create mode 100644 src/internals/the-write-path.md delete mode 100644 src/meet_fdb/correctess.md create mode 100644 src/meet_fdb/correctness.md create mode 100644 src/the-record-layer/quick.md create mode 100644 src/the-record-layer/what-is-record-layer.md diff --git a/src/SUMMARY.md b/src/SUMMARY.md index 082b64e..920b19b 100644 --- a/src/SUMMARY.md +++ b/src/SUMMARY.md @@ -7,26 +7,26 @@ - [Yet another database?](./meet_fdb/another_db.md) - [Enter FoundationDB](./meet_fdb/enter_fdb.md) - [Everything is a key-value!](./meet_fdb/everything_is_kv.md) -- [Correct and robust, choose both](./meet_fdb/correctess.md) +- [A Culture of Correctness](./meet_fdb/correctness.md) # Getting started with FoundationDB -- [Installing FoundationDB](./getting_started/installing-fdb.md) -- [Playing with `fdbcli`](./getting_started/fdbcli.md) +- [Installation](./getting_started/installation.md) +- [The `fdbcli` Command-Line Interface](./getting_started/fdbcli.md) # Developing a layer -- [ACID transactions](./develop_layer/transactions.md) -- [Crafting row keys](./develop_layer/crafting-row-keys.md) -- [Existing open-source layers](./develop_layer/studiable-layers.md) -- [Tips and tricks](./develop_layer/tips.md) +- [Core Concept: Transactions](./develop_layer/transactions.md) +- [Key Design: Tuples, Subspaces, and Directories](./develop_layer/crafting-row-keys.md) +- [Learning from the Community: Open-Source Layers](./develop_layer/studiable-layers.md) +- [Best Practices and Pitfalls](./develop_layer/tips.md) # FoundationDB's internals -- [TODO: the list of roles]() -- [TODO: the write-path]() -- [TODO: the read-path]() +- [Anatomy of a Cluster: Roles](./internals/roles.md) +- [The Write Path](./internals/the-write-path.md) +- [The Read Path](./internals/the-read-path.md) # The Record-Layer -- [TODO: What is the Record-layer?]() -- [TODO: QuiCK]() +- [What is the Record-layer?](./the-record-layer/what-is-record-layer.md) +- [QuiCK](./the-record-layer/quick.md) diff --git a/src/develop_layer/crafting-row-keys.md b/src/develop_layer/crafting-row-keys.md index 28d016b..f9f6bd9 100644 --- a/src/develop_layer/crafting-row-keys.md +++ b/src/develop_layer/crafting-row-keys.md @@ -1,235 +1,106 @@ -# Crafting row keys +# Key Design: Tuples, Subspaces, and Directories -## Row key? +In a key-value store, the way you structure your keys is one of the most important architectural decisions you will make. Your key schema determines how your data is organized, how efficiently you can query it, and how well your workload will scale. This chapter introduces FoundationDB's powerful, layered abstractions for key management. -When you are using a key/value store, the design of the `row key` is extremely important, as this will define how well: +## The Challenge: Hand-Crafting Keys -* your scans will be optimized, -* your puts will be spread, -* you will avoid `hot-spotting` a shard/region. +At the lowest level, a key is just a sequence of bytes. In many key-value systems, developers are forced to manually craft these byte arrays, a process that is both tedious and error-prone. -If you need more information on `row keys`, I recommend going through these links before moving on: +Consider storing user data. You might decide on a key structure like `(user_id, attribute_name)`. To implement this, you would need to write code that serializes the `user_id` (an integer) and the `attribute_name` (a string) into a single byte array, taking care to handle different data types, lengths, and ordering correctly. This is brittle; a small change in the format can break your application. -* ["Designing your schema" BigTable documentation](https://cloud.google.com/bigtable/docs/schema-design) -* ["Rowkey Design" HBase documentation](https://hbase.apache.org/book.html#rowkey.design) +FoundationDB provides a much better way. -## Hand-crafting row keys +## Layer 1: The `Tuple` -Most of the time, you will need to craft the `row key` "by hand", like this for [an HBase's app](https://github.com/senx/warp10-platform/blob/879734d7f63791b487f3e535cd79ac4c23e99377/warp10/src/main/java/io/warp10/continuum/store/Store.java#L1215-L1222): +The most fundamental building block for key design is the **Tuple**. A tuple is an ordered collection of elements of different types (like strings, integers, and UUIDs). The FoundationDB client libraries provide a `pack()` function that serializes a tuple into a byte string that correctly preserves type and ordering. -```java -// Prefix + classId + labelsId + timestamp -// 128 bits -byte[] rowkey = new byte[Constants.HBASE_RAW_DATA_KEY_PREFIX.length + 8 + 8 + 8]; +```python +import fdb.tuple -System.arraycopy(Constants.HBASE_RAW_DATA_KEY_PREFIX, 0, rowkey, 0, Constants.HBASE_RAW_DATA_KEY_PREFIX.length); -// Copy classId/labelsId -System.arraycopy(Longs.toByteArray(msg.getClassId()), 0, rowkey, Constants.HBASE_RAW_DATA_KEY_PREFIX.length, 8); -System.arraycopy(Longs.toByteArray(msg.getLabelsId()), 0, rowkey, Constants.HBASE_RAW_DATA_KEY_PREFIX.length + 8, 8); -``` - -Or maybe you will wrap things in a function [like this in Go](https://github.com/pingcap/tidb/blob/ef57bdbbb04f60a8be744060a99207e08a37514a/tablecodec/tablecodec.go#L80-L86): - -```go -// EncodeRowKey encodes the table id and record handle into a kv.Key -func EncodeRowKey(tableID int64, encodedHandle []byte) kv.Key { - buf := make([]byte, 0, prefixLen+len(encodedHandle)) - buf = appendTableRecordPrefix(buf, tableID) - buf = append(buf, encodedHandle...) - return buf -} -``` +# A tuple containing a string and an integer +user_profile_tuple = ('user', 12345) -Each time, you need to wrap the complexity of converting your objects to a row-key, by creating a buffer and write stuff in it. +# Pack the tuple into a byte key +key = fdb.tuple.pack(user_profile_tuple) -In our Java example, there is an interesting comment: +# The result is a byte string suitable for use as a key +# b'\x02user\x00\x1509' +print(repr(key)) -```java -// Prefix + classId + labelsId + timestamp +# You can unpack the bytes back into the original tuple +unpacked_tuple = fdb.tuple.unpack(key) +assert unpacked_tuple == user_profile_tuple ``` -If we are replacing some characters, we are not really far from: +This simple abstraction solves the manual serialization problem. You can think in terms of structured data, and the tuple layer handles the byte-level representation for you. Because the packing format is standardized across all language bindings, a key packed in Python can be unpacked in Go, Java, or any other language. -```java -// (Prefix, classId, labelsId, timestamp) -``` - -Which looks like a `Tuple`(a collection of values of different types) and this is what FoundationDB is using as an abstraction to create keys 😍 +## Layer 2: The `Subspace` -## FDB's abstractions and helpers +Building on tuples, the next layer of abstraction is the **Subspace**. A subspace is a way to create a dedicated namespace within the database for a particular category of data. It works by defining a prefix tuple that is automatically prepended to all keys packed within that subspace. -### Tuple +This is a powerful organizational tool. For example, you can create separate subspaces for user data, application settings, and logging events. -Instead of crafting bytes by hand, we are `packing` a Tuple: +```python +import fdb.tuple -```rust -// create a Tuple with ("tenant-42", 1) -let tuple = (String::from("tenant-42"), 1); +# Create a subspace for storing user profiles +user_subspace = fdb.Subspace(('users',)) -// and compute a row-key from the Tuple -let row_key = foundationdb::tuple::pack::<(String, i64)>(&tuple); -``` +# Now, keys created within this subspace will be prefixed +# with the packed representation of ('users',) +key1 = user_subspace.pack(('alice',)) +key2 = user_subspace.pack(('bob',)) -The generated row-key will be readable from any bindings, as it's construction is standardized. Let's print it: +# key1 is now b'\x02users\x00\x02alice\x00' +# key2 is now b'\x02users\x00\x02bob\x00' +print(repr(key1)) +print(repr(key2)) -```rust -// and print-it in hexa -println!("{:#04X?}", row_key); +# You can also use the subspace to unpack a key, +# which strips the prefix automatically. +unpacked = user_subspace.unpack(key1) +assert unpacked == ('alice',) ``` -```log -// can be verified with https://www.utf8-chartable.de/unicode-utf8-table.pl -[ - 0x02, - 0x74, // t - 0x65, // e - 0x6E, // n - 0x61, // a - 0x6E, // n - 0x74, // t - 0x2D, // - - 0x31, // 1 - 0x00, - 0x15, - 0x2A, // 42 -] -``` +Subspaces allow you to isolate data and perform range scans over a specific category of information without worrying about colliding with other parts of your keyspace. -As you can see, `pack` added some extra-characters. There are used to recognized the next type, a bit like when you are encoding/decoding some wire protocols. You can find the relevant documentation [here](https://github.com/apple/foundationdb/blob/master/design/tuple.md). +## Layer 3: The `Directory` -Having this kind of standard means that we can easily decompose/`unpack` it: +The highest level of abstraction is the **Directory**. Directories are a tool for managing subspaces. While you can create subspaces with fixed prefixes (like `('users',)`), directories allow you to create and manage them dynamically. -```rust -// retrieve the user and the magic number In a Tuple (String, i64) -let from_row_key = foundationdb::tuple::unpack::<(String, i64)>(&row_key)?; - -println!("user='{}', magic_number={}", from_row_key.0, from_row_key.1); -// user='tenant-42', magic_number=42 -``` - -Now that we saw `Tuples`, let's dig in the next abstraction: `subspaces` - -### Subspace - -When you are working with key-values store, we are often playing with what we call `keyspaces`, by dedicating a portion of the key to an usage, like this for example: - -```text -/users/tenant-1/... -/users/tenant-2/... -/users/tenant-3/... -``` +> Directories are the recommended approach for organizing the keyspace of one or more applications. -Here, `/users/tenant-1/` can be view like a prefix where we will put all the relevant keys. Instead of passing a simple prefix, FoundationDB is offering a dedicated structure called a `Subspace`: +A directory allows you to associate a human-readable path (like `('users', 'profiles')`) with a short, randomly generated integer prefix. This has two major benefits: -> A Subspace represents a well-defined region of keyspace in a FoundationDB database +1. **Shorter Keys:** The generated prefix is much shorter than the packed representation of the full path, saving space. +2. **Schema Management:** You can list, move, and remove directories. Moving a directory is a fast metadata-only change; it doesn't require rewriting all the keys within it. -> It provides a convenient way to use FoundationDB tuples to define namespaces for different categories of data. The namespace is specified by a prefix tuple which is prepended to all tuples packed by the subspace. When unpacking a key with the subspace, the prefix tuple will be removed from the result. +```python +import fdb +import fdb.directory -As you can see, the `Subspace` is heavily relying on FoundationDB's tuples, as we can `pack` and `unpack` it. +fdb.api_version(710) +db = fdb.open() -> As a best practice, API clients should use at least one subspace for application data. +# Create or open a directory at a specific path +app_dir = fdb.directory.create_or_open(db, ('my-app',)) -Well, as we have now the tools to handle keyspaces easily, it is now futile to craft keys by hand 🙃 Let's create a subspace! +# Create subspaces within that directory +users_subspace = app_dir.create_or_open(db, ('users',)) +logs_subspace = app_dir.create_or_open(db, ('logs',)) -```rust +# The key for the 'users' subspace might be b'\x15\x01', a much +# shorter prefix than packing the full path. +print(repr(users_subspace.key())) -// create a subspace from the Tuple ("tenant-1", 42) -let subspace = Subspace::from((String::from("tenant-1"), 42)); +# You can now use this subspace as before +@fdb.transactional +def set_user(tr, name): + tr[users_subspace.pack((name,))] = b'some_profile_data' -// let's print the range -println!("start: {:#04X?}\n end: {:#04X?}", subspace.range().0, subspace.range().1); +set_user(db, 'charlie') ``` -We can see observe this: - -```log -// can be verified with https://www.utf8-chartable.de/unicode-utf8-table.pl -start: [ - 0x02, - 0x74, // t - 0x65, // e - 0x6E, // n - 0x61, // a - 0x6E, // n - 0x74, // t - 0x2D, // - - 0x31, // 1 - 0x00, - 0x15, - 0x2A, // 42 - 0x00, - 0x00, // smallest possible byte -] -end: [ - 0x02, - 0x74, // t - 0x65, // e - 0x6E, // n - 0x61, // a - 0x6E, // n - 0x74, // t - 0x2D, // - - 0x31, // 1 - 0x00, - 0x15, - 0x2A, // 42 - 0x00, - 0xFF, // biggest possible byte -] -``` - -Which make sens, if we take `("tenant-1", 42)` as a prefix, then the range for this subspace will be between `("tenant-1", 42, 0x00)` and `("tenant-1", 42, 0xFF)` - -### Directory - -Now that we know our way around `Tuples` and `Subspaces`, we can now talk about what I'm working on, which is the `Directory`. Let's have a look at the relevant [documentation](https://apple.github.io/foundationdb/developer-guide.html#directories): - -> FoundationDB provides directories (available in each language binding) as a tool for managing related subspaces. - -> Directories are a recommended approach for administering applications. Each application should create or open at least one directory to manage its subspaces. - -Okay, let's see the API in Go: - -```go -subspace, err := directory.CreateOrOpen(db, []string{"application", "my-app", "tenant", "tenant-42"}, nil) -if err != nil { - log.Fatal(err) -} - -fmt.Printf("%+v\n", subspace.Bytes()) -// [21 18] -``` - -We can see that we have a shorter subspace! The `directory` allows you to generate some integer that will be bind to a path, like here `"application", "my-app", "tenant", "tenant-42"`. - -There are two advantages to this: - -* shorter keys, -* cheap metadata operations like `List` or `Move`: - -```go -// list all tenant in "application", "my-app": -tenants, err := directory.List(db, []string{"application", "my-app", "tenant"}) -if err != nil { - log.Fatal(err) -} -fmt.Printf("%+v\n", tenants) -// [tenant-42] - -// renaming 'tenant-42' in 'tenant-142' -// This will NOT move the data, only the metadata is modified -directorySubspace, err = directory.Move(db, - []string{"application", "my-app", "tenant", "tenant-42"}, // old path - []string{"application", "my-app", "tenant", "tenant-142"}) // new path -if err != nil { - log.Fatal(err) -} -fmt.Printf("%+v\n", directorySubspace.Bytes()) -// still [21 18] -``` - -The returned object is actually a `DirectorySubspace`, which implements both `Directory` and `Subspace`, which means that you can use it to recreate many directories and subspaces at will 👌 - -> If you are wondering about how this integer is generated, I recommend going through this awesome blogpost on [how high contention allocator works in FoundationDB.](https://activesphere.com/blog/2018/08/05/high-contention-allocator) \ No newline at end of file +By using these three layers—Tuples for serialization, Subspaces for namespacing, and Directories for management—you can build sophisticated and maintainable data models on top of the simple key-value interface. \ No newline at end of file diff --git a/src/develop_layer/studiable-layers.md b/src/develop_layer/studiable-layers.md index f76ea49..639c173 100644 --- a/src/develop_layer/studiable-layers.md +++ b/src/develop_layer/studiable-layers.md @@ -1,34 +1,41 @@ -# Existing open-source layers +# Learning from the Community: Open-Source Layers -The community has released a number of layers that can be studied, including: +One of the best ways to learn how to build on FoundationDB is to study existing, production-proven layers. A "layer" is simply a library or service that provides a higher-level data model on top of FoundationDB's ordered key-value store. By examining how these layers map their data models to keys and values, you can gain invaluable insights for your own projects. + +Here are some of the most prominent open-source layers developed by the FoundationDB community. -## the Record-layer +## The Record Layer + +The Record Layer provides a structured, record-oriented data store on top of FoundationDB, similar to a traditional relational database. It is used in production at Apple to power CloudKit. + +* **GitHub Repo:** [foundationdb/fdb-record-layer](https://github.com/foundationdb/fdb-record-layer) +* **Academic Paper:** [FoundationDB Record Layer](https://www.foundationdb.org/files/record-layer-paper.pdf) +* **Key Videos:** + * [FoundationDB Record Layer: Open Source Structured Storage on FoundationDB](https://youtu.be/HLE8chgw6LI) (Nicholas Schiefer, Apple) + * [Using FoundationDB and the FDB Record Layer to Build CloudKit](https://youtu.be/SvoUHHM9IKU) (Scott Gray, Apple) + +## The Document Layer + +The Document Layer implements a MongoDB®-compatible API, allowing you to store and query JSON documents within FoundationDB. -* [Github repo](https://github.com/foundationdb/fdb-record-layer) -* [Academic paper](https://www.foundationdb.org/files/record-layer-paper.pdf) -* Videos: - * [FoundationDB Record Layer: Open Source Structured Storage on FoundationDB - Nicholas Schiefer, Apple](https://youtu.be/HLE8chgw6LI) - * [Using FoundationDB and the FDB Record Layer to Build CloudKit - Scott Gray, Apple](https://youtu.be/SvoUHHM9IKU) - * [Meta-Data Caching and Record Store Scalability - Alec Grieser, Apple](https://youtu.be/_mDIhQ1HLcs) +* **GitHub Repo:** [FoundationDB/fdb-document-layer](https://github.com/FoundationDB/fdb-document-layer) +* **Key Video:** + * [FoundationDB Document Layer](https://youtu.be/KPqmB13zI9c) (Bhaskar Muppana, Apple) -## the Document Layer +## The ZooKeeper Layer -* [Github repo](https://github.com/FoundationDB/fdb-document-layer) -* Videos: - * [FoundationDB Document Layer - Bhaskar Muppana, Apple](https://youtu.be/KPqmB13zI9c) +This layer implements the Apache ZooKeeper API, providing a distributed coordination service built on FoundationDB. -## the Zookeeper Layer +* **GitHub Repo:** [pH14/fdb-zk](https://github.com/pH14/fdb-zk) +* **Key Video:** + * [A ZooKeeper Layer for FoundationDB](https://youtu.be/3FYpf1QMPgQ) (Paul Hemberger, HubSpot) -* [Github repo](https://github.com/pH14/fdb-zk) -* Forum posts: - * [Fdb-zk: rough cut of Zookeeper API layer](https://forums.foundationdb.org/t/fdb-zk-rough-cut-of-zookeeper-api-layer/1278) -* Videos: - * [A ZooKeeper Layer for FoundationDB - Paul Hemberger, HubSpot](https://youtu.be/3FYpf1QMPgQ) +## The Time-Series Layer -## TimeSeries layer +This is a high-performance layer written in Go, designed specifically for storing and querying time-series data with high compression. -* [Github repo](https://github.com/richardartoul/tsdb-layer) -* Videos: - * [Time Series and FoundationDB: Millions of Writes/s and 10x Compression in 2000 Lines of Go](https://www.youtube.com/watch?v=W6yQ9Pwgb1A) +* **GitHub Repo:** [richardartoul/tsdb-layer](https://github.com/richardartoul/tsdb-layer) +* **Key Video:** + * [Time Series and FoundationDB: Millions of Writes/s and 10x Compression in 2000 Lines of Go](https://www.youtube.com/watch?v=W6yQ9Pwgb1A) diff --git a/src/develop_layer/tips.md b/src/develop_layer/tips.md index 18902a1..5b093c7 100644 --- a/src/develop_layer/tips.md +++ b/src/develop_layer/tips.md @@ -1,80 +1,63 @@ -# Tips and tricks +# Best Practices and Pitfalls - - -Here's a few tips and tricks that should help you develop a production-ready layer. - -## About the `run` method - -Most bindings are offering a `run` method, that is taking a closure, like this in [Java](https://apple.github.io/foundationdb/javadoc/com/apple/foundationdb/Database.html#run(java.util.function.Function)). - -You should use the `run` method on your bindings, **BUT** you must add some transactionOptions to avoid blocking forever: +This chapter provides a collection of best practices, advanced techniques, and common pitfalls to help you build robust, production-ready layers on FoundationDB. -* **Timeout** Set a timeout in milliseconds which, when elapsed, will cause the transaction automatically to be cancelled. -* **RetryLimit** Set a maximum number of retries - -it is safe and legal to set these options at the first line of your `run` closure. - -## Transaction priority - -There is 3 transaction priority in FDB: + -* **Default**, -* **Batch** Specifies that this transaction should be treated as low priority and that default priority transactions will be processed first. Useful for doing batch work simultaneously with latency-sensitive work -* **SystemImmediate** Specifies that this transaction should be treated as highest priority and that lower priority transactions should block behind this one. Use is discouraged outside of low-level tools. +## Transaction Management -## Use transaction Tagging +### Use Timeouts and Retry Limits -> FoundationDB provides the ability to add arbitrary byte-string tags to transactions. The cluster can be configured to limit the rate of transactions with certain tags, either automatically in response to tags that are very busy, or manually using the throttle command in fdbcli. +Most language bindings provide a `run` or `transact` method that automatically handles the retry loop for you. However, to prevent transactions from running indefinitely, it is critical to configure two options: -More info can be found [here](https://apple.github.io/foundationdb/transaction-tagging.html). +* **Timeout:** Set a timeout in milliseconds. If the transaction takes longer than this to commit, it will be automatically cancelled. This is a crucial backstop for preventing stuck application threads. +* **Retry Limit:** Set a maximum number of retries. This prevents a transaction from retrying endlessly in the case of a persistent conflict or a live-lock scenario. -## Traces logs +These options should be set on every transaction to ensure your application remains stable under load. -By default, clients do not generate trace logs. In order to enable traces from the clients, you must enable on the database-level: +### Set Transaction Priority -* `TraceEnable` Enables trace output to a file in a directory of the clients choosing +FoundationDB supports transaction priorities to help manage workloads. -You can also enable these optional options: +* **Default:** The standard priority for most latency-sensitive, user-facing operations. +* **Batch:** A lower priority for background work, such as data cleanup or analytics. Batch priority transactions will yield to default priority transactions, ensuring that they don't interfere with your main application workload. +* **System Immediate:** The highest priority, which can block other transactions. Its use is discouraged outside of low-level administrative tools. -* `TraceFormat` Select the format of the log files. xml (the default) and json are supported. -* `TraceLogGroup` Sets the ‘LogGroup’ attribute with the specified value for all events in the trace output files. The default log group is ‘default’. +## Observability -Then, on a Transaction-level, you can set these options: +### Tag Your Transactions -* `DebugTransactionIdentifier` String identifier to be used when tracing or profiling this transaction. The identifier must not exceed 100 characters. -* `LogTransaction` Enables tracing for this transaction and logs results to the client trace logs. The DEBUG_TRANSACTION_IDENTIFIER option must be set before using this option, and client trace logging must be enabled to get log output. -* `TransactionLoggingMaxFieldLength` Sets the maximum escaped length of key and value fields to be logged to the trace file via the LOG_TRANSACTION option, after which the field will be truncated. A negative value disables truncation. +FoundationDB allows you to add a byte-string **tag** to any transaction. This is an invaluable tool for observability and performance management. You can use tags to identify different types of workloads (e.g., `user_signup`, `post_comment`). The `fdbcli` tool can then be used to monitor the rate of transactions with specific tags and even throttle them if they are causing excessive load. -## The `metadataVersion` +See the official documentation on [Transaction Tagging](https://apple.github.io/foundationdb/transaction-tagging.html) for more details. -The metadata version key `\xFF/metadataVersion` is a key intended to help layers deal with hot keys. The value of this key is sent to clients along with the read version from the proxy, so a client can read its value without communicating with a storage server. +### Enable Client Trace Logs -It is useful to implement some caching-strategy on a layer. More info on how to use the metadataVersion key can be found [here](https://forums.foundationdb.org/t/sharing-the-metadataversionkey-for-multiple-tenants/1659). +By default, clients do not generate detailed trace logs. To debug performance issues, you can enable them by setting the `TraceEnable` database option. You can then add a `DebugTransactionIdentifier` to a specific transaction and set the `LogTransaction` option to get detailed, low-level logs about its execution, including all keys and values read and written. -⚠️ In a transaction, if you update the \xff/metadataVersion key, and then attempt to read it again, I get a “Read or wrote an unreadable key” error (1036) when trying to read again. Context can be found [here](https://forums.foundationdb.org/t/cannot-commit-transaction-that-reads-the-metadataversion-key-after-changing-it/1833) +## Advanced Techniques -More info about the `metadataVersion` can be found [here](https://youtu.be/2HiIgbxtx0c). +### The `metadataVersion` Key -## The TimeKeeper +The special key `\xFF/metadataVersion` is a cluster-wide version counter that can be used to implement client-side caching. Its value is sent to clients with every read version, so reading it does not require a round-trip to a storage server. A layer can watch this key to know when to invalidate a local cache. -`Cluster Controller` actor is keeping a map of read version to system clock time, updated every second. Can be accessible by scanning the `\xFF\x02/timeKeeper/map/`. More info [here](https://forums.foundationdb.org/t/approximating-a-global-clock-for-a-watchdog-timer-using-versionstamps-readversions-or-the-timekeeper/477) +**Note:** If you write to the `metadataVersion` key, you cannot read it again in the same transaction. -## Special keys +### The `TimeKeeper` -> Keys starting with the bytes \xff\xff are called “special” keys, and they are materialized when read. \xff\xff/status/json is an example of a special key. +The `Cluster Controller` maintains a map of recent read versions to wall-clock times. This can be accessed by scanning the key range beginning with `\xFF\x02/timeKeeper/map/`. This can be useful for approximating a global clock. -More info can be found [here](https://apple.github.io/foundationdb/special-keys.html) +### Special Keys -## Caveats when using directory +Keys prefixed with `\xFF\xFF` are “special” keys that are materialized on-demand when read. The most common example is `\xFF\xFF/status/json`, which returns a JSON document containing the cluster's status. -* [Mutating a directory multiple times simultaneously in the same transaction is unsafe](https://github.com/apple/foundationdb/issues/895) -* Opening a path with multiple items will generate many read on the metadata-subspace for every transaction. This can lead to hotspotting the [Directory Layer’s metadata subspace](https://forums.foundationdb.org/t/query-hotspotting-on-directory-layers-metadata-subspace/2487). Also, developing a directory that use the [metadataVersion's key](https://github.com/apple/foundationdb/pull/1213) is not that easy: - * [Exhibit A](https://forums.foundationdb.org/t/how-to-safely-add-a-metadata-caching-layer-on-top-of-existing-layers/1809/2?u=pierrez), - * [Exhibit B](https://github.com/apple/foundationdb/issues/1415). +See the official documentation on [Special Keys](https://apple.github.io/foundationdb/special-keys.html) for more details. -* Because on how the prefix is generated, multi-cluster deployments can allocate the same [prefix multiple times](https://forums.foundationdb.org/t/redwood-engine-and-directory-layer/3084/8): +## Common Pitfalls: The Directory Layer -* Redwood Engine is a new storage engine released in FDB-7.0. It has some nice features, including native key-prefix compression. Prefix compression's performance is likely to be [similar to using the Directory](https://youtu.be/5iqKu1pVDvE?t=158). When using the Redwood storage engine the remaining benefit of the Directory becomes the ability to move/rename directories and having smaller keys in network messages (though some of these may eventually use prefix compression). +The Directory Layer is a powerful tool, but it has several sharp edges that developers must be aware of: -> I’ll also note that due to caching, the Record Layer can’t really make use of the directory layer’s renaming features (at least not without rethinking cache invalidation). I suspect that if we’d had Redwood and prefix compression when the Record Layer was being originally developed, we’d seriously have considered just relying on prefix compression instead of all of that because that would have significantly simplified cross-cluster data movement (and, if we’re honest, single cluster writes). +* **Concurrent Mutations:** Modifying the same directory (e.g., creating two different subdirectories within it) in multiple, concurrent transactions is not safe and can lead to corruption. +* **Metadata Hotspots:** Opening a directory with a long path requires one read per path element for every transaction. This can create a hotspot on the directory's internal metadata subspace. +* **Multi-Cluster Deployments:** The directory prefix allocator is not safe for multi-cluster deployments and can allocate the same prefix in different clusters, leading to data corruption if the data is ever merged. +* **Redwood and Prefix Compression:** The Redwood storage engine (new in 7.0) provides native key-prefix compression. This offers many of the same space-saving benefits as the Directory Layer without the associated complexity and caveats. For new projects, especially those using Redwood, consider whether you can use subspaces with descriptive prefixes directly instead of relying on the Directory Layer. diff --git a/src/develop_layer/transactions.md b/src/develop_layer/transactions.md index 8c88c50..fb35f7f 100644 --- a/src/develop_layer/transactions.md +++ b/src/develop_layer/transactions.md @@ -1,11 +1,66 @@ -# Transactions +# Core Concept: Transactions -## How transactions works in FoundationDB? +Transactions are the heart of FoundationDB. They are the mechanism that allows you to build complex, multi-key operations while guaranteeing the database remains consistent. FoundationDB provides some of the strongest transactional guarantees in the industry, and understanding how they work is the key to building reliable layers. -TODO +## Strictly Serializable ACID Transactions -## Avoiding conflicts +FoundationDB offers fully **ACID** (Atomicity, Consistency, Isolation, Durability) transactions. But it goes a step further by providing the strongest isolation level: **strict serializability**. This means that transactions behave as if they were executed one at a time, in some sequential order. You are completely insulated from the complexities of concurrent operations; you can write your code as if you are the only person using the database. + +## The Transactional Pattern: Read, Write, Commit + +Working with a transaction in FoundationDB follows a simple but powerful pattern: + +1. **Create a transaction object.** +2. **Read from the database.** You can read one or more keys. FoundationDB remembers which keys you've read (your *read set*). +3. **Write to the database.** You can write one or more keys. These writes are buffered locally in the transaction object (your *write set*). +4. **Commit the transaction.** The client sends the transaction (its read and write sets) to the cluster for validation. + +This is an **optimistic** concurrency model. The database *optimistically* assumes that your transaction will not conflict with others. The check only happens at the very end, during the commit phase. + +## Conflict Resolution: The Retry Loop + +What happens if another transaction commits a change to a key you *read* before your own transaction commits? This is a **conflict**. When the cluster detects a conflict, it will reject your transaction with a retryable error. + +This is not a failure; it's a core part of the design. Your application code is expected to catch this error and simply retry the entire transaction from the beginning. This is known as the **transactional retry loop**. + +Here is a pseudo-code example of this fundamental pattern: + +```python +# A simplified view of the FDB transaction pattern +def transfer_funds(db, from_acct, to_acct, amount): + # The retry loop is handled for you by the @transactional decorator + # in most language bindings. + @fdb.transactional + def _do_transfer(tr): + # 1. Read the current balances + from_balance = tr[from_acct].wait() + to_balance = tr[to_acct].wait() + + # If we read a null value, the account doesn't exist + if from_balance is None or to_balance is None: + raise Exception("Account not found") + + from_balance = int.from_bytes(from_balance, 'big') + to_balance = int.from_bytes(to_balance, 'big') + + # 2. Perform local logic + if from_balance < amount: + raise Exception("Insufficient funds") + + # 3. Write the new balances to the local transaction buffer + tr[from_acct] = (from_balance - amount).to_bytes(8, 'big') + tr[to_acct] = (to_balance + amount).to_bytes(8, 'big') + + # 4. The commit happens automatically when the function returns + + # Execute the transaction + _do_transfer(db) +``` + +If another transaction modifies `from_acct` or `to_acct` after our transaction has read them but before it has committed, the commit will fail. The `@transactional` decorator will automatically catch the error and re-run the `_do_transfer` function from the beginning. This simple, powerful loop ensures that your logic always runs on a consistent snapshot of the database. + +For a deeper dive into how to avoid conflicts and design your data models for high-contention workloads, this talk is an excellent resource: \ No newline at end of file diff --git a/src/getting_started/fdbcli.md b/src/getting_started/fdbcli.md index 762431c..4e73cc5 100644 --- a/src/getting_started/fdbcli.md +++ b/src/getting_started/fdbcli.md @@ -1,35 +1,37 @@ -# Playing with fdbcli +# The `fdbcli` Command-Line Interface -## Setting up FoundationDB with Docker +`fdbcli` is the primary tool for administering and interacting with your FoundationDB cluster. This chapter will walk you through the basics of using it to create a database and perform fundamental key-value operations. -```shell -docker run --name fdb -d -p 4500:4500 --rm foundationdb/foundationdb:6.3.23 -``` +## Your First Cluster: A Local Sandbox -## `fdbcli`? +The easiest way to get started is to run a local FoundationDB server using Docker. This command will start a container with FoundationDB, publish its port, and name it `fdb` for easy access. -`fdbcli` is a command-line interface that can be used to administrate your FDB cluster. +```shell +docker run --name fdb -d -p 4500:4500 --rm foundationdb/foundationdb:7.1.25 +``` -## Start the shell +Now, you can connect to the running container and start the CLI: ```shell -docker exec -ti fdb fdbcli +docker exec -it fdb fdbcli +``` -docker exec -ti fdb fdbcli -Using cluster file `/var/fdb/fdb.cluster'. +You'll be greeted with a welcome message and a warning: + +``` +Using cluster file '/var/fdb/fdb.cluster'. The database is unavailable; type `status' for more information. Welcome to the fdbcli. For help, type `help'. +fdb> ``` -## Useful commands - -### Status +## Creating the Database -`status` is one of the most useful command. It will display a human-readable report: +The database is unavailable because we haven't configured it yet. The `status` command provides a detailed report on the cluster's health. Let's see what it says: ```shell fdb> status @@ -43,225 +45,87 @@ no database has been created. 172.17.0.2:4500 (reachable) Unable to locate the data distributor worker. - -Unable to locate the ratekeeper worker. ``` - -### Create a new database - -As our container is brand-new, we need to create a database. +The message is clear: "no database has been created." Let's fix that. The `configure` command sets up the database with a specific redundancy mode and storage engine. For a local sandbox, `single` redundancy and the `memory` storage engine are perfect. ```shell fdb> configure new single memory Database created ``` -A few elements to notes: +**Warning:** Running `configure new` on an existing cluster will destroy all of its data. It is only safe to use on a brand-new cluster. -* running this will nuke your database, -* `single` is the [redundancy mode](https://apple.github.io/foundationdb/configuration.html#choosing-a-redundancy-mode), -* `memory` is the [storage subsystem](https://apple.github.io/foundationdb/configuration.html#configuring-the-storage-subsystem). - -Then we can run the `status` command: - -```shell -fdb> help status - -status [minimal|details|json] - -Get the status of a FoundationDB cluster. - -If the cluster is down, this command will print a diagnostic which may be useful -in figuring out what is wrong. If the cluster is running, this command will -print cluster statistics. - -Specifying `minimal' will provide a minimal description of the status of your -database. - -Specifying `details' will provide load information for individual workers. - -Specifying `json' will provide status information in a machine readable JSON -format. -``` +Now, if we check the status again, we'll see a healthy, fully-operational cluster. ```shell fdb> status -Using cluster file `/var/fdb/fdb.cluster'. - Configuration: Redundancy mode - single - Storage engine - memory-2 + Storage engine - memory Coordinators - 1 - Usable Regions - 1 Cluster: FoundationDB processes - 1 - Zones - 1 Machines - 1 - Memory availability - 8.0 GB per process on machine with least available - Fault Tolerance - 0 machines - Server time - 02/10/22 12:57:50 Data: Replication health - Healthy Moving data - 0.000 GB - Sum of key-value sizes - 0 MB - Disk space used - 105 MB -Operating space: - Storage server - 1.0 GB free on most full server - Log server - 1555.9 GB free on most full server +... +``` -Workload: - Read rate - 7 Hz - Write rate - 0 Hz - Transactions started - 3 Hz - Transactions committed - 0 Hz - Conflict rate - 0 Hz +## Reading and Writing Data -Backup and DR: - Running backups - 0 - Running DRs - 0 +Now for the fun part. `fdbcli` allows you to perform transactional reads and writes directly from the command line. Let's try to `get` a key that doesn't exist yet. -Client time: 02/10/22 12:57:50 +```shell +fdb> get mykey +`mykey' not found ``` -### Insert keys +As expected, nothing is there. Let's `set` a value. ```shell -fdb> help writemode - -writemode - -Enables or disables sets and clears. - -Setting or clearing keys from the CLI is not recommended. +fdb> set mykey "hello world" +Committed. ``` -```shell -fdb> help set - -set - -Set a value for a given key. - -If KEY is not already present in the database, it will be created. - -For information on escaping keys and values, type `help escaping'. -``` +Now, if we `get` it again, we see our value. Notice that we use quotes because our value contains a space. ```shell -# we first need to set writemode - -fdb> writemode on -fdb> set hello world -Committed (1442988082) +fdb> get mykey +`mykey' is `hello world' ``` -`1442988082` is the commitVersion or versionStamp. - -### scan keys +We can also store multiple keys and retrieve them with `getrange`. ```shell -fdb> help getrange - -getrange [ENDKEY] [LIMIT] - -Fetch key/value pairs in a range of keys. - -Displays up to LIMIT keys and values for keys between BEGINKEY (inclusive) and -ENDKEY (exclusive). If ENDKEY is omitted, then the range will include all keys -starting with BEGINKEY. LIMIT defaults to 25 if omitted. - -For information on escaping keys, type `help escaping' +fdb> set key1 value1 +Committed. +fdb> set key2 value2 +Committed. +fdb> set key3 value3 +Committed. + +fdb> getrange key1 key4 + +Range limited to 25 keys +`key1' is `value1' +`key2' is `value2' +`key3' is `value3' ``` +Finally, we can remove a key using `clear`. + ```shell -fdb> getrange \x00 \xfe 10 +fdb> clear mykey +Committed. -Range limited to 10 keys -`hello' is `world' +fdb> get mykey +`mykey' not found ``` -### Help - -```shell -fdb> help - -List of commands: - - advanceversion: - Force the cluster to recover at the specified version - begin: - begin a new transaction - clear: - clear a key from the database - clearrange: - clear a range of keys from the database - commit: - commit the current transaction - configure: - change the database configuration - consistencycheck: - permits or prevents consistency checking - coordinators: - change cluster coordinators or description - exclude: - exclude servers from the database - exit: - exit the CLI - fileconfigure: - change the database configuration from a file - force_recovery_with_data_loss: - Force the database to recover into DCID - get: - fetch the value for a given key - getrange: - fetch key/value pairs in a range of keys - getrangekeys: - fetch keys in a range of keys - getversion: - Fetch the current read version - help: - get help about a topic or command - include: - permit previously-excluded servers to rejoin the database - kill: - attempts to kill one or more processes in the cluster - lock: - lock the database with a randomly generated lockUID - maintenance: - mark a zone for maintenance - option: - enables or disables an option - profile: - namespace for all the profiling-related commands. - reset: - reset the current transaction - rollback: - rolls back the current transaction - set: - set a value for a given key - setclass: - change the class of a process - sleep: - sleep for a period of time - status: - get the status of a FoundationDB cluster - suspend: - attempts to suspend one or more processes in the cluster - throttle: - view and control throttled tags - triggerddteaminfolog: - trigger the data distributor teams logging - unlock: - unlock the database with the provided lockUID - writemode: - enables or disables sets and clears - -For information on a specific command, type `help '. -For information on escaping keys and values, type `help escaping'. -For information on available options, type `help options'. -``` +This is just a small sample of what `fdbcli` can do. You can use the `help` command to see a full list of commands or `help ` for details on a specific one. diff --git a/src/getting_started/installation.md b/src/getting_started/installation.md new file mode 100644 index 0000000..75805ec --- /dev/null +++ b/src/getting_started/installation.md @@ -0,0 +1,46 @@ +# Installation + + + +This chapter guides you through setting up FoundationDB. While the official documentation provides detailed, platform-specific instructions, this guide will help you understand the components and make the right choices for your setup. + +* [Official Docs: Getting Started on Linux](https://apple.github.io/foundationdb/getting-started-linux.html) +* [Official Docs: Getting Started on macOS](https://apple.github.io/foundationdb/getting-started-mac.html) + +## Do I Need the Client or the Server? + +The first step is to decide which package you need from the [Downloads page](https://apple.github.io/foundationdb/downloads.html). + +### The Server Package + +Install the **server** package if you want to run a FoundationDB database cluster on a machine. This is for you if you are: + +* Setting up a new development environment on your local machine. +* Provisioning a server to be part of a production cluster. + +This package contains the core `fdbserver` binary, which runs the database, and `fdbmonitor`, which manages the server process. + +### The Client Package + +Install the **client** package if a machine only needs to connect to an *existing* FoundationDB cluster. This is for you if you are: + +* Building an application that uses a FoundationDB binding (e.g., in Python, Go, or Java). +* Using command-line tools like `fdbcli` to administer a remote cluster. + +This package provides the necessary C libraries (`libfdb_c.so`) that all language bindings depend on, as well as several administrative tools. + +## The Cluster File: Your Key to the Cluster + +Whether you install the client or the server, you will get a **cluster file** (e.g., `/etc/foundationdb/fdb.cluster`). This small text file is critically important: + +> The cluster file contains the IP addresses and ports of the coordination servers. It's how any client or server finds and connects to the database. + +To connect to a cluster, your client machine must have a copy of that cluster's `fdb.cluster` file. When you set up a new server, one is created for you. When you set up a client to talk to an existing cluster, you must copy the file from the cluster to your client machine. + +## ⚠️ A Critical Note on Versioning + +FoundationDB enforces strict compatibility between client and server versions. This is a common source of confusion for new users. + +**The Rule:** The client and server **must** have the same major and minor version numbers. For example, a client with version `7.1.x` can only talk to a server with version `7.1.y`. It **cannot** talk to a server running `7.2.z` or `6.3.w`. + +If you mix versions, your application will fail to connect, often by hanging indefinitely. The server logs will show `ConnectionRejected` errors with the reason `IncompatibleProtocolVersion`. Always ensure your client machines and server cluster are running compatible versions. \ No newline at end of file diff --git a/src/getting_started/installing-fdb.md b/src/getting_started/installing-fdb.md deleted file mode 100644 index ea715a5..0000000 --- a/src/getting_started/installing-fdb.md +++ /dev/null @@ -1,52 +0,0 @@ -# Installing FoundationDB - - - -The official documentation has plenty of docs on how to install FoundationDB: - -* [Getting Started on Linux](https://apple.github.io/foundationdb/getting-started-linux.html) -* [Getting Started on macOS](https://apple.github.io/foundationdb/getting-started-mac.html) - -## Client or server? - -In the [Downloads page](https://apple.github.io/foundationdb/downloads.html), you will find reference to two archives: - -* clients -* server - -### Clients - -The clients package is required by all bindings(i.e. programming libraries). These are needed files for all bindings: - -* `/usr/lib/libfdb_c.so` -* `/usr/include/foundationdb/fdb_c.h` -* `/usr/include/foundationdb/fdb_c_options.g.h` - -You will also find different binaries: - -* dr_agent -* fdbbackup -* fdbcli -* fdbdr -* fdbrestore - -### Server - -The server package is holding FDB's binaries: - -* fdbmonitor -* fdbserver - -And a default configuration file for `fdbmonitor` located at `/etc/foundationdb/foundationdb.conf`. `fdbserver` is the main binary, and `fdbmonitor` is simply a wrapper on top of `fdbserver`. - -## The cluster file - -Both packages will install a default [cluster file](https://apple.github.io/foundationdb/administration.html#cluster-files): - -> FoundationDB servers and clients use a cluster file (usually named ``fdb.cluster``) to connect to a cluster. The contents of the cluster file are the same for all processes that connect to the cluster. An ``fdb.cluster`` file is created automatically when you install a FoundationDB server and updated automatically when you [change coordination servers](https://apple.github.io/foundationdb/configuration.html#configuration-choosing-coordination-servers). To connect to a cluster from a client machine, you will need access to a copy of the cluster file used by the servers in the cluster. - -## ⚠️ Wire protocol - -FoundationDB's wire protocol is not compatible between minors releases, i.e. client version 6.2 will **not** be able to communicate with 6.3.X, 6.1.X, and all versions different from 6.3.X. - -On the bindings-side, the client will be [stalling](https://forums.foundationdb.org/t/detecting-a-version-mismatch/3055/2). On the server's logs, you would see events like `ConnectionRejected` with a reason `IncompatibleProtocolVersion`. \ No newline at end of file diff --git a/src/internals/roles.md b/src/internals/roles.md new file mode 100644 index 0000000..b949772 --- /dev/null +++ b/src/internals/roles.md @@ -0,0 +1,46 @@ +# Anatomy of a Cluster: Roles + + + +FoundationDB's architecture is built on a collection of specialized, stateless roles. This separation of concerns is a key reason for its high performance, scalability, and fault tolerance. A running `fdbserver` process can dynamically take on any of these roles as needed. Understanding them is the first step to understanding how FoundationDB works. + +Here are the core roles in a FoundationDB cluster: + +## The Coordinator + +The **Coordinator** is the first process that any client or server connects to when joining the cluster. Its primary job is to manage the *cluster file*, a small, durable text file that contains the IP addresses and ports of the coordinators themselves. The coordinators elect a **Cluster Controller**, which serves as the singleton brain of the cluster. + +## The Cluster Controller + +The **Cluster Controller** is the authoritative monitor for the entire cluster. There is only one active Cluster Controller at any time. It is responsible for: + +* Monitoring the health of all other `fdbserver` processes. +* Recruiting new processes to take on roles as needed (e.g., if a Log Server fails). +* Orchestrating recovery when a process fails. + +## The Proxy + +The **Proxy** (specifically, the Commit Proxy) is the front door for all transactions. When a client commits a transaction, it sends its read and write sets to a Proxy. The Proxy is responsible for: + +1. Assigning a **Read Version** to incoming transactions. +2. Sending the transaction to the Resolver to check for conflicts. +3. If the transaction is valid, sending it to the Log Servers to be made durable. +4. Reporting the final commit status back to the client. + +Because proxies are stateless, you can add more of them to the cluster to increase transaction throughput. + +## The Resolver + +The **Resolver** is the component that enforces serializability. During the commit process, the Proxy sends the transaction's read set to the Resolver. The Resolver checks if any of the keys in the read set have been modified by another transaction that has committed since the current transaction's read version was assigned. If a conflict is found, the transaction is rejected, and the client must retry. + +## The Log Server + +The **Log Server** is the heart of FoundationDB's durability guarantee. It implements the transaction log. When a transaction is ready to be committed, its mutations are sent to the Log Servers, which write them to durable storage (typically an SSD) before the commit is acknowledged to the client. The Log Servers do not need to apply the changes to the main data store; they just need to record them. + +## The Storage Server + +The **Storage Server** is responsible for storing the data. Each Storage Server holds a set of key ranges (shards). It serves read requests from clients and receives committed mutations from the Log Servers, applying them to its in-memory B-tree and eventually writing them to disk. Storage Servers are the workhorses of the cluster, and you can add more of them to increase both storage capacity and I/O performance. + +## The Data Distributor + +The **Data Distributor** is a background role responsible for ensuring that data is evenly distributed and replicated across all of the Storage Servers. It monitors the size and workload of each shard and will automatically move data between servers to prevent hotspots and ensure fault tolerance. It is also responsible for initiating data replication and healing the cluster after a Storage Server fails. diff --git a/src/internals/the-read-path.md b/src/internals/the-read-path.md new file mode 100644 index 0000000..18232a3 --- /dev/null +++ b/src/internals/the-read-path.md @@ -0,0 +1,34 @@ +# The Read Path + + + +FoundationDB's read path is designed to be highly scalable and efficient. Unlike the write path, which is coordinated through a central set of proxies, the read path is almost entirely decentralized. This allows the cluster to serve a massive number of concurrent reads without creating bottlenecks. + +Here is a high-level overview of how a read operation works: + +```mermaid +sequenceDiagram + participant Client + participant Proxy + participant Storage Server + + Client->>+Proxy: 1. Get Read Version + Proxy-->>-Client: read_version + Client->>Client: 2. Locate Storage Server (from cache) + Client->>+Storage Server: 3. Read(key, read_version) + Storage Server-->>-Client: Value +``` + +### The Steps of a Read + +1. **Get Read Version:** When a client begins a transaction, the first thing it does is request a **Read Version** from a **Proxy**. This version is a timestamp that represents a consistent, immutable snapshot of the entire database. All reads within the transaction will be served from this snapshot, which is the foundation of FoundationDB's snapshot isolation. + +2. **Locate the Storage Server:** The client library maintains a local cache that maps key ranges to the **Storage Servers** responsible for them. When the client needs to read a key, it uses this cache to determine which Storage Server to contact. This lookup is extremely fast and does not require a network round-trip. + +3. **Read from the Storage Server:** The client connects directly to the appropriate Storage Server and requests the value for the key at the transaction's Read Version. The Storage Server uses its in-memory B-tree and on-disk data files to find the correct version of the value and return it to the client. + +### Key Takeaways + +* **Decentralized and Scalable:** Because clients read directly from Storage Servers, read throughput can be scaled horizontally simply by adding more Storage Servers to the cluster. +* **Snapshot Isolation:** The use of a Read Version ensures that a transaction sees a perfectly consistent view of the database, even as other transactions are being committed concurrently. Your reads are never "dirty." +* **Low Latency:** By caching the mapping of keys to Storage Servers, the client can avoid extra network hops and read data with very low latency. diff --git a/src/internals/the-write-path.md b/src/internals/the-write-path.md new file mode 100644 index 0000000..7e3ea91 --- /dev/null +++ b/src/internals/the-write-path.md @@ -0,0 +1,47 @@ +# The Write Path + + + +Understanding the life of a write transaction is key to understanding how FoundationDB provides its powerful guarantees of strict serializability and durability. The process involves a carefully choreographed dance between several cluster roles. + +Let's walk through the journey of a transaction from the moment a client calls `commit()` to the point where it is safely stored in the database. + +```mermaid +sequenceDiagram + participant Client + participant Proxy + participant Resolver + participant Log Server + participant Storage Server + + Client->>+Proxy: commit(read_set, write_set) + Proxy->>Proxy: 1. Get Read Version + Proxy->>+Resolver: 2. Resolve(read_version, read_set) + Resolver-->>-Proxy: OK (no conflicts) + Proxy->>+Log Server: 3. Log(commit_version, write_set) + Log Server-->>-Proxy: 4. Durable + Proxy-->>-Client: 5. Commit Successful + loop Later + Storage Server->>+Log Server: Pull mutations + Log Server-->>-Storage Server: Mutations + Storage Server->>Storage Server: Apply to B-Tree + end +``` + +### The Steps of a Commit + +1. **Get Read Version:** When the transaction is ready to be committed, the client library sends it to a **Proxy**. The first thing the Proxy does is request a **Read Version** from the Cluster Controller. This version number establishes the logical point in time at which the transaction's reads occurred. + +2. **Conflict Resolution:** The Proxy sends the transaction's Read Version and its *read set* (the list of keys it read) to the **Resolver**. The Resolver checks if any of the keys in the read set have been written to by another transaction that committed *after* this transaction's Read Version. If a conflict is detected, the Resolver tells the Proxy to reject the transaction, and the client must retry. + +3. **Logging for Durability:** If the Resolver finds no conflicts, the Proxy assigns the transaction a **Commit Version** (which will be higher than its Read Version) and sends the transaction's *write set* (the keys and values to be written) to the **Log Servers**. + +4. **Durable Commit:** The Log Servers write the transaction's mutations to their durable, on-disk transaction logs. Once a quorum of Log Servers has confirmed that the data is safely on disk, they respond to the Proxy. At this point, the transaction is considered **durable**. Even if the entire cluster lost power, the transaction would be recovered. + +5. **Success!** The Proxy, having received confirmation from the Log Servers, reports back to the client that the commit was successful. + +### The Final Step: Data Storage + +Notice that the **Storage Servers** were not involved in the critical commit path. This is a key design decision that makes commits extremely fast. + +After the transaction is durable, the Storage Servers will eventually pull the new data from the Log Servers and apply the mutations to their own on-disk B-trees. This process happens asynchronously in the background and does not block new incoming transactions. This separation of the transaction log from the primary data store is a pattern known as Command Query Responsibility Segregation (CQRS) and is fundamental to FoundationDB's performance. diff --git a/src/meet_fdb/another_db.md b/src/meet_fdb/another_db.md index 9e33e48..d1bc429 100644 --- a/src/meet_fdb/another_db.md +++ b/src/meet_fdb/another_db.md @@ -1,132 +1,120 @@ -# Yet another database? +# Yet Another Database? -> Another database? There is so many of them 😑 +> "Another database? There are so many of them!" -You are right, we are living in the golden age of data, and as such, we have many options to store our data. +You're right. We live in a golden age of data, which means we have a dizzying number of options for storing it. -According to the [Database of Databases](https://dbdb.io/), you can choose between **795 database management systems**. Yes, you read it right, almost 800 different databases to compare 😱 +According to the [Database of Databases](https://dbdb.io/), there are nearly **800 different database management systems** to choose from. This abundance of choice can be overwhelming. -## How to choose a database? +## How Are Databases Different? -If you click on the [Browse button](https://dbdb.io/browse), you will be able to go through the different criteria: +When choosing a database, engineers evaluate them against a long list of criteria. This leads to a wide variety of **specialized datastores**, each optimized for a different purpose. Key differentiators include: -* is it open-source or not? -* is it an embedded/single-node/distributed database? -* What is the query language? -* is it suited for [OTLP](https://en.wikipedia.org/wiki/Online_transaction_processing) or [OLAP](https://en.wikipedia.org/wiki/Online_analytical_processing) workloads? -* What is the data model? -* What is the scalability limits? -* Is it transactional? -* Is there indexes? -* Is it a row-oriented storage, or maybe columnar? -* is there stored procedures and materialized views? -* and so on. +* **Data Model:** Is it a document, column-family, key-value, graph, or relational database? +* **Workload:** Is it designed for transactional (OLTP) or analytical (OLAP) workloads? +* **Architecture:** Is it embedded, single-node, or distributed? +* **Transactions:** Does it support ACID transactions? +* **Query Language:** Does it use SQL or a proprietary language? +* **Scalability:** What are its performance limits? +* **Licensing:** Is it open-source? +* **Features:** Does it offer secondary indexes, stored procedures, or materialized views? -That's a lot of criteria! But that means we also have a big number of possible combinations, each one creating a potential new database. This is why we have **specialized datastores** like: +## The Rise of Polyglot Persistence -* document -* column -* key-value -* graph -* indexing -* messaging -* and others! +This variety of specialized datastores leads developers to use multiple databases in a single application—a pattern sometimes called "polyglot persistence." -## Stateless applications +For example, a project might require both a relational database for transactional data and a dedicated search index for text search. -This variety of specialized datastore is allowing developers to have multiple datastores as **requirements** for a single infrastructure. +Because managing state is complex, a common architectural pattern is to build **stateless applications** that delegate the complexity of data storage to these specialized, stateful databases. -> Alice: "Hey Bob, what do you need for your next project?" -> -> Bob: "Mmmmh, I will need an SQL database and a messaging systems 🤔". - -Storing data is hard, so we are using them as **durable** strongholds for our data, while making our applications mostly **stateless**. - -So, Bob's architecture looks like this: +A typical architecture might look like this: ```mermaid flowchart TD - id1(Bob's stateless applications) - id2(datastore 1) - id3(datastore 2) - - id1 -- uses --> id2 - id1 -- uses --> id3 + app("Stateless Application") + db1("Datastore 1 (e.g., PostgreSQL)") + db2("Datastore 2 (e.g., Elasticsearch)") + + app -- uses --> db1 + app -- uses --> db2 ``` -## Datastore's architecture +## The Anatomy of a Datastore -We said earlier that there is a lot of criteria to choose a database, but we can narrow to 3: +While databases differ in many ways, we can simplify their architecture into three core components: -* What is the **Query language**, -* What is the **data model**, -* What is the **Storage Engine**. +* The **Query Language:** The interface for interacting with the data. +* The **Data Model:** The way the data is structured and presented (e.g., relational, document). +* The **Storage Engine:** The underlying component responsible for durably storing and retrieving data on disk. -For example, PostgreSQL is exposing relational data through the SQL language and storing files on a single node. +For example, PostgreSQL exposes a relational data model via the SQL query language, all running on a storage engine designed for a single node. -Let's update the flowchart: +Let's update our diagram to show this breakdown: ```mermaid flowchart TD - id1(Bob's stateless applications) - ql1(Query Language 1) - ql2(Query Language 2) - dm1(Data Model 1) - dm2(Data Model 2) - se1(Storage Engine 1) - se2(Storage Engine 2) - - id1 -- uses --> ql1 - ql1 -- to access data as --> dm1 - dm1 -- which are stored in --> se1 - id1 -- uses --> ql2 - ql2 -- to access data as --> dm2 - dm2 -- which are stored in --> se2 + subgraph "System Architecture" + app("Stateless Application") + + subgraph "Datastore 1" + ql1("Query Language 1") --> dm1("Data Model 1") --> se1("Storage Engine 1") + end + + subgraph "Datastore 2" + ql2("Query Language 2") --> dm2("Data Model 2") --> se2("Storage Engine 2") + end + + app -- "uses" --> ql1 + app -- "uses" --> ql2 + end ``` -Document databases, column-oriented, row-oriented, JSON, key-value, etc. all make sense in the right context, and often different parts of an application call for different choices. That means we cannot mutualize the Query language and the data models. What's left is the **storage engine**. +While the query language and data model are often specialized for a particular use case, what if we could consolidate the storage layer? -## Sharing the storage engine 🤔 +## The Case for a Shared Storage Engine -Let's mutualize the storage Engine! +Let's imagine we could use a single, powerful storage engine for all our data models. ```mermaid flowchart TD - id1(Bob's stateless applications) - ql1(Query Language 1) - ql2(Query Language 2) - dm1(Data Model 1) - dm2(Data Model 2) - se(Shared storage Engine) - - id1 -- uses --> ql1 - ql1 -- to access data as --> dm1 - dm1 -- which are stored in --> se - id1 -- uses --> ql2 - ql2 -- to access data as --> dm2 - dm2 -- which are stored in --> se -``` + subgraph "System Architecture" + app("Stateless Application") + + subgraph "Stateless Data Layer 1" + direction LR + ql1("Query Language 1") --> dm1("Data Model 1") + end + + subgraph "Stateless Data Layer 2" + direction LR + ql2("Query Language 2") --> dm2("Data Model 2") + end -This design has some great advantages: + se("Shared Storage Engine") -* the storage engine *only need to focus* on storage, -* any components above the storage Engine **has become stateless**. + app -- "uses" --> ql1 + app -- "uses" --> ql2 + + dm1 -- "stores data in" --> se + dm2 -- "stores data in" --> se + end +``` -That could work, but if we have put a lot of contraints on the storage engine, let's talk about what we require. +This design has powerful advantages: -## Requirements for such a storage engine +* **Operational Simplicity:** We only need to manage, scale, and back up one underlying storage system. +* **Stateless Components:** The layers that provide the data models and query languages can themselves become stateless, simplifying their development and deployment. -So, we know have a storage engines that needs to handle multiple types of data. To be cloud-Native, the storage engine needs to be: +However, this places a heavy burden on the shared storage engine. What would it need to provide? -* fault tolerant, -* scalable, -* highly available. +## Requirements for a Universal Storage Engine -while providing: +To be a viable foundation for modern, cloud-native applications, such an engine must be: -* strong semantics, -* flexible data models. +* **Scalable, Fault-Tolerant, and Highly Available:** It must handle growing workloads and survive hardware failures without downtime. +* **Transactional:** It must provide strong consistency guarantees (like ACID) to allow developers to reason about their data correctly. +* **Unopinionated:** It should impose a minimal, flexible data model (like an ordered key-value store) to support various data structures on top. -Now the question is: **can we design such a storage engine?** The answer is **yes**, and it is called FoundationDB. \ No newline at end of file +Can such a storage engine exist? **Yes.** It's called FoundationDB. \ No newline at end of file diff --git a/src/meet_fdb/correctess.md b/src/meet_fdb/correctess.md deleted file mode 100644 index 8f1142b..0000000 --- a/src/meet_fdb/correctess.md +++ /dev/null @@ -1,187 +0,0 @@ -# Correct and robust, choose both - - - -FoundationDB is famously known in the distributed system's community for the simulation framework they developed, unmatched in the industry. In this section, we will go through why FoundationDB is one of the most robust distributed database available. - -## Jepsen - -First, let's introduce [Jepsen](http://jepsen.io/): - -> Jepsen is an effort to improve the safety of distributed databases, queues, consensus systems, etc. We maintain an open source software library for systems testing, as well as blog posts and conference talks exploring particular systems’ failure modes. In each analysis we explore whether the system lives up to its documentation’s claims, file new bugs, and suggest recommendations for operators. - -> Jepsen pushes vendors to make accurate claims and test their software rigorously, helps users choose databases and queues that fit their needs, and teaches engineers how to evaluate distributed systems correctness for themselves. - -Jepsen is now a standard for testing databases, but there is no FoundationDB analysis: - -![aphyr](img/aphyr.png) - -## Flow programming language - -Let's dive in the testing part of FoundationDB, starting with Flow. let's quote the [Engineering page](https://apple.github.io/foundationdb/engineering.html): - -> FoundationDB began with ambitious goals for both high performance per node and scalability. We knew that to achieve these goals we would face serious engineering challenges that would require tool breakthroughs. We’d need efficient asynchronous communicating processes like in Erlang or the Async in .NET, but we’d also need the raw speed, I/O efficiency, and control of C++. To meet these challenges, we developed several new tools, the most important of which is **Flow**, a new programming language that brings actor-based concurrency to C++11. - -Flow is more of a **stateful distributed system framework** than an asynchronous library. It takes a number of highly opinionated stances on how the overall distributed system should be written, and isn’t trying to be a widely reusable building block. - -> Flow adds about 10 keywords to C++11 and is technically a trans-compiler: the Flow compiler reads Flow code and compiles it down to raw C++11, which is then compiled to a native binary with a traditional toolchain. - -Flow was developed before FDB, as stated in this [2013's post](https://news.ycombinator.com/item?id=5319163): - -> FoundationDB founder here. Flow sounds crazy. What hubris to think that you need a new programming language for your project? Three years later: Best decision we ever made. - -> We knew this was going to be a long project so we invested heavily in tools at the beginning. The first two weeks of FoundationDB were building this new programming language to give us the speed of C++ with high level tools for actor-model concurrency. But, the real magic is how Flow enables us to use our real code to do deterministic simulations of a cluster in a single thread. We have a white paper upcoming on this. - -> We've had quite a bit of interest in Flow over the years and I've given several talks on it at meetups/conferences. We've always thought about open-sourcing it... It's not as elegant as some other actor-model languages like Scala or Erlang (see: C++) but it's nice and fast at run-time and really helps productivity vs. writing callbacks, etc. - -> (Fun fact: We've only ever found two bugs in Flow. After the first, we decided that we never wanted a bug again in our programming language. So, we built a program in Python that generates random Flow code and independently-executes it to validate Flow's behavior. This fuzz tester found one more bug, and we've never found another.) - -A very good overview of Flow is available [here](https://apple.github.io/foundationdb/flow.html) and some details [here](https://forums.foundationdb.org/t/why-was-flow-developed/1711/3). - -## Simulation-Driven development - -One of Flow’s most important job is enabling **Simulation**: - -> We wanted FoundationDB to survive failures of machines, networks, disks, clocks, racks, data centers, file systems, etc., so we created a simulation framework closely tied to Flow. By replacing physical interfaces with shims, replacing the main epoll-based run loop with a time-based simulation, and running multiple logical processes as concurrent Flow Actors, Simulation is able to conduct a deterministic simulation of an entire FoundationDB cluster within a single-thread! Even better, we are able to execute this simulation in a deterministic way, enabling us to reproduce problems and add instrumentation ex post facto. This incredible capability enabled us to build FoundationDB exclusively in simulation for the first 18 months and ensure exceptional fault tolerance long before it sent its first real network packet. For a database with as strong a contract as the FoundationDB, testing is crucial, and over the years we have run the equivalent of a trillion CPU-hours of simulated stress testing. - -A good overview of the simulation can be found [here](https://apple.github.io/foundationdb/testing.html). - -You can also have a look at those two awesome talk: - - - - - -Simulation has been made possible by combining: - -* Single-threaded pseudo-concurrency, -* Simulated implementation of all external communication, -* determinism. - -Here's an example of a [testfile](https://github.com/apple/foundationdb/blob/master/tests/slow/SwizzledCycleTest.toml): - -```toml -[[test]] -testTitle = 'SwizzledCycleTest' - - # Goal of the test - [[test.workload]] - testName = 'Cycle' - transactionsPerSecond = 5000.0 - testDuration = 30.0 - expectedRate = 0.01 - - # What will be done concurrently to prevent the goal - - # random clogging - [[test.workload]] - testName = 'RandomClogging' - testDuration = 30.0 - swizzle = 1 - - # reboot machines - [[test.workload]] - testName = 'Attrition' - machinesToKill = 10 - machinesToLeave = 3 - reboot = true - testDuration = 30.0 - - [[test.workload]] - testName = 'Attrition' - machinesToKill = 10 - machinesToLeave = 3 - reboot = true - testDuration = 30.0 - - # Change configuration to trigger a coordination changes - [[test.workload]] - testName = 'ChangeConfig' - maxDelayBeforeChange = 30.0 - coordinators = 'auto' -``` - -The test is splitted into two parts: - -* **The goal**, for example doing transaction pointing to another with thousands of transactions per sec and there should be only 0.01% of success. -* **What will be done to try to prevent the test to succeed**. In this example it will **at the same time**: - - * do random clogging. Which means that **network connections will be stopped** (preventing actors to send and receive packets). Swizzle flag means that a subset of network connections will be stopped and bring back in reverse order, 😳 - * will **poweroff/reboot machines** (attritions) pseudo-randomly while keeping a minimal of three machines, 🤯 - * **change configuration**, which means a coordination changes through multi-paxos for the whole cluster. 😱 - -Keep in mind that all these failures will appears **at the same time!** Do you think that your current **datastore has gone through the same test on a daily basis?** [I think not](https://github.com/etcd-io/etcd/pull/11308). - -Everything is seed-driven, which means that if a faulty seed is found, you can reproduce the bug locally. For example, if the seed `1094093328` is in error, you can just run `./fdbserver -r simulation -f ./SwizzledCycleTest.toml -b on -s 1094093328`: - - - -Everything is deterministic, from the errors, how much time RPCs will take, and even which actor is scheduled. - -## Cooperating with the simulation framework with Buggify - -Having a simulation framework is not enough, and one important aspects of the simulation is called Buggify. It is well explained in [this blogpost](https://transactional.blog/simulation/buggify.html): - -> A deterministic simulation framework with random fault injection provides a testing framework that can find bugs. However, the question is how quickly? If validating the correctness of a network protocol or storage engine, then network or disk fault injection alone would be sufficient to give a high degree of confidence in correctness. The types of dangerous conditions that the code must correctly handle, such as network instability or disk corruption, exactly match what the simulator directly produces. -> -> How FoundationDB does this is with the BUGGIFY macro. BUGGIFY exists to bias the simulator towards doing dangerous, bug-finding things. It is the main tool that differentiates FDB’s simulation testing from other black box solutions. Instead of writing FoundationDB and then trying to validated it against a separate blackbox testing solution afterwards, FoundationDB was written to explicitly cooperate with the simulator by instrumenting its code with descriptions of how to cause failures in each component of the system. -> -> BUGGIFY has the following rules: -> -> * BUGGIFY only ever evaluates to true when run in simulation. -> -> * The first time each BUGGIFY use is evaluated, it is either enabled or disabled for the entire simulation run. -> -> * Enabled uses of BUGGIFY have a 25% chance of evaluating to true (or custom, e.g. BUGGIFY_WITH_PROB(0.001) == 0.1% chance). - -`BUGGIFY` is allowing FDB developers to inject [deterministic delays](https://github.com/apple/foundationdb/blob/07e531947765696c3d0e80703967dd5da420fb28/fdbserver/TLogServer.actor.cpp#L1252-L1256): - -```cpp - if (!self->spillOrder.size()) { - wait(delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL - : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, - TaskPriority::UpdateStorage)); - return Void(); - } - -``` - -Or failures like [bitsFlip](https://github.com/apple/foundationdb/blob/main/fdbrpc/FlowTransport.actor.cpp#L1021-L1027) on RPC calls: - -```cpp -if (g_network->isSimulated() && - g_network->now() - g_simulator.lastConnectionFailure > g_simulator.connectionFailuresDisableDuration && - BUGGIFY_WITH_PROB(0.0001)) { - g_simulator.lastConnectionFailure = g_network->now(); - isBuggifyEnabled = true; - TraceEvent(SevInfo, "BitsFlip").log(); - int flipBits = 32 - (int)floor(log2(deterministicRandom()->randomUInt32())); - - uint32_t firstFlipByteLocation = deterministicRandom()->randomUInt32() % packetLen; -``` - - -## Implications for CI - -Having such a powerful testing environment means that every FDB developer can concentrate on the code, and CI will try to brute-force your code. This has two major consequences: - -* Typically run 100k simulation tests for each PR before reviewing(500 cores for about two hours) -* Critical code require millions of correctness tests. - - -## TL;DR - -> "Simulation’s success has surpassed our expectation and has been vital to our engineering team." -> -> "It seems unlikely that we would have been able to build FoundationDB without this technology." - -We can also quote [Will Wilson](https://youtu.be/fFSPwJFXVlw): - -> The reason why people write tests is because human beings are astonishingly bad at thinking through all the possible branches of control flow that a program could take. -> -> But that very fact means that we're unable to write tests to cover all the things that we actually need to cover, like because if we could, if we were smart enough, or had the right kind of brain to write all the tests we needed to write, then we would have just written the code correctly in the first place. -> -> I think this is like really scary and really true, and the implications is that tests can be useful for turning up regressions, but almost completely useless for telling you about unknown unknowns. -> -> I think this is the real secret sauce behind what FoundationDB did, it's not so much the deterministic simulation although that was a very important part of it. It was that whenever we had a new piece of functionality, we didn't say "how can I write some tests to cover this?". It was more like "how can I write a system that will forever be generating new and interesting tests?" - diff --git a/src/meet_fdb/correctness.md b/src/meet_fdb/correctness.md new file mode 100644 index 0000000..c107fa4 --- /dev/null +++ b/src/meet_fdb/correctness.md @@ -0,0 +1,57 @@ +# A Culture of Correctness + + + +Distributed systems are notoriously difficult to build correctly. The number of possible states, race conditions, and failure modes is astronomical. While tools like [Jepsen](http://jepsen.io/) have become an industry standard for validating the claims of distributed databases, FoundationDB's approach to correctness goes much deeper. It is built on a philosophy of **simulation-driven development** that is unmatched in the industry. + +This chapter explores the layers of this testing strategy, which has allowed FoundationDB to achieve its legendary stability. + +## The Foundation: Flow + +The story of FoundationDB's correctness begins with its programming language: **Flow**. Developed in the first weeks of the project, Flow is a C++ extension that brings actor-based concurrency to the language. As the engineering team explains: + +> We’d need efficient asynchronous communicating processes like in Erlang... but we’d also need the raw speed, I/O efficiency, and control of C++. To meet these challenges, we developed... Flow, a new programming language that brings actor-based concurrency to C++11. + +Flow isn't just a convenience; it's the critical enabler for the entire testing strategy. By controlling the scheduling of actors and abstracting away I/O, Flow makes it possible to run a deterministic simulation of an entire FoundationDB cluster in a single thread. + +## The Engine: Deterministic Simulation + +This leads to the crown jewel of the testing suite: the simulation framework. For the first 18 months of its development, FoundationDB never sent a single packet over a real network. It was built and tested entirely in simulation. + +How does it work? + +* **Single-Threaded Execution:** The entire cluster—every logical process, client, and server—runs as a set of actors within a single OS thread. +* **Simulated Interfaces:** All external communication, including network, disk, and time, is replaced with a deterministic, in-memory simulation. The network is not reliable; it can be partitioned, delayed, and reordered by the simulator. +* **Perfect Reproducibility:** Because the simulation is single-threaded and the inputs are controlled by a random seed, any test run is perfectly deterministic. If a test fails with a specific seed, a developer can reproduce the *exact* sequence of events that led to the failure, down to the scheduling of individual actors. + +This allows for a level of testing that is impossible with traditional methods. The team has run the equivalent of a **trillion CPU-hours** of simulated stress testing, exploring state spaces that would be impossible to cover in the real world. + + + +## The Fuel: Generative Testing and Buggify + +Deterministic simulation is necessary, but not sufficient. As one engineer put it: + +> The reason why people write tests is because human beings are astonishingly bad at thinking through all the possible branches of control flow... that very fact means that we're unable to write tests to cover all the things that we actually need to cover. + +Instead of trying to write specific tests for every scenario, the FoundationDB team built a system for **generating new and interesting tests**. A test in FoundationDB is not a simple unit test; it's a combination of a **workload** (the goal) and a set of **chaos agents** (things trying to break the goal). + +For example, a test might specify a workload of 5,000 transactions per second while simultaneously: + +* **Clogging the network:** Randomly stopping and reordering network packets. +* **Killing machines:** Randomly rebooting virtual servers. +* **Changing the configuration:** Forcing the cluster to re-elect its coordinators. + +To make this even more powerful, developers use a macro called `BUGGIFY`. This macro allows them to explicitly cooperate with the simulator by instrumenting the code with potential failure points. For example, a developer can wrap a piece of code in `BUGGIFY` to tell the simulator, "This is an interesting place to inject a 10-second delay, but only 1% of the time." + +This allows the simulation to explore not just external failures (like network partitions) but also internal, heisenbug-like conditions in a controlled and deterministic way. + +## The Result: Confidence at Scale + +The implications of this approach are profound: + +* **CI as a Brute-Force Weapon:** Every pull request is subjected to hundreds of thousands of simulation tests, running on hundreds of cores for hours, before a human even begins a code review. +* **Focus on Invention, Not Regression:** Developers can focus on building new features, confident that the CI system will relentlessly probe their code for correctness against a chaotic world of failures. + +As the original team said, "It seems unlikely that we would have been able to build FoundationDB without this technology." It is this deep, foundational commitment to correctness that makes FoundationDB one of the most robust and trustworthy databases in the world. + diff --git a/src/meet_fdb/enter_fdb.md b/src/meet_fdb/enter_fdb.md index c984296..6c31326 100644 --- a/src/meet_fdb/enter_fdb.md +++ b/src/meet_fdb/enter_fdb.md @@ -2,82 +2,80 @@ -Let's start by quoting the [official overview](https://apple.github.io/foundationdb/): +As the [official overview](https://apple.github.io/foundationdb/) puts it: > FoundationDB is a distributed database designed to handle large volumes of structured data across clusters of commodity servers. It organizes data as an ordered key-value store and employs ACID transactions for all operations. It is especially well-suited for read/write workloads but also has excellent performance for write-intensive workloads. -FoundationDB is an open-source(Apache V2), distributed key-value store written in Flow, an async-first language that targets C++ specially developed for the database. +Let's unpack what makes FoundationDB unique. -Like all key-value stores, it looks like an infinite dictionary/map from programming, allowing you to store key and values in bytes. +## The Core: An Ordered, Transactional Key-Value Store -Keys are lexicographic order, which means: +At its heart, FoundationDB is a distributed, open-source (Apache 2.0) key-value store. Think of it as a massive, sorted dictionary where both keys and values are simple byte strings. -* ``'0'`` is sorted before ``'1'`` -* ``'apple'`` is sorted before ``'banana'`` -* ``'apple'`` is sorted before ``'apple123'`` -* keys starting with ``'mytable\'`` are sorted together (e.g. ``'mytable\row1'``, ``'mytable\row2'``, ...) +The keys are stored in lexicographical order, which means you can efficiently scan ranges of keys. This simple but powerful feature is the basis for building complex data models. For example: -One huge advantage of FDB is that it supports **multi-key strictly serializable transactions**. Let's break-down the words in reverse order: +* `'user:alice'` comes before `'user:bob'` +* `'user:bob'` comes before `'user:bob:profile'` +* All keys prefixed with `'table1:'` are grouped together, allowing you to simulate rows in a table. -* **transactions**: All reads and writes in FoundationDB are accomplished using transactions. These transactions are fully ACID (Atomic, Consistent, Isolated, and Durable) and span across multiple machines with high performance. -* **Serializable**: this means that the outcome of concurrent transaction is equal to a serial execution. -* **Strictly**: transactions are strictly ordered -* **Multi-key**: you can write across regions/shards +### Multi-Key ACID Transactions -The full list of features is available [here](https://apple.github.io/foundationdb/features.html), and you will also finds a [anti-features list](https://apple.github.io/foundationdb/anti-features.html). +The most important feature of FoundationDB is its support for **multi-key, strictly serializable transactions**. This is a rare and powerful guarantee for a distributed database. -## FoundationDB as a database +* **Transactions:** All operations, including reads and writes, are performed within a transaction. These transactions are fully ACID (Atomic, Consistent, Isolated, and Durable), even across multiple machines. +* **Multi-Key:** A single transaction can read and write multiple, unrelated keys, no matter where they are stored in the cluster. +* **Strictly Serializable:** This is the strongest isolation level. It ensures that the result of concurrent transactions is equivalent to them running one at a time in some sequential order. This makes writing correct applications dramatically simpler, as you are protected from a wide range of subtle race conditions. -FoundationDB provides amazing performance on commodity hardware, allowing you to support very heavy loads at low cost. The official [performance page](https://apple.github.io/foundationdb/performance.html) is giving us some insights: +You can find a full list of [features](https://apple.github.io/foundationdb/features.html) and, just as importantly, [anti-features](https://apple.github.io/foundationdb/anti-features.html) in the official documentation. -![fdb_perf](https://apple.github.io/foundationdb/_images/scaling.png) +## The Powerhouse: Performance and Reliability -> Here, a cluster of commodity hardware scales to 8.2 million operations/sec doing a 90% read and 10% write workload with 16 byte keys and values between 8 and 100 bytes. +FoundationDB is not just a theoretical model; it's a battle-tested engine built for performance and reliability on commodity hardware. -You can expect sub-millisecond performance for small reads, without any tuning. +* **Performance:** It delivers linear scalability and high performance, often achieving millions of operations per second on a cluster. You can expect sub-millisecond latencies for many workloads without any special tuning. +* **Reliability:** It is designed to be fault-tolerant, easy to manage, and simple to grow. Its reliability is backed by an unmatched testing system based on a **deterministic simulation engine**, which we will explore later in this book. -Beside having huge performance, FoundationDB is easy to **install, grow, manage and fault tolerant**. It has been running in production for years in companies like Apple or Snowflake. Backing FoundationDB up is an unmatched testing system based on a **deterministic simulation engine** that will be described later in the book. +![FoundationDB Performance Scaling](https://apple.github.io/foundationdb/_images/scaling.png) +> A cluster of commodity hardware scaling to 8.2 million operations/sec on a 90% read, 10% write workload. -## FoundationDB as a database-framework +## The Ecosystem: A Foundation for Layers -For developers, FoundationDB can be seen as a **database-framework**: it decouples its data storage technology from its data model, allowing you to write **"layers"**: stateless applications that will use FoundationDB as their storage. +Because FoundationDB provides such a powerful and reliable core, it can serve as a universal storage engine—a foundation for building other data models. These are called **"layers."** +A layer is a stateless component that maps a high-level data model (like a document, graph, or relational model) to FoundationDB's simple key-value model. ```mermaid -flowchart TD - fdb(FoundationDB) - l1(Layer 1) - l2(Layer 2) - ln(Layer N) - - l1 -- uses --> fdb - l2 -- uses --> fdb - ln -- uses --> fdb +graph TD + subgraph "Your Application" + l1("Document Layer") + l2("Graph Layer") + l3("Queue Layer") + end + + subgraph "FoundationDB Cluster" + fdb[("Ordered Key-Value Store
ACID Transactions")] + end + + l1 -- "stores data in" --> fdb + l2 -- "stores data in" --> fdb + l3 -- "stores data in" --> fdb ``` -Each layer can expose high level data models. They can be developed as libraries or stateless services. And because of FDB performance, they are easy to scale. +This architecture decouples the data model from data storage, allowing developers to focus on building features without reinventing the complexities of a distributed database. -## A brief history of FoundationDB +## A Brief History -FoundationDB started first as a company in 2009. The FoundationDB Alpha program began in January 2012 and concluded on March 4, 2013 with their public Beta release. +FoundationDB began as a company in 2009. After a successful beta program, version 1.0 was released in 2013. Apple acquired the company in 2015 and subsequently open-sourced the project under the Apache 2.0 license in 2018, making it available to the wider community. -Their 1.0 version was released for general availability on August 20, 2013. On March 24, 2015 it was reported that Apple has acquired the company. +## Who Uses FoundationDB? -On April 19, 2018, Apple open sourced the software, releasing it under the Apache 2.0 license. - -## Who is using FoundationDB? - -Many companies are using FDB, including: - -* **Apple iCloud**: they are the largest users. Billions of logical databases are stored in FDB (one per user per application). You will find more details about this on the Record-layer chapter. -* **Snowflake** is storing all their metadatas in FDB, -* **VMWare** Tanzu (Formerly Wavefront), -* **IBM** (Apache CouchDB), -* **eBay**, -* **Epic Games**, -* ... +FoundationDB is the storage engine behind critical systems at major technology companies, including: +* **Apple:** A massive-scale deployment for iCloud, where it stores billions of logical databases. +* **Snowflake:** Stores all metadata for its cloud data platform. +* **VMware:** Used in the Tanzu observability platform. +* And many others, including **IBM**, **eBay**, and **Epic Games**. ## TL;DR -FoundationDB is a scalable, robust, distributed key-value store that you can use as a framework to write your own ~~database~~ layer 🤯 \ No newline at end of file +FoundationDB is a scalable, distributed key-value store with strictly serializable ACID transactions. It's so powerful and reliable that it serves as a universal foundation for building any data model you can imagine. \ No newline at end of file diff --git a/src/meet_fdb/everything_is_kv.md b/src/meet_fdb/everything_is_kv.md index 356f548..2b8c099 100644 --- a/src/meet_fdb/everything_is_kv.md +++ b/src/meet_fdb/everything_is_kv.md @@ -1,99 +1,52 @@ -# Everything is a key-value! +# Modeling on a Key-Value Store -> Ok, so I could store a table-like data in a key-value? +> "If FoundationDB is just a key-value store, how can it power complex applications?" -Yes, this is exactly the idea. It may seem weird at first, but it is a common pattern among new databases. In this section, we will briefly go over some examples. +This is the crucial question. The answer lies in a powerful, common architectural pattern: building rich data models on top of a simple, ordered key-value core. By defining a specific way to encode data structures into keys and values, you can represent almost anything, from relational tables to complex documents and graphs. -## Yugabyte +This is not a new or niche idea. Many of the most successful and scalable modern databases are built using this exact layered architecture. Let's look at a few examples. -From the official [documentation](https://docs.yugabyte.com/latest/architecture/layered-architecture/): +## The Pattern: A Tale of Two Layers -> YugabyteDB architecture follows a layered design. It is comprised of 2 logical layers as shown in the diagram below: -> -> * Yugabyte Query Layer -> -> * DocDB distributed document store +Most modern databases can be conceptually divided into two layers: -Information about key encoding format can be found [here](https://github.com/YugaByte/yugabyte-db/wiki/Low-level-DocDB-key-encoding-format) and [here](https://youtu.be/DAFQcYXK2-o?t=1523). +1. **The Storage Layer:** A low-level engine, often a key-value store, responsible for the distributed storage, replication, and transactional integrity of data. +2. **The Data Model Layer:** A higher-level component that exposes a rich data model (e.g., SQL, Document, Graph) and translates queries into operations on the underlying storage layer. -## F1 and Spanner from Google +This separation of concerns allows each layer to do what it does best. -There is a lot of contents about Google's datastores and their evolution. +## Industry Examples -At first, they built [Megastore](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36971.pdf) on top of BigTable: +This layered pattern appears again and again in the architecture of leading databases. -> In brief, it provides fully serializable ACID semantics over distant replicas with low enough latencies to support interactive applications. -> -> We use Google’s Bigtable for scalable fault-tolerant storage within a single datacenter, allowing us to support arbitrary read and write throughput by spreading operations across multiple rows. +### SQL on Key-Value: Google, CockroachDB, and TiDB -Then, the first version of [Spanner](https://www.usenix.org/system/files/conference/osdi12/osdi12-final-16.pdf) appeared: +Several of the most prominent distributed SQL databases are built on a key-value core. -> Spanner is a scalable, globally-distributed database designed, built, and deployed at Google. -> -> Spanner has evolved from a Bigtable-like versioned key-value store into a temporal multi-version database. Data is stored in schematized semi-relational tables; data is versioned, and each version is automatically timestamped with its commit time +* **Google's Spanner and F1:** Google's database journey shows a clear evolution. [Megastore](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36971.pdf) provided ACID semantics on top of the Bigtable key-value store. This evolved into [Spanner](https://www.usenix.org/system/files/conference/osdi12/osdi12-final-16.pdf), which started as a key-value store and grew into a full-fledged [relational database](https://storage.googleapis.com/pub-tools-public-publication-data/pdf/acac3b090a577348a7106d09c051c493298ccb1d.pdf). The key insight is that the SQL data model is a layer on top of a scalable, transactional key-value foundation. -Then they added [F1](https://storage.googleapis.com/pub-tools-public-publication-data/pdf/41344.pdf) on top of Spanner: +* **CockroachDB:** As described in their architecture documentation, CockroachDB maps all [SQL table and index data](https://www.cockroachlabs.com/blog/sql-in-cockroachdb-mapping-table-data-to-key-value-storage/) directly into its underlying monolithic sorted key-value map. -> F1 is a fault-tolerant globally-distributed OLTP and OLAP database built at Google as the new storage system for Google’s AdWords system. It was designed to replace a sharded MySQL implementation that was not able to meet our growing scalability and reliability requirements. -> -> F1 is built on top of Spanner +* **TiDB:** The TiDB ecosystem explicitly separates its components. [TiDB](https://www.vldb.org/pvldb/vol13/p3072-huang.pdf) is the SQL computation layer, while **TiKV** is the distributed, transactional key-value storage layer. Each SQL row is mapped to a key-value pair in TiKV. -And finally, [Spanner itself became an SQL system](https://storage.googleapis.com/pub-tools-public-publication-data/pdf/acac3b090a577348a7106d09c051c493298ccb1d.pdf): +### Multi-Model Databases: Cosmos DB and YugabyteDB -> Google’s Spanner started out as a key-value store offering multi-row transactions, external consistency, and transparent failover across datacenters. Over the past 7 years it has evolved into a relational database system +Other databases use this pattern to support multiple data models on a single, unified backend. -It seems it is still organized as layers: +* **Azure Cosmos DB:** Microsoft's global-scale database projects multiple data models (Document, Graph, Key-Value) over a [minimalist core data model](http://muratbuffalo.blogspot.com/2018/08/azure-cosmos-db.html). The storage engine itself is agnostic to whether it's storing a document or a graph node. -> Like the lower storage and transactional layers, +* **YugabyteDB:** Follows a similar [layered design](https://docs.yugabyte.com/latest/architecture/layered-architecture/), with a query layer that supports both SQL and Cassandra APIs on top of **DocDB**, its underlying distributed document store, which itself functions as a key-value store. -## TiDB / TiKV +## The Unbundled Database: FoundationDB -In the titanium stack, TiKV is the storage layer for [TiDB](https://www.vldb.org/pvldb/vol13/p3072-huang.pdf): +All these examples point to a powerful conclusion: many modern databases are, internally, a specialized data model layer tightly bundled with a general-purpose key-value storage engine. -> It has three core components: a distributed storage layer, a Placement Driver(PD), and a computation engine layer. -> -> The distributed storage layer consists of a row store (TiKV) and a columnar store (TiFlash). Logically, the data stored in TiKV is an ordered key-value map. Each tuple is mapped into a key-value pair. The key is composed of its table ID and row ID, and the value is the actual row data, where the table ID and row ID are unique integers +FoundationDB's philosophy is to **unbundle** these two layers. It provides *only* the core storage engine, but it makes that engine more powerful and generic than any of the bundled equivalents. It gives you: -The table codec can be found [here](https://github.com/pingcap/tidb/tree/master/tablecodec). +* An ordered key-value store. +* Strictly serializable ACID transactions. +* Exceptional performance and proven reliability. -## CosmosDB - - -More details can be found [here](http://muratbuffalo.blogspot.com/2018/08/azure-cosmos-db.html): - -> Cosmos DB supports and projects multiple data models (documents, graphs, key-value, table, etc.) over a minimalist type system and core data model: the atom-record-sequence (ARS) model. -> -> Container and item resources are further projected as reified resource types for a specific type of API interface. For example, while using document-oriented APIs, container and item resources are projected as collection and document resources respectively. - -The automatic indexing strategy is described [here](http://www.vldb.org/pvldb/vol8/p1668-shukla.pdf). - -## CockroachDB - -This old [blogpost](https://www.cockroachlabs.com/blog/sql-in-cockroachdb-mapping-table-data-to-key-value-storage/) describes the idea: - -> The CockroachDB SQL system is built on top of the internal CockroachDB key-value store and leverages the monolithic sorted key-value map to store all of the SQL table data and indexes. - - -The encoding notes can be found [here](https://github.com/cockroachdb/cockroach/blob/master/docs/tech-notes/encoding.md). - -## Dgraph - -From the official [documentation](https://dgraph.io/docs/dgraph-overview/#dgraph-architecture): - -> Dgraph is a single layer in your tech stack, but inside the inner workings of a Dgraph database instance, there are three distinct entities: -> -> * Badger - Dgraph’s custom-built key-value store -> * Ristretto - Dgraph’s custom-built cache -> * Dgraph - the methods and algorithms used to parse DQL (and now GraphQL) and act accordingly - -## Examples from layers - -### Record-Layer - -The [Record-layer](https://foundationdb.github.io/fdb-record-layer/) is using Protobuf to store data. [A single Protobuf message is written across multiple rows](https://github.com/FoundationDB/fdb-record-layer/blob/1715c4dc2dd5565f292003a1f45d87fe14b32ca7/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/SplitHelper.java#L117). It can be encrypted and compressed. - -### Document-layer - -The [Document-layer](https://github.com/FoundationDB/fdb-document-layer) is writing a document [across multiple keys](https://github.com/FoundationDB/fdb-document-layer/blob/2250bfb6d3c5bd5007bca39ed92b872f2b0dc4b2/src/ExtUtil.actor.cpp#L77) +This frees you, the developer, to build *any* data model layer you can imagine. You get the power of a world-class distributed storage engine without being locked into a specific, high-level data model. FoundationDB is the ultimate realization of the layered database architecture. diff --git a/src/the-record-layer/quick.md b/src/the-record-layer/quick.md new file mode 100644 index 0000000..ae65ec1 --- /dev/null +++ b/src/the-record-layer/quick.md @@ -0,0 +1 @@ +# QuiCK diff --git a/src/the-record-layer/what-is-record-layer.md b/src/the-record-layer/what-is-record-layer.md new file mode 100644 index 0000000..568d348 --- /dev/null +++ b/src/the-record-layer/what-is-record-layer.md @@ -0,0 +1 @@ +# What is the Record-layer? diff --git a/src/welcome.md b/src/welcome.md index b319258..aeacaea 100644 --- a/src/welcome.md +++ b/src/welcome.md @@ -1,18 +1,19 @@ ![FDB_logo.png](FDB_logo.png) -# The FoundationDB's Book 📖 +# The FoundationDB Book 📖 -Welcome to FoundationDB's Book! If you are looking to start writing your own datastore with FoundationDB, or if you are curious about FDB, you have come to the right place. +Welcome! Whether you're building your own datastore with FoundationDB or are simply curious about its capabilities, you've come to the right place. -🚧 This is currently a work-in-progress, feel free to [contribute](https://github.com/PierreZ/fdb-book) chapters! 🚧 +🚧 This book is a work in progress. We welcome contributions! Feel free to [open a pull request](https://github.com/PierreZ/fdb-book) with your improvements. 🚧 -## What This Book Covers +## Who This Book Is For -This book aims to be a comprehensive, up-to-date guide to use FoundationDB features and libraries, appropriate for beginners and old hands alike. +This book is for developers, architects, and database enthusiasts who want to understand and leverage FoundationDB. It serves as a comprehensive guide, linking to the [official documentation](https://apple.github.io/foundationdb/) and the [community forum](https://forums.foundationdb.org/) when deeper dives are needed. -It is designed as **an entrypoint towards the FoundationDB's community**, with links pointing to the [official documentation](https://apple.github.io/foundationdb/) or the [forum](https://forums.foundationdb.org/). +## What This Book Covers -* The early chapters provide an introduction to FoundationDB, why it may interest you, and what is a layer. -* The Record-Layer chapter is focusing on the Record-Layer, a Java library to write layers made by Apple. -* The middle chapters discuss key utilities and features provided by the bindings, and describe best-practises to write layers to maximize performance and scalability. -* The last chapters are describing core elements of FoundationDB, such as read/write path, or the simulation framework. \ No newline at end of file +* **Meet FoundationDB:** We start with an introduction to FoundationDB, exploring what makes it unique and why it might be the right choice for your project. +* **Getting Started:** A hands-on guide to installing FoundationDB and interacting with it using the `fdbcli` command-line tool. +* **Developing a Layer:** Learn the core concepts of building on FDB, from ACID transactions to best practices for data modeling and key design. We'll also look at existing open-source layers for inspiration. +* **FoundationDB's Internals:** Delve into the architecture of FoundationDB, including its read/write paths and powerful simulation framework. +* **The Record Layer:** A dedicated look at the Record Layer, a Java library from Apple for building complex data models on FoundationDB. \ No newline at end of file From 94df5a802ca71d76899d701276f46a7098884918 Mon Sep 17 00:00:00 2001 From: Pierre Zemb Date: Wed, 25 Jun 2025 18:06:14 +0200 Subject: [PATCH 3/4] add record-layer --- README.md | 8 +++ src/getting_started/installation.md | 8 ++- src/internals/roles.md | 22 +++++-- src/internals/the-write-path.md | 4 +- src/meet_fdb/another_db.md | 2 - src/the-record-layer/quick.md | 51 ++++++++++++++- src/the-record-layer/what-is-record-layer.md | 68 +++++++++++++++++++- 7 files changed, 147 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 45aa9f5..549418b 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,11 @@ # The FoundationDB Book This repo is holding the source code of `The FoundationDB Book`. Fairly early stage for now. + +## TODOs + +* https://github.com/apple/foundationdb/wiki/Best-practice-of-storing-structured-data,-for-example-JSON-objects +* https://github.com/apple/foundationdb/wiki/Data-Distribution-and-Movement +* https://github.com/apple/foundationdb/wiki/Difference-between-snapshot-reads-and-regular-reads +* https://github.com/apple/foundationdb/wiki/Guidelines-for-Choosing-Coordinators +* https://github.com/apple/foundationdb/wiki/Upgrading-FoundationDB \ No newline at end of file diff --git a/src/getting_started/installation.md b/src/getting_started/installation.md index 75805ec..dab45b6 100644 --- a/src/getting_started/installation.md +++ b/src/getting_started/installation.md @@ -39,8 +39,10 @@ To connect to a cluster, your client machine must have a copy of that cluster's ## ⚠️ A Critical Note on Versioning -FoundationDB enforces strict compatibility between client and server versions. This is a common source of confusion for new users. +FoundationDB enforces strict compatibility between the client library and server processes. This is a common source of confusion for new users. -**The Rule:** The client and server **must** have the same major and minor version numbers. For example, a client with version `7.1.x` can only talk to a server with version `7.1.y`. It **cannot** talk to a server running `7.2.z` or `6.3.w`. +**The Rule:** The installed client library (`libfdb_c`) and the server binaries (`fdbserver`) **must** have the same major and minor version numbers. For example, a client with version `7.1.x` can only talk to a server with version `7.1.y`. It **cannot** talk to a server running `7.2.z` or `6.3.w`. -If you mix versions, your application will fail to connect, often by hanging indefinitely. The server logs will show `ConnectionRejected` errors with the reason `IncompatibleProtocolVersion`. Always ensure your client machines and server cluster are running compatible versions. \ No newline at end of file +However, you can connect to a cluster with an older version by specifying the API version in your client code. For example, if your client machine has the `7.4.x` libraries installed, you can still connect to a `7.3.z` cluster by calling `fdb.select_api_version(730)` before connecting. This mechanism is particularly useful for facilitating rolling upgrades, allowing clients to be upgraded before the servers. + +If you mix incompatible versions without selecting a compatible API version, your application will likely fail to connect, often by hanging indefinitely. The server logs may show `ConnectionRejected` errors with the reason `IncompatibleProtocolVersion`. It's crucial to ensure your client machines and server cluster are running compatible versions, or that you are using `select_api_version` correctly during an upgrade. \ No newline at end of file diff --git a/src/internals/roles.md b/src/internals/roles.md index b949772..aeb8b96 100644 --- a/src/internals/roles.md +++ b/src/internals/roles.md @@ -18,20 +18,28 @@ The **Cluster Controller** is the authoritative monitor for the entire cluster. * Recruiting new processes to take on roles as needed (e.g., if a Log Server fails). * Orchestrating recovery when a process fails. -## The Proxy +## The Proxies -The **Proxy** (specifically, the Commit Proxy) is the front door for all transactions. When a client commits a transaction, it sends its read and write sets to a Proxy. The Proxy is responsible for: +FoundationDB splits the traditional role of a proxy into two distinct components: the **GRV Proxy** and the **Commit Proxy**. This separation allows for better scaling and specialization. Clients first interact with a GRV proxy to start a transaction and then with a Commit Proxy to commit it. -1. Assigning a **Read Version** to incoming transactions. -2. Sending the transaction to the Resolver to check for conflicts. -3. If the transaction is valid, sending it to the Log Servers to be made durable. +### The GRV Proxy + +The **GRV Proxy** (Get Read Version Proxy) is responsible for one critical task: providing a **Read Version** to a client when it begins a transaction. To do this, the GRV proxy communicates with the Master to get the latest committed version from the transaction system. This ensures that the transaction gets a consistent snapshot view of the database. The `Ratekeeper` process can apply backpressure by slowing down the rate at which GRV Proxies issue read versions, which helps manage cluster load. + +### The Commit Proxy + +The **Commit Proxy** is the front door for all transaction *commits*. When a client commits a transaction, it sends its read and write sets to a Commit Proxy. The Commit Proxy orchestrates the second half of the transaction lifecycle: + +1. Getting a **Commit Version** from the Master. +2. Sending the transaction's read and write sets to the **Resolver** to check for conflicts. +3. If the transaction is valid, sending its mutations to the **Log Servers** to be made durable. 4. Reporting the final commit status back to the client. -Because proxies are stateless, you can add more of them to the cluster to increase transaction throughput. +Because both proxy types are stateless, you can add more of them to the cluster to increase both the number of transactions that can be started and the overall commit throughput. ## The Resolver -The **Resolver** is the component that enforces serializability. During the commit process, the Proxy sends the transaction's read set to the Resolver. The Resolver checks if any of the keys in the read set have been modified by another transaction that has committed since the current transaction's read version was assigned. If a conflict is found, the transaction is rejected, and the client must retry. +The **Resolver** is the component that enforces serializability. During the commit process, the Commit Proxy sends the transaction's read and write sets to the Resolver. The Resolver checks if any of the keys in the read set have been modified by another transaction that has committed since the current transaction's read version was assigned. If a conflict is found, the transaction is rejected, and the client must retry. ## The Log Server diff --git a/src/internals/the-write-path.md b/src/internals/the-write-path.md index 7e3ea91..a15ccfd 100644 --- a/src/internals/the-write-path.md +++ b/src/internals/the-write-path.md @@ -16,7 +16,7 @@ sequenceDiagram Client->>+Proxy: commit(read_set, write_set) Proxy->>Proxy: 1. Get Read Version - Proxy->>+Resolver: 2. Resolve(read_version, read_set) + Proxy->>+Resolver: 2. Resolve(read_version, read_set, write_set) Resolver-->>-Proxy: OK (no conflicts) Proxy->>+Log Server: 3. Log(commit_version, write_set) Log Server-->>-Proxy: 4. Durable @@ -32,7 +32,7 @@ sequenceDiagram 1. **Get Read Version:** When the transaction is ready to be committed, the client library sends it to a **Proxy**. The first thing the Proxy does is request a **Read Version** from the Cluster Controller. This version number establishes the logical point in time at which the transaction's reads occurred. -2. **Conflict Resolution:** The Proxy sends the transaction's Read Version and its *read set* (the list of keys it read) to the **Resolver**. The Resolver checks if any of the keys in the read set have been written to by another transaction that committed *after* this transaction's Read Version. If a conflict is detected, the Resolver tells the Proxy to reject the transaction, and the client must retry. +2. **Conflict Resolution:** The Proxy sends the transaction's Read Version, its *read set* (the list of keys it read), and its *write set* to the **Resolver**. The Resolver checks if any of the keys in the read set have been written to by another transaction that committed *after* this transaction's Read Version. If a conflict is detected, the Resolver tells the Proxy to reject the transaction, and the client must retry. 3. **Logging for Durability:** If the Resolver finds no conflicts, the Proxy assigns the transaction a **Commit Version** (which will be higher than its Read Version) and sends the transaction's *write set* (the keys and values to be written) to the **Log Servers**. diff --git a/src/meet_fdb/another_db.md b/src/meet_fdb/another_db.md index d1bc429..c96b8c9 100644 --- a/src/meet_fdb/another_db.md +++ b/src/meet_fdb/another_db.md @@ -83,12 +83,10 @@ flowchart TD app("Stateless Application") subgraph "Stateless Data Layer 1" - direction LR ql1("Query Language 1") --> dm1("Data Model 1") end subgraph "Stateless Data Layer 2" - direction LR ql2("Query Language 2") --> dm2("Data Model 2") end diff --git a/src/the-record-layer/quick.md b/src/the-record-layer/quick.md index ae65ec1..4420c45 100644 --- a/src/the-record-layer/quick.md +++ b/src/the-record-layer/quick.md @@ -1 +1,50 @@ -# QuiCK +# QuiCK: A Queuing System in CloudKit + +QuiCK is a distributed, transactional queuing system developed for and integrated into Apple's CloudKit. It is built on top of FoundationDB and the Record Layer. Its primary purpose is to reliably manage deferred, asynchronous tasks that are generated by CloudKit operations, such as updating search indexes, sending push notifications, or performing data compaction. + +## The Challenge: Queuing at CloudKit Scale + +CloudKit needed a way to manage a massive volume of asynchronous tasks without using a separate, external queuing system. Using an external system like Kafka or RabbitMQ presented several major challenges: + +1. **No Transactionality**: It's impossible to have a single atomic transaction that spans both CloudKit's database (FDB) and an external queue. For example, if a user shares a Keynote document, the system must both update access permissions in the database and enqueue a task to send a push notification. Without a transactional queue, the database update could succeed while the task enqueue fails, leaving collaborators unaware of the share. + +2. **Data Migration**: CloudKit frequently moves user data between FDB clusters for load balancing. If a user's tasks were in a separate system, this would create a coordination nightmare. For example, if a user deletes a folder in iCloud Drive and their database is then moved to a new datacenter, their queued deletion task could be left behind, unable to find the data it's supposed to act on. + +3. **Tenancy Mismatch**: CloudKit has a fine-grained tenancy model with billions of logical databases (one for each user of each app). Traditional queuing systems are designed for thousands of topics, not billions. Mapping the CloudKit model to a traditional queue would be impossible. + +4. **Operational Complexity**: An external system would be another massive, stateful service to provision, monitor, and operate alongside the hundreds of FDB clusters that power CloudKit. + +To solve these issues, the team built QuiCK directly into CloudKit, storing queued tasks right alongside the user data they pertain to. + +## Core Design and Technical Features + +QuiCK's design overcomes the traditional concerns of building a queue on a database (like hotspots and consumer contention) through several key innovations. + +### Two-Level Sharding + +QuiCK avoids hotspots by sharding at an extreme scale: + +* **Level 1: Queue Zones**: The primary level of sharding consists of tens of billions of individual queues, called **Queue Zones**. Each tenant (a user of a CloudKit app) gets their own queue within their logical database. This means one tenant's activity can never create a hotspot that affects another. + +* **Level 2: Cluster Queues**: To help consumers find work efficiently, a second, higher-level queue exists on each FDB cluster. When a task is first enqueued into a tenant's previously empty Queue Zone, the same transaction also adds a **pointer** to that zone into the higher-level Cluster Queue. Consumers poll the Cluster Queue to find these pointers, which efficiently leads them to tenants with work to be done. + +### Fault-Tolerant Leases via Vesting Time + +To prevent multiple consumers from processing the same item, QuiCK uses a clever, fault-tolerant leasing mechanism. Instead of locking or immediately deleting an item, a consumer takes a lease by updating the item's **vesting time** to some point in the future (e.g., 5 minutes from now). This makes the item invisible to other consumers for the duration of the lease. If the consumer processes the item successfully, it deletes it. If the consumer crashes, the lease simply expires, and the item automatically becomes visible again for another consumer to pick up. + +### Polling for Fairness and Efficiency + +Given the massive number of queues, a push-based model is not feasible. Instead, QuiCK uses a polling-based model where a shared pool of consumers asks for work when they have capacity. This allows QuiCK to implement scheduling and fairness policies, deciding which queue to service next based on tenant priority or resource usage, preventing a single user from starving others. + +### Leveraging FoundationDB and the Record Layer + +QuiCK is a powerful example of building a complex subsystem on top of the FDB/Record Layer stack: + +* **Transactional Integrity**: Enqueuing a task and adding a pointer to the cluster queue are atomic operations within a standard FDB transaction. +* **Exactly-Once Semantics**: For tasks that only modify the database (no external side effects), QuiCK can achieve exactly-once semantics by processing the task and deleting it from the queue within a single transaction. +* **Indexed Queues**: The Record Layer's secondary indexes are used to order items within a Queue Zone by priority and vesting time, so consumers always process the most important item first. + +## Further Reading + +* **Paper**: [QuiCK: A Queuing System in CloudKit](https://www.foundationdb.org/files/QuiCK.pdf) (SIGMOD '21) +* **Video**: [QuiCK: A Queuing System in CloudKit](https://www.youtube.com/watch?v=I9mNENkZT90) (SIGMOD '21) diff --git a/src/the-record-layer/what-is-record-layer.md b/src/the-record-layer/what-is-record-layer.md index 568d348..47a4be9 100644 --- a/src/the-record-layer/what-is-record-layer.md +++ b/src/the-record-layer/what-is-record-layer.md @@ -1 +1,67 @@ -# What is the Record-layer? +# What is the Record Layer? + +The FoundationDB Record Layer is an open-source library that provides a record-oriented data store with semantics similar to a relational database, implemented on top of FoundationDB. Think of it as a "middle layer" that provides common database-like features, making it easier to build complex, scalable applications on FDB. + +It was created to solve the common and difficult challenges that arise when building a structured data layer on top of a key-value store, such as schema management, indexing, and query execution, especially in a multi-tenant environment. + +## Core Design Principles + +The Record Layer is built around a few core principles: + +* **Structured, Schematized Data**: It stores structured records using Google's [Protocol Buffers](https://developers.google.com/protocol-buffers). This provides a robust way to define a schema and evolve it over time. + +* **Stateless Architecture**: The layer itself is completely stateless. All state is stored in FoundationDB or returned to the client (e.g., as a `continuation`). This simplifies scaling and operation, as any server can handle any request. + +* **Streaming Queries**: The Record Layer is designed for a streaming model. For example, it only supports ordered queries (like SQL's `ORDER BY`) if there is an index that can produce the data in the requested order. This avoids large, stateful in-memory operations and makes performance predictable, favoring fast OLTP workloads over analytical OLAP queries. + +* **Extensibility**: The layer is highly extensible. Clients can define their own custom index types, index maintainers, and query planner rules, allowing them to tailor the database's behavior to their specific needs. + +## The "Record Store": A Logical Database + +A key abstraction in the Record Layer is the **Record Store**. A Record Store is a logical, self-contained database that holds all of a tenant's records, indexes, and metadata. This entire logical database is stored within a single, contiguous key-space in FoundationDB, called a **subspace**. + +This design is a perfect fit for multi-tenant applications. For example, Apple's CloudKit uses this model to provide a distinct logical database for every application on every user's device—billions of independent databases in total. Because a Record Store is just a range of keys, it can be easily moved between FDB clusters for load balancing. + +## Key Technical Features + +The Record Layer abstracts away several complex engineering problems by leveraging FoundationDB's core features. + +### Key Expressions for Flexible Indexing + +Indexes are defined using **key expressions**, which are functions that specify how to extract data from a record to form an index key. Key expressions can be simple (e.g., a single field's value) or complex. They can: + +* **Concatenate** multiple fields together. +* **Fan out** by creating multiple index entries from a single record, such as indexing each element in a repeated field (a list). +* **Nest** to index fields within a sub-record. + +This provides a powerful and flexible way to create indexes on highly structured, nested data. + +### Online Index Building + +Building an index on a large, live dataset is a hard problem. The Record Layer's online indexer handles this gracefully. When a new index is added, it transitions through several states: + +1. **Write-only**: The index is maintained for all new and updated records, but it cannot yet be used for queries. +2. **Building**: A background process scans all existing records in batches, adding their corresponding entries to the index. This process is transactional, fault-tolerant, and resumable. +3. **Readable**: Once the background build is complete, the index is marked as readable and can be used by the query planner. + +### Advanced Index Types + +The Record Layer includes several powerful, built-in index types: + +* **VALUE**: A standard index that maps the value from a key expression to the record's primary key. +* **ATOMIC**: An index that uses FoundationDB's atomic mutations to maintain aggregate statistics without transaction conflicts. This is used for `SUM`, `COUNT`, `MAX`, and `MIN` indexes. +* **VERSION**: An index on the commit version of a record. This creates a conflict-free, totally-ordered change log, which is ideal for synchronization. The version is a unique 12-byte value: 10 bytes from the FDB commit version and 2 bytes from a transaction-local counter in the Record Layer. +* **RANK**: An index that can answer questions like, "what is the Nth record in a given order?" or "what is the rank of this specific record?" +* **TEXT**: A full-text index for searching for words or phrases within a text field. + +### Query Continuations for Resource Control + +To prevent any single request from consuming too many resources, all long-running operations are pausable. When a query hits a predefined limit (e.g., number of records scanned or time elapsed), it stops and returns the results it has found so far, along with an opaque **continuation**. + +This continuation captures the exact state of the query. The client can pass it back in a new request to resume the query exactly where it left off. This makes the system highly scalable and resilient, as it allows for fine-grained control over resource usage. + +## Further Reading + +* **Paper**: [FoundationDB Record Layer: A Multi-Tenant Structured Datastore](https://www.foundationdb.org/files/record-layer-paper.pdf) (SIGMOD '19) +* **Video**: [FoundationDB Record Layer: Open Source Structured Storage on FoundationDB](https://www.youtube.com/watch?v=SvoUHHM9IKU) (FDB Summit 2018) +* **Video**: [Using FoundationDB and the FDB Record Layer to Build CloudKit](https://www.youtube.com/watch?v=HLE8chgw6LI) (FDB Summit 2018) From 1752dc38badfd3e511915bb06df98ecceaad64bc Mon Sep 17 00:00:00 2001 From: Pierre Zemb Date: Fri, 18 Jul 2025 09:18:56 +0200 Subject: [PATCH 4/4] ci: use nix --- .github/workflows/ci.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 34cc110..2ef4cd8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,17 +12,17 @@ jobs: concurrency: group: ${{ github.workflow }}-${{ github.ref }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - - run: which mdbook || cargo install mdbook --locked --version 0.4.15 - - run: which mdbook-toc || cargo install mdbook-toc --locked --version 0.8.0 - - run: which mdbook-mermaid || cargo install mdbook-mermaid --locked --version 0.9.0 - - run: which mdbook-linkcheck || cargo install mdbook-linkcheck --locked --version 0.7.6 + - uses: cachix/install-nix-action@v27 + with: + extra_nix_config: | + access-tokens = github.com=${{ secrets.GITHUB_TOKEN }} - - run: mdbook build + - run: nix develop --command mdbook build - name: Deploy - uses: peaceiris/actions-gh-pages@v3 + uses: peaceiris/actions-gh-pages@v4 if: ${{ github.ref == 'refs/heads/main' }} with: github_token: ${{ secrets.GITHUB_TOKEN }}