diff --git a/README.md b/README.md index 05f0659d0..a8b268d0a 100644 --- a/README.md +++ b/README.md @@ -433,6 +433,26 @@ http_address: localhost:8080 # If supplied, controls the verbosity of the access logger ("none" or "all"): #access_log_level: none + +# If supplied, declares prometheus metric category names, with allowed value set +# for each category. +#metric_categories: +# branch: +# - main +# user: +# - ci +# product: +# - aaa +# - bbb +# pipeline: +# - unit-test +# - integration-test +# - system-test +# os: +# - rhel8 +# - rhel9 +# - ubuntu21-04 +# - ubuntu22-04 ``` ## Docker @@ -610,3 +630,86 @@ To avoid per-prefix rate limiting with Amazon S3, you may want to try using `--s3.key_format=2`, which stores blobs across a larger number of prefixes. Reference: [Optimizing Amazon S3 Performance](https://docs.aws.amazon.com/AmazonS3/latest/dev/optimizing-performance.html). + +## Prometheus Categories + +bazel-remote supports declaring categories for the prometheus metrics. +The categories can be used to calculate cache hit ratio separately per +type of build, see where most traffic comes from, etc. + +Clients can set categories via HTTP and gRPC headers. With bazel +that is done via the bazel option --remote_header. + +Allowed category names are declared in the bazel-remote configuration file. +The allowed value set for each category also has to be declared as an +attempt to avoid polluting Prometheus with too many different time series. +https://prometheus.io/docs/practices/naming/ warns: + +> "CAUTION: Remember that every unique combination of key-value +> label pairs represent a new time series, which can dramatically +> increase the amount of data stored. Do not use labels to store +> dimensions with high cardinality (many different label values), +> such as user IDs, email addresses, or other unbounded sets of +> values." 
+ Received headers that match a declared category name, but with a value outside +the declared allowed value set, are reported with the value "other" to +Prometheus. This is convenient for categories such as "branch", where it often +makes sense, from a cache hit ratio perspective, to distinguish between the +"main" branch and "other" branches. + +### Example + +Example from a bazel-remote configuration file that declares categories and +their allowed value sets: +``` +metric_categories: + branch: + - main + user: + - ci + product: + - aaa + - bbb + pipeline: + - unit-test + - integration-test + - system-test + os: + - rhel8 + - rhel9 + - ubuntu21-04 + - ubuntu22-04 +``` + +Bazel clients can be configured to always add flags like: +``` +--remote_header=user=$USER +--remote_header=host=$HOST +--remote_header=os=\`get_os_name.sh\` +``` + +And bazel clients invoked via CI can in addition add headers like: +``` +--remote_header=branch=$BRANCH_NAME +--remote_header=product=$PRODUCT_NAME +--remote_header=pipeline=$CI_PIPELINE_NAME +``` + +The value set for user and branch is not bounded and can therefore not be +stored as is in Prometheus. But in many cases, it is good enough to distinguish +between ci and non-ci users, or between the main branch and other branches. +That can be achieved with the bazel-remote configuration above, which limits the +value ranges to only "ci" and "main". 
+ +Example of prometheus query calculating cache hit ratio specifically for the +user "ci": +``` +(sum (rate(bazel_remote_incoming_requests_total{kind="ac",method="get",user="ci",status="hit"}[$__rate_interval]))) / (sum (rate(bazel_remote_incoming_requests_total{kind="ac",method="get",user="ci"}[$__rate_interval]))) +``` + +Example of prometheus query calculating incoming cache request rate, grouped +by product: +``` +sum by(product) (rate(bazel_remote_incoming_requests_total[$__rate_interval])) +``` diff --git a/cache/disk/BUILD.bazel b/cache/disk/BUILD.bazel index 7250d8e9f..9c4457c9d 100644 --- a/cache/disk/BUILD.bazel +++ b/cache/disk/BUILD.bazel @@ -20,6 +20,7 @@ go_library( "@com_github_djherbis_atime//:go_default_library", "@com_github_prometheus_client_golang//prometheus:go_default_library", "@org_golang_google_grpc//codes:go_default_library", + "@org_golang_google_grpc//metadata:go_default_library", "@org_golang_google_grpc//status:go_default_library", "@org_golang_google_protobuf//proto:go_default_library", "@org_golang_x_sync//semaphore:go_default_library", diff --git a/cache/disk/disk_test.go b/cache/disk/disk_test.go index 60d5b1ff4..ae6b1615e 100644 --- a/cache/disk/disk_test.go +++ b/cache/disk/disk_test.go @@ -1229,7 +1229,7 @@ func TestMetricsUnvalidatedAC(t *testing.T) { testCacheI, err := New(cacheDir, cacheSize, WithAccessLogger(testutils.NewSilentLogger()), - WithEndpointMetrics()) + WithEndpointMetrics(make(map[string][]string))) if err != nil { t.Fatal(err) } @@ -1304,7 +1304,6 @@ func TestMetricsUnvalidatedAC(t *testing.T) { if acHits != 2 { t.Fatalf("Expected acHit counter to be 2, found %f", acHits) } - acMiss = count(testCache.counter, acKind, missStatus) if acMiss != 0 { t.Fatalf("Expected acMiss counter to be 0, found %f", acMiss) @@ -1339,7 +1338,7 @@ func TestMetricsValidatedAC(t *testing.T) { testCacheI, err := New(cacheDir, cacheSize, WithAccessLogger(testutils.NewSilentLogger()), - WithEndpointMetrics()) + 
WithEndpointMetrics(make(map[string][]string))) if err != nil { t.Fatal(err) } diff --git a/cache/disk/metrics.go b/cache/disk/metrics.go index e87847434..ee008bae0 100644 --- a/cache/disk/metrics.go +++ b/cache/disk/metrics.go @@ -5,6 +5,8 @@ import ( "io" "github.com/buchgr/bazel-remote/cache" + "google.golang.org/grpc/metadata" + "net/http" pb "github.com/buchgr/bazel-remote/genproto/build/bazel/remote/execution/v2" @@ -14,15 +16,17 @@ import ( type metricsDecorator struct { counter *prometheus.CounterVec *diskCache + categories map[string][]string } const ( - hitStatus = "hit" - missStatus = "miss" + hitStatus = "hit" + missStatus = "miss" + emptyStatus = "" containsMethod = "contains" getMethod = "get" - //putMethod = "put" + putMethod = "put" acKind = "ac" // This must be lowercase to match cache.EntryKind.String() casKind = "cas" @@ -46,6 +50,7 @@ func (m *metricsDecorator) Get(ctx context.Context, kind cache.EntryKind, hash s } else { lbls["status"] = missStatus } + m.addCategoryLabels(ctx, lbls) m.counter.With(lbls).Inc() return rc, size, nil @@ -63,6 +68,7 @@ func (m *metricsDecorator) GetValidatedActionResult(ctx context.Context, hash st } else { lbls["status"] = missStatus } + m.addCategoryLabels(ctx, lbls) m.counter.With(lbls).Inc() return ar, data, err @@ -83,6 +89,7 @@ func (m *metricsDecorator) GetZstd(ctx context.Context, hash string, size int64, } else { lbls["status"] = missStatus } + m.addCategoryLabels(ctx, lbls) m.counter.With(lbls).Inc() return rc, size, nil @@ -97,6 +104,7 @@ func (m *metricsDecorator) Contains(ctx context.Context, kind cache.EntryKind, h } else { lbls["status"] = missStatus } + m.addCategoryLabels(ctx, lbls) m.counter.With(lbls).Inc() return ok, size @@ -118,6 +126,7 @@ func (m *metricsDecorator) FindMissingCasBlobs(ctx context.Context, blobs []*pb. 
"kind": "cas", "status": hitStatus, } + m.addCategoryLabels(ctx, hitLabels) hits := m.counter.With(hitLabels) missLabels := prometheus.Labels{ @@ -125,6 +134,7 @@ func (m *metricsDecorator) FindMissingCasBlobs(ctx context.Context, blobs []*pb. "kind": "cas", "status": missStatus, } + m.addCategoryLabels(ctx, missLabels) misses := m.counter.With(missLabels) hits.Add(float64(numFound)) @@ -132,3 +142,81 @@ func (m *metricsDecorator) FindMissingCasBlobs(ctx context.Context, blobs []*pb. return digests, nil } + +func (m *metricsDecorator) Put(ctx context.Context, kind cache.EntryKind, hash string, size int64, r io.Reader) error { + err := m.diskCache.Put(ctx, kind, hash, size, r) + if err != nil { + return err + } + + lbls := prometheus.Labels{"method": putMethod, "kind": kind.String(), "status": emptyStatus} + m.addCategoryLabels(ctx, lbls) + m.counter.With(lbls).Inc() + + return nil +} + +// Update prometheus labels based on HTTP and gRPC headers available via the context. +func (m *metricsDecorator) addCategoryLabels(ctx context.Context, labels prometheus.Labels) { + + if len(m.categories) == 0 { + return + } + + httpHeaders := getHttpHeaders(ctx) + var grpcHeaders metadata.MD + if httpHeaders == nil { + grpcHeaders = getGrpcHeaders(ctx) + } + + for categoryNameLowerCase, allowedValues := range m.categories { + // Lower case is canonical for gRPC headers and convention for prometheus. 
+ var headerValue string = "" + if grpcHeaders != nil { + grpcHeaderValues := grpcHeaders[categoryNameLowerCase] + if len(grpcHeaderValues) > 0 { + // Pick the first header with matching name if multiple headers with same name + headerValue = grpcHeaderValues[0] + } + } else if httpHeaders != nil { + headerValue = httpHeaders.Get(categoryNameLowerCase) + } + if len(headerValue) == 0 { + labels[categoryNameLowerCase] = "" + } else if contains(allowedValues, headerValue) { + labels[categoryNameLowerCase] = headerValue + } else { + labels[categoryNameLowerCase] = "other" + } + } +} + +func contains(s []string, e string) bool { + for _, a := range s { + if a == e { + return true + } + } + return false +} + +type httpHeadersContextKey struct{} + +// Creates a context copy with HTTP headers attached. +func ContextWithHttpHeaders(ctx context.Context, headers *http.Header) context.Context { + return context.WithValue(ctx, httpHeadersContextKey{}, headers) +} + +// Retrieves HTTP headers from context. Minimizes type safety issues. 
+func getHttpHeaders(ctx context.Context) *http.Header { + headers, ok := ctx.Value(httpHeadersContextKey{}).(*http.Header) + if !ok { + return nil + } + return headers +} + +func getGrpcHeaders(ctx context.Context) metadata.MD { + grpcHeaders, _ := metadata.FromIncomingContext(ctx) + return grpcHeaders +} diff --git a/cache/disk/options.go b/cache/disk/options.go index bf103e02f..6c159cb1e 100644 --- a/cache/disk/options.go +++ b/cache/disk/options.go @@ -75,18 +75,23 @@ func WithAccessLogger(logger *log.Logger) Option { } } -func WithEndpointMetrics() Option { +func WithEndpointMetrics(categories map[string][]string) Option { return func(c *CacheConfig) error { if c.metrics != nil { return fmt.Errorf("WithEndpointMetrics specified multiple times") } + labels := []string{"method", "status", "kind"} + for categoryNameLowerCase := range categories { + labels = append(labels, categoryNameLowerCase) + } c.metrics = &metricsDecorator{ counter: prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "bazel_remote_incoming_requests_total", Help: "The number of incoming cache requests", }, - []string{"method", "kind", "status"}), + labels), + categories: categories, } return nil diff --git a/config/config.go b/config/config.go index 55e64d02b..4741fd2af 100644 --- a/config/config.go +++ b/config/config.go @@ -56,6 +56,7 @@ type Config struct { DisableGRPCACDepsCheck bool `yaml:"disable_grpc_ac_deps_check"` EnableACKeyInstanceMangling bool `yaml:"enable_ac_key_instance_mangling"` EnableEndpointMetrics bool `yaml:"enable_endpoint_metrics"` + MetricCategories map[string][]string `yaml:"metric_categories"` MetricsDurationBuckets []float64 `yaml:"endpoint_metrics_duration_buckets"` ExperimentalRemoteAssetAPI bool `yaml:"experimental_remote_asset_api"` HTTPReadTimeout time.Duration `yaml:"http_read_timeout"` @@ -351,6 +352,12 @@ func validateConfig(c *Config) error { return errors.New("'access_log_level' must be set to either \"none\" or \"all\"") } + for categoryName := range 
c.MetricCategories { + if categoryName != strings.ToLower(categoryName) { + return fmt.Errorf("Names in 'metric_categories' must be in lower case: %s", categoryName) + } + } + return nil } diff --git a/config/config_test.go b/config/config_test.go index 9758aff95..d93e358db 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -461,3 +461,38 @@ func TestSocketPathMissing(t *testing.T) { t.Fatal("Expected the error message to mention the missing 'http_address' key/flag") } } + +func TestMetricCategories(t *testing.T) { + yaml := ` +metric_categories: + os: + - rhel8 + - rhel9 + - ubuntu21-04 + branch: + - main +dir: /opt/cache-dir +max_size: 42 +storage_mode: zstd +` + config, err := newFromYaml([]byte(yaml)) + if err != nil { + t.Fatal(err) + } + values, ok := config.MetricCategories["os"] + if !ok { + t.Fatalf("Missing os in config") + } + if !contains(values, "rhel9") { + t.Fatalf("Missing rhel9 in config") + } +} + +func contains(s []string, e string) bool { + for _, a := range s { + if a == e { + return true + } + } + return false +} diff --git a/main.go b/main.go index 2512bad74..e4dd4190c 100644 --- a/main.go +++ b/main.go @@ -95,7 +95,7 @@ func run(ctx *cli.Context) error { opts = append(opts, disk.WithProxyBackend(c.ProxyBackend)) } if c.EnableEndpointMetrics { - opts = append(opts, disk.WithEndpointMetrics()) + opts = append(opts, disk.WithEndpointMetrics(c.MetricCategories)) } diskCache, err := disk.New(c.Dir, int64(c.MaxSize)*1024*1024*1024, opts...) diff --git a/server/http.go b/server/http.go index 4c99b91e7..6a467526e 100644 --- a/server/http.go +++ b/server/http.go @@ -201,6 +201,9 @@ func (h *httpCache) logResponse(code int, r *http.Request) { } func (h *httpCache) CacheHandler(w http.ResponseWriter, r *http.Request) { + + r = r.WithContext(disk.ContextWithHttpHeaders(r.Context(), &r.Header)) + defer r.Body.Close() kind, hash, instance, err := parseRequestURL(r.URL.Path, h.validateAC)