-
Notifications
You must be signed in to change notification settings - Fork 6.2k
metrics: enhance diagnostic capabilities for gRPC network issues #67811
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
0a2b8ef
c33b6e1
c5f3b5b
99f9b84
a6303d8
837cd1c
7974954
de2d8d6
d9bbcac
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,17 +15,26 @@ | |
| package metrics | ||
|
|
||
| import ( | ||
| "context" | ||
| "net" | ||
| "sync" | ||
|
|
||
| "github.com/pingcap/tidb/pkg/dxf/framework/dxfmetric" | ||
| "github.com/pingcap/tidb/pkg/ingestor/ingestmetric" | ||
| metricscommon "github.com/pingcap/tidb/pkg/metrics/common" | ||
| timermetrics "github.com/pingcap/tidb/pkg/timer/metrics" | ||
| "github.com/pingcap/tidb/pkg/util/intest" | ||
| "github.com/pingcap/tidb/pkg/util/logutil" | ||
| "github.com/prometheus/client_golang/prometheus" | ||
| "github.com/prometheus/client_golang/prometheus/collectors" | ||
| tikvmetrics "github.com/tikv/client-go/v2/metrics" | ||
| tikvcollectors "github.com/tikv/client-go/v2/util/collectors" | ||
| "go.uber.org/zap" | ||
| "google.golang.org/grpc" | ||
| "google.golang.org/grpc/channelz/grpc_channelz_v1" | ||
| "google.golang.org/grpc/channelz/service" | ||
| "google.golang.org/grpc/credentials/insecure" | ||
| "google.golang.org/grpc/test/bufconn" | ||
| ) | ||
|
|
||
| var ( | ||
|
|
@@ -394,6 +403,9 @@ func RegisterMetrics() { | |
| // StmtSummary | ||
| prometheus.MustRegister(StmtSummaryWindowRecordCount) | ||
| prometheus.MustRegister(StmtSummaryWindowEvictedCount) | ||
|
|
||
| // Channelz | ||
| setupChannelzCollector() | ||
| } | ||
|
|
||
| // Register registers custom collectors. | ||
|
|
@@ -458,3 +470,144 @@ func ToggleSimplifiedMode(simplified bool) { | |
| } | ||
| } | ||
| } | ||
|
|
||
| var grpcChannelzCollector struct { | ||
| mu sync.Mutex | ||
|
|
||
| listener *bufconn.Listener | ||
| server *grpc.Server | ||
| conn *grpc.ClientConn | ||
|
|
||
| collector prometheus.Collector | ||
| registered bool | ||
| } | ||
|
|
||
| func setupChannelzCollector() { | ||
| if intest.InTest { | ||
| return | ||
| } | ||
|
|
||
| grpcChannelzCollector.mu.Lock() | ||
| defer grpcChannelzCollector.mu.Unlock() | ||
|
|
||
| if err := initGrpcChannelzCollectorLocked(); err != nil { | ||
| logutil.BgLogger().Warn("setup internal channelz collector failed", zap.Error(err)) | ||
| return | ||
| } | ||
| if grpcChannelzCollector.registered { | ||
| return | ||
| } | ||
| prometheus.MustRegister(grpcChannelzCollector.collector) | ||
| grpcChannelzCollector.registered = true | ||
| } | ||
|
|
||
| // initGrpcChannelzCollectorLocked initializes the singleton channelz collector. | ||
| // It must be called with grpcChannelzCollector.mu held. | ||
| func initGrpcChannelzCollectorLocked() error { | ||
| if grpcChannelzCollector.collector != nil { | ||
| return nil | ||
| } | ||
|
|
||
| grpcChannelzCollector.listener = bufconn.Listen(1 << 20) | ||
| grpcChannelzCollector.server = grpc.NewServer() | ||
| service.RegisterChannelzServiceToServer(grpcChannelzCollector.server) | ||
| go func(listener *bufconn.Listener, server *grpc.Server) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is graceful shutdown of tidb needs to be considered here to close this background thread properly?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think so, it's just for collecting channelz data and won't block graceful shutdown. |
||
| if err := server.Serve(listener); err != nil { | ||
| logutil.BgLogger().Warn("internal channelz grpc server stopped", zap.Error(err)) | ||
| } | ||
| }(grpcChannelzCollector.listener, grpcChannelzCollector.server) | ||
|
|
||
| listener := grpcChannelzCollector.listener | ||
| conn, err := grpc.NewClient( | ||
| "passthrough:///bufnet", | ||
| grpc.WithTransportCredentials(insecure.NewCredentials()), | ||
| grpc.WithContextDialer(func(ctx context.Context, _ string) (net.Conn, error) { | ||
| return listener.DialContext(ctx) | ||
| }), | ||
| ) | ||
| if err != nil { | ||
| stopGrpcChannelzCollectorLocked() | ||
| return err | ||
| } | ||
|
|
||
| grpcChannelzCollector.conn = conn | ||
| grpcChannelzCollector.collector = tikvcollectors.NewChannelzCollector(conn, channelzCollectorOpts()) | ||
| return nil | ||
| } | ||
|
|
||
| func channelzCollectorOpts() tikvcollectors.ChannelzCollectorOpts { | ||
| return tikvcollectors.ChannelzCollectorOpts{ | ||
| Namespace: namespace, | ||
| Filter: func(node any) (collect bool, walkChildren bool) { | ||
|
Comment on lines
+538
to
+541
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This filter seems to include the collector’s own internal bufnet connection, so scraping
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. resolved by c5f3b5b |
||
| // Only collect socket and leaf subchannel info, which are more useful for troubleshooting network issues. | ||
| switch n := node.(type) { | ||
| case *grpc_channelz_v1.Channel: | ||
| if isInternalChannelzTarget(n.GetData().GetTarget()) { | ||
| return false, false | ||
| } | ||
| return false, true | ||
|
|
||
| case *grpc_channelz_v1.Subchannel: | ||
| if isInternalChannelzTarget(n.GetData().GetTarget()) { | ||
| return false, false | ||
| } | ||
| isLeaf := len(n.GetSocketRef()) > 0 && | ||
| len(n.GetChannelRef()) == 0 && | ||
| len(n.GetSubchannelRef()) == 0 | ||
|
|
||
| return isLeaf, true | ||
|
|
||
| case *grpc_channelz_v1.Socket: | ||
| if isInternalChannelzSocket(n) { | ||
| return false, false | ||
| } | ||
|
Comment on lines
+561
to
+563
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this check necessary? Since the parent channel is already filtered by AI suggests that it it is kept,
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, the leaf socket nodes should never be visited, and I think it is reasonable to filter out those sockets in transient states, otherwise, we may still write related PromQLs like |
||
| return true, false | ||
|
|
||
| default: | ||
| return false, true | ||
| } | ||
| }, | ||
| } | ||
| } | ||
|
|
||
| // isInternalChannelzTarget returns true if the target is used for internal channelz collector, which is identified by | ||
| // the fact that its target is "bufnet" or "passthrough:///bufnet". | ||
| func isInternalChannelzTarget(target string) bool { | ||
| return target == "bufnet" || target == "passthrough:///bufnet" | ||
| } | ||
|
|
||
| // isInternalChannelzSocket returns true if the socket is created by the internal channelz collector for scrapping | ||
| // channelz metrics, which is identified by the fact that it has no remote endpoint. | ||
| func isInternalChannelzSocket(socket *grpc_channelz_v1.Socket) bool { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better adding comments to explain the meaning of |
||
| return socket.GetRemote() == nil && socket.GetRemoteName() == "" | ||
| } | ||
|
|
||
| func cleanupGrpcChannelzCollectorForTest() { | ||
| grpcChannelzCollector.mu.Lock() | ||
| defer grpcChannelzCollector.mu.Unlock() | ||
|
|
||
| stopGrpcChannelzCollectorLocked() | ||
| } | ||
|
|
||
| // stopGrpcChannelzCollectorLocked stops and resets the singleton channelz collector. | ||
| // It must be called with grpcChannelzCollector.mu held. | ||
| func stopGrpcChannelzCollectorLocked() { | ||
| if grpcChannelzCollector.registered && grpcChannelzCollector.collector != nil { | ||
| prometheus.Unregister(grpcChannelzCollector.collector) | ||
| } | ||
| if grpcChannelzCollector.conn != nil { | ||
| _ = grpcChannelzCollector.conn.Close() | ||
| } | ||
| if grpcChannelzCollector.server != nil { | ||
| grpcChannelzCollector.server.Stop() | ||
| } | ||
| if grpcChannelzCollector.listener != nil { | ||
| _ = grpcChannelzCollector.listener.Close() | ||
| } | ||
|
|
||
| grpcChannelzCollector.server = nil | ||
| grpcChannelzCollector.listener = nil | ||
| grpcChannelzCollector.conn = nil | ||
| grpcChannelzCollector.collector = nil | ||
| grpcChannelzCollector.registered = false | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would other goleak-based suites calling
metrics.RegisterMetrics()directly get go leak check error in some caces?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
resolved by c33b6e1
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In case some of integration tests which call
RegisterMetricsbut are NOT compiled with-tags=intest, a6303d8 added the goroutine to the goleak whitelist.