From 57dc276e48e91c8d9068b20e4578e927f435fb2e Mon Sep 17 00:00:00 2001 From: Cecylia Bocovich Date: Thu, 6 Mar 2025 13:45:07 -0500 Subject: [PATCH] Update broker metrics to count matches, denials, and timeouts Our metrics were undercounting client polls by missing the case where clients are matched with a snowflake but receive a timeout before the snowflake responds with its answer. This change adds a new metric, called client-snowflake-timeout-count, to the 24 hour broker stats and a new "timeout" status label for prometheus metrics. --- broker/ipc.go | 9 +++++---- broker/metrics.go | 18 ++++++++++++------ broker/snowflake-broker_test.go | 9 +++++++++ broker/sqs_test.go | 1 + 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/broker/ipc.go b/broker/ipc.go index f4f7f6b..64cefcd 100644 --- a/broker/ipc.go +++ b/broker/ipc.go @@ -199,7 +199,7 @@ func (i *IPC) ClientOffers(arg messages.Arg, response *[]byte) error { snowflake.offerChannel <- offer } else { i.ctx.metrics.lock.Lock() - i.ctx.metrics.UpdateRendezvousStats(arg.RemoteAddr, arg.RendezvousMethod, offer.natType, false) + i.ctx.metrics.UpdateRendezvousStats(arg.RemoteAddr, arg.RendezvousMethod, offer.natType, "denied") i.ctx.metrics.lock.Unlock() resp := &messages.ClientPollResponse{Error: messages.StrNoProxies} return sendClientResponse(resp, response) @@ -209,7 +209,7 @@ func (i *IPC) ClientOffers(arg messages.Arg, response *[]byte) error { select { case answer := <-snowflake.answerChannel: i.ctx.metrics.lock.Lock() - i.ctx.metrics.UpdateRendezvousStats(arg.RemoteAddr, arg.RendezvousMethod, offer.natType, true) + i.ctx.metrics.UpdateRendezvousStats(arg.RemoteAddr, arg.RendezvousMethod, offer.natType, "matched") i.ctx.metrics.lock.Unlock() resp := &messages.ClientPollResponse{Answer: answer} err = sendClientResponse(resp, response) @@ -218,7 +218,9 @@ func (i *IPC) ClientOffers(arg messages.Arg, response *[]byte) error { i.ctx.metrics.clientRoundtripEstimate = time.Since(startTime) / time.Millisecond i.ctx.metrics.lock.Unlock() case <-time.After(time.Second * ClientTimeout): - log.Println("Client: Timed out.") + i.ctx.metrics.lock.Lock() + i.ctx.metrics.UpdateRendezvousStats(arg.RemoteAddr, arg.RendezvousMethod, offer.natType, "timeout") + i.ctx.metrics.lock.Unlock() resp := &messages.ClientPollResponse{Error: messages.StrTimedOut} err = sendClientResponse(resp, response) } @@ -261,7 +263,6 @@ func (i *IPC) ProxyAnswers(arg messages.Arg, response *[]byte) error { // The snowflake took too long to respond with an answer, so its client // disappeared / the snowflake is no longer recognized by the Broker. success = false - log.Printf("Warning: matching with snowflake client failed") } b, err := messages.EncodeAnswerResponse(success) diff --git a/broker/metrics.go b/broker/metrics.go index 4e85183..53f315b 100644 --- a/broker/metrics.go +++ b/broker/metrics.go @@ -55,6 +55,7 @@ type Metrics struct { clientRestrictedDeniedCount map[messages.RendezvousMethod]uint clientUnrestrictedDeniedCount map[messages.RendezvousMethod]uint clientProxyMatchCount map[messages.RendezvousMethod]uint + clientProxyTimeoutCount map[messages.RendezvousMethod]uint rendezvousCountryStats map[messages.RendezvousMethod]map[string]int @@ -147,7 +148,7 @@ func (m *Metrics) UpdateCountryStats(addr string, proxyType string, natType stri } } -func (m *Metrics) UpdateRendezvousStats(addr string, rendezvousMethod messages.RendezvousMethod, natType string, matched bool) { +func (m *Metrics) UpdateRendezvousStats(addr string, rendezvousMethod messages.RendezvousMethod, natType, status string) { ip := net.ParseIP(addr) country := "??" if m.geoipdb != nil { @@ -157,18 +158,20 @@ func (m *Metrics) UpdateRendezvousStats(addr string, rendezvousMethod messages.R } } - var status string - if !matched { + switch status { + case "denied": m.clientDeniedCount[rendezvousMethod]++ if natType == NATUnrestricted { m.clientUnrestrictedDeniedCount[rendezvousMethod]++ } else { m.clientRestrictedDeniedCount[rendezvousMethod]++ } - status = "denied" - } else { - status = "matched" + case "matched": m.clientProxyMatchCount[rendezvousMethod]++ + case "timeout": + m.clientProxyTimeoutCount[rendezvousMethod]++ + default: + log.Printf("Unknown rendezvous status: %s", status) } m.rendezvousCountryStats[rendezvousMethod][country]++ m.promMetrics.ClientPollTotal.With(prometheus.Labels{ @@ -216,6 +219,7 @@ func NewMetrics(metricsLogger *log.Logger) (*Metrics, error) { m.clientRestrictedDeniedCount = make(map[messages.RendezvousMethod]uint) m.clientUnrestrictedDeniedCount = make(map[messages.RendezvousMethod]uint) m.clientProxyMatchCount = make(map[messages.RendezvousMethod]uint) + m.clientProxyTimeoutCount = make(map[messages.RendezvousMethod]uint) m.rendezvousCountryStats = make(map[messages.RendezvousMethod]map[string]int) for _, rendezvousMethod := range rendezvoudMethodList { @@ -275,6 +279,7 @@ func (m *Metrics) printMetrics() { m.logger.Println("client-restricted-denied-count", binCount(sumMapValues(&m.clientRestrictedDeniedCount))) m.logger.Println("client-unrestricted-denied-count", binCount(sumMapValues(&m.clientUnrestrictedDeniedCount))) m.logger.Println("client-snowflake-match-count", binCount(sumMapValues(&m.clientProxyMatchCount))) + m.logger.Println("client-snowflake-timeout-count", binCount(sumMapValues(&m.clientProxyTimeoutCount))) for _, rendezvousMethod := range rendezvoudMethodList { m.logger.Printf("client-%s-count %d\n", rendezvousMethod, binCount( @@ -299,6 +304,7 @@ func (m *Metrics) zeroMetrics() { m.proxyPollWithRelayURLExtension = 0 m.proxyPollWithoutRelayURLExtension = 0 m.clientProxyMatchCount = make(map[messages.RendezvousMethod]uint) + m.clientProxyTimeoutCount = make(map[messages.RendezvousMethod]uint) m.rendezvousCountryStats = make(map[messages.RendezvousMethod]map[string]int) for _, rendezvousMethod := range rendezvoudMethodList { diff --git a/broker/snowflake-broker_test.go b/broker/snowflake-broker_test.go index f8a45b9..2f6f9e1 100644 --- a/broker/snowflake-broker_test.go +++ b/broker/snowflake-broker_test.go @@ -148,6 +148,7 @@ func TestBroker(t *testing.T) { client-restricted-denied-count 8 client-unrestricted-denied-count 0 client-snowflake-match-count 0 +client-snowflake-timeout-count 0 client-http-count 8 client-http-ips ??=8 client-ampcache-count 0 @@ -178,6 +179,7 @@ client-sqs-ips client-restricted-denied-count 0 client-unrestricted-denied-count 0 client-snowflake-match-count 8 +client-snowflake-timeout-count 0 client-http-count 8 client-http-ips ??=8 client-ampcache-count 0 @@ -249,6 +251,7 @@ client-sqs-ips client-restricted-denied-count 8 client-unrestricted-denied-count 0 client-snowflake-match-count 0 +client-snowflake-timeout-count 0 client-http-count 8 client-http-ips ??=8 client-ampcache-count 0 @@ -279,6 +282,7 @@ client-sqs-ips client-restricted-denied-count 0 client-unrestricted-denied-count 0 client-snowflake-match-count 8 +client-snowflake-timeout-count 0 client-http-count 8 client-http-ips ??=8 client-ampcache-count 0 @@ -335,6 +339,7 @@ client-sqs-ips client-restricted-denied-count 8 client-unrestricted-denied-count 0 client-snowflake-match-count 0 +client-snowflake-timeout-count 0 client-http-count 0 client-http-ips client-ampcache-count 8 @@ -367,6 +372,7 @@ client-sqs-ips client-restricted-denied-count 0 client-unrestricted-denied-count 0 client-snowflake-match-count 8 +client-snowflake-timeout-count 0 client-http-count 0 client-http-ips client-ampcache-count 8 @@ -729,6 +735,7 @@ client-denied-count 0 client-restricted-denied-count 0 client-unrestricted-denied-count 0 client-snowflake-match-count 0 +client-snowflake-timeout-count 0 client-http-count 0 client-http-ips client-ampcache-count 0 @@ -757,6 +764,7 @@ snowflake-ips-nat-unknown 1 client-restricted-denied-count 8 client-unrestricted-denied-count 0 client-snowflake-match-count 0 +client-snowflake-timeout-count 0 client-http-count 8 client-http-ips CA=8 client-ampcache-count 0 @@ -781,6 +789,7 @@ client-denied-count 0 client-restricted-denied-count 0 client-unrestricted-denied-count 0 client-snowflake-match-count 0 +client-snowflake-timeout-count 0 client-http-count 0 client-http-ips client-ampcache-count 0 diff --git a/broker/sqs_test.go b/broker/sqs_test.go index 5b0feec..33e38f1 100644 --- a/broker/sqs_test.go +++ b/broker/sqs_test.go @@ -166,6 +166,7 @@ func TestSQS(t *testing.T) { client-restricted-denied-count 0 client-unrestricted-denied-count 0 client-snowflake-match-count 8 +client-snowflake-timeout-count 0 client-http-count 0 client-http-ips client-ampcache-count 0