Update broker metrics to count matches, denials, and timeouts

Our metrics were undercounting client polls by missing the case where
clients are matched with a snowflake but receive a timeout before the
snowflake responds with its answer. This change adds a new metric,
called client-snowflake-timeout-count, to the 24 hour broker stats and a
new "timeout" status label for prometheus metrics.
This commit is contained in:
Cecylia Bocovich 2025-03-06 13:45:07 -05:00
parent 583178f4f2
commit 57dc276e48
No known key found for this signature in database
GPG key ID: 009DE379FD9B7B90
4 changed files with 27 additions and 10 deletions

View file

@ -199,7 +199,7 @@ func (i *IPC) ClientOffers(arg messages.Arg, response *[]byte) error {
snowflake.offerChannel <- offer snowflake.offerChannel <- offer
} else { } else {
i.ctx.metrics.lock.Lock() i.ctx.metrics.lock.Lock()
i.ctx.metrics.UpdateRendezvousStats(arg.RemoteAddr, arg.RendezvousMethod, offer.natType, false) i.ctx.metrics.UpdateRendezvousStats(arg.RemoteAddr, arg.RendezvousMethod, offer.natType, "denied")
i.ctx.metrics.lock.Unlock() i.ctx.metrics.lock.Unlock()
resp := &messages.ClientPollResponse{Error: messages.StrNoProxies} resp := &messages.ClientPollResponse{Error: messages.StrNoProxies}
return sendClientResponse(resp, response) return sendClientResponse(resp, response)
@ -209,7 +209,7 @@ func (i *IPC) ClientOffers(arg messages.Arg, response *[]byte) error {
select { select {
case answer := <-snowflake.answerChannel: case answer := <-snowflake.answerChannel:
i.ctx.metrics.lock.Lock() i.ctx.metrics.lock.Lock()
i.ctx.metrics.UpdateRendezvousStats(arg.RemoteAddr, arg.RendezvousMethod, offer.natType, true) i.ctx.metrics.UpdateRendezvousStats(arg.RemoteAddr, arg.RendezvousMethod, offer.natType, "matched")
i.ctx.metrics.lock.Unlock() i.ctx.metrics.lock.Unlock()
resp := &messages.ClientPollResponse{Answer: answer} resp := &messages.ClientPollResponse{Answer: answer}
err = sendClientResponse(resp, response) err = sendClientResponse(resp, response)
@ -218,7 +218,9 @@ func (i *IPC) ClientOffers(arg messages.Arg, response *[]byte) error {
i.ctx.metrics.clientRoundtripEstimate = time.Since(startTime) / time.Millisecond i.ctx.metrics.clientRoundtripEstimate = time.Since(startTime) / time.Millisecond
i.ctx.metrics.lock.Unlock() i.ctx.metrics.lock.Unlock()
case <-time.After(time.Second * ClientTimeout): case <-time.After(time.Second * ClientTimeout):
log.Println("Client: Timed out.") i.ctx.metrics.lock.Lock()
i.ctx.metrics.UpdateRendezvousStats(arg.RemoteAddr, arg.RendezvousMethod, offer.natType, "timeout")
i.ctx.metrics.lock.Unlock()
resp := &messages.ClientPollResponse{Error: messages.StrTimedOut} resp := &messages.ClientPollResponse{Error: messages.StrTimedOut}
err = sendClientResponse(resp, response) err = sendClientResponse(resp, response)
} }
@ -261,7 +263,6 @@ func (i *IPC) ProxyAnswers(arg messages.Arg, response *[]byte) error {
// The snowflake took too long to respond with an answer, so its client // The snowflake took too long to respond with an answer, so its client
// disappeared / the snowflake is no longer recognized by the Broker. // disappeared / the snowflake is no longer recognized by the Broker.
success = false success = false
log.Printf("Warning: matching with snowflake client failed")
} }
b, err := messages.EncodeAnswerResponse(success) b, err := messages.EncodeAnswerResponse(success)

View file

@ -55,6 +55,7 @@ type Metrics struct {
clientRestrictedDeniedCount map[messages.RendezvousMethod]uint clientRestrictedDeniedCount map[messages.RendezvousMethod]uint
clientUnrestrictedDeniedCount map[messages.RendezvousMethod]uint clientUnrestrictedDeniedCount map[messages.RendezvousMethod]uint
clientProxyMatchCount map[messages.RendezvousMethod]uint clientProxyMatchCount map[messages.RendezvousMethod]uint
clientProxyTimeoutCount map[messages.RendezvousMethod]uint
rendezvousCountryStats map[messages.RendezvousMethod]map[string]int rendezvousCountryStats map[messages.RendezvousMethod]map[string]int
@ -147,7 +148,7 @@ func (m *Metrics) UpdateCountryStats(addr string, proxyType string, natType stri
} }
} }
func (m *Metrics) UpdateRendezvousStats(addr string, rendezvousMethod messages.RendezvousMethod, natType string, matched bool) { func (m *Metrics) UpdateRendezvousStats(addr string, rendezvousMethod messages.RendezvousMethod, natType, status string) {
ip := net.ParseIP(addr) ip := net.ParseIP(addr)
country := "??" country := "??"
if m.geoipdb != nil { if m.geoipdb != nil {
@ -157,18 +158,20 @@ func (m *Metrics) UpdateRendezvousStats(addr string, rendezvousMethod messages.R
} }
} }
var status string switch status {
if !matched { case "denied":
m.clientDeniedCount[rendezvousMethod]++ m.clientDeniedCount[rendezvousMethod]++
if natType == NATUnrestricted { if natType == NATUnrestricted {
m.clientUnrestrictedDeniedCount[rendezvousMethod]++ m.clientUnrestrictedDeniedCount[rendezvousMethod]++
} else { } else {
m.clientRestrictedDeniedCount[rendezvousMethod]++ m.clientRestrictedDeniedCount[rendezvousMethod]++
} }
status = "denied" case "matched":
} else {
status = "matched"
m.clientProxyMatchCount[rendezvousMethod]++ m.clientProxyMatchCount[rendezvousMethod]++
case "timeout":
m.clientProxyTimeoutCount[rendezvousMethod]++
default:
log.Printf("Unknown rendezvous status: %s", status)
} }
m.rendezvousCountryStats[rendezvousMethod][country]++ m.rendezvousCountryStats[rendezvousMethod][country]++
m.promMetrics.ClientPollTotal.With(prometheus.Labels{ m.promMetrics.ClientPollTotal.With(prometheus.Labels{
@ -216,6 +219,7 @@ func NewMetrics(metricsLogger *log.Logger) (*Metrics, error) {
m.clientRestrictedDeniedCount = make(map[messages.RendezvousMethod]uint) m.clientRestrictedDeniedCount = make(map[messages.RendezvousMethod]uint)
m.clientUnrestrictedDeniedCount = make(map[messages.RendezvousMethod]uint) m.clientUnrestrictedDeniedCount = make(map[messages.RendezvousMethod]uint)
m.clientProxyMatchCount = make(map[messages.RendezvousMethod]uint) m.clientProxyMatchCount = make(map[messages.RendezvousMethod]uint)
m.clientProxyTimeoutCount = make(map[messages.RendezvousMethod]uint)
m.rendezvousCountryStats = make(map[messages.RendezvousMethod]map[string]int) m.rendezvousCountryStats = make(map[messages.RendezvousMethod]map[string]int)
for _, rendezvousMethod := range rendezvoudMethodList { for _, rendezvousMethod := range rendezvoudMethodList {
@ -275,6 +279,7 @@ func (m *Metrics) printMetrics() {
m.logger.Println("client-restricted-denied-count", binCount(sumMapValues(&m.clientRestrictedDeniedCount))) m.logger.Println("client-restricted-denied-count", binCount(sumMapValues(&m.clientRestrictedDeniedCount)))
m.logger.Println("client-unrestricted-denied-count", binCount(sumMapValues(&m.clientUnrestrictedDeniedCount))) m.logger.Println("client-unrestricted-denied-count", binCount(sumMapValues(&m.clientUnrestrictedDeniedCount)))
m.logger.Println("client-snowflake-match-count", binCount(sumMapValues(&m.clientProxyMatchCount))) m.logger.Println("client-snowflake-match-count", binCount(sumMapValues(&m.clientProxyMatchCount)))
m.logger.Println("client-snowflake-timeout-count", binCount(sumMapValues(&m.clientProxyTimeoutCount)))
for _, rendezvousMethod := range rendezvoudMethodList { for _, rendezvousMethod := range rendezvoudMethodList {
m.logger.Printf("client-%s-count %d\n", rendezvousMethod, binCount( m.logger.Printf("client-%s-count %d\n", rendezvousMethod, binCount(
@ -299,6 +304,7 @@ func (m *Metrics) zeroMetrics() {
m.proxyPollWithRelayURLExtension = 0 m.proxyPollWithRelayURLExtension = 0
m.proxyPollWithoutRelayURLExtension = 0 m.proxyPollWithoutRelayURLExtension = 0
m.clientProxyMatchCount = make(map[messages.RendezvousMethod]uint) m.clientProxyMatchCount = make(map[messages.RendezvousMethod]uint)
m.clientProxyTimeoutCount = make(map[messages.RendezvousMethod]uint)
m.rendezvousCountryStats = make(map[messages.RendezvousMethod]map[string]int) m.rendezvousCountryStats = make(map[messages.RendezvousMethod]map[string]int)
for _, rendezvousMethod := range rendezvoudMethodList { for _, rendezvousMethod := range rendezvoudMethodList {

View file

@ -148,6 +148,7 @@ func TestBroker(t *testing.T) {
client-restricted-denied-count 8 client-restricted-denied-count 8
client-unrestricted-denied-count 0 client-unrestricted-denied-count 0
client-snowflake-match-count 0 client-snowflake-match-count 0
client-snowflake-timeout-count 0
client-http-count 8 client-http-count 8
client-http-ips ??=8 client-http-ips ??=8
client-ampcache-count 0 client-ampcache-count 0
@ -178,6 +179,7 @@ client-sqs-ips
client-restricted-denied-count 0 client-restricted-denied-count 0
client-unrestricted-denied-count 0 client-unrestricted-denied-count 0
client-snowflake-match-count 8 client-snowflake-match-count 8
client-snowflake-timeout-count 0
client-http-count 8 client-http-count 8
client-http-ips ??=8 client-http-ips ??=8
client-ampcache-count 0 client-ampcache-count 0
@ -249,6 +251,7 @@ client-sqs-ips
client-restricted-denied-count 8 client-restricted-denied-count 8
client-unrestricted-denied-count 0 client-unrestricted-denied-count 0
client-snowflake-match-count 0 client-snowflake-match-count 0
client-snowflake-timeout-count 0
client-http-count 8 client-http-count 8
client-http-ips ??=8 client-http-ips ??=8
client-ampcache-count 0 client-ampcache-count 0
@ -279,6 +282,7 @@ client-sqs-ips
client-restricted-denied-count 0 client-restricted-denied-count 0
client-unrestricted-denied-count 0 client-unrestricted-denied-count 0
client-snowflake-match-count 8 client-snowflake-match-count 8
client-snowflake-timeout-count 0
client-http-count 8 client-http-count 8
client-http-ips ??=8 client-http-ips ??=8
client-ampcache-count 0 client-ampcache-count 0
@ -335,6 +339,7 @@ client-sqs-ips
client-restricted-denied-count 8 client-restricted-denied-count 8
client-unrestricted-denied-count 0 client-unrestricted-denied-count 0
client-snowflake-match-count 0 client-snowflake-match-count 0
client-snowflake-timeout-count 0
client-http-count 0 client-http-count 0
client-http-ips client-http-ips
client-ampcache-count 8 client-ampcache-count 8
@ -367,6 +372,7 @@ client-sqs-ips
client-restricted-denied-count 0 client-restricted-denied-count 0
client-unrestricted-denied-count 0 client-unrestricted-denied-count 0
client-snowflake-match-count 8 client-snowflake-match-count 8
client-snowflake-timeout-count 0
client-http-count 0 client-http-count 0
client-http-ips client-http-ips
client-ampcache-count 8 client-ampcache-count 8
@ -729,6 +735,7 @@ client-denied-count 0
client-restricted-denied-count 0 client-restricted-denied-count 0
client-unrestricted-denied-count 0 client-unrestricted-denied-count 0
client-snowflake-match-count 0 client-snowflake-match-count 0
client-snowflake-timeout-count 0
client-http-count 0 client-http-count 0
client-http-ips client-http-ips
client-ampcache-count 0 client-ampcache-count 0
@ -757,6 +764,7 @@ snowflake-ips-nat-unknown 1
client-restricted-denied-count 8 client-restricted-denied-count 8
client-unrestricted-denied-count 0 client-unrestricted-denied-count 0
client-snowflake-match-count 0 client-snowflake-match-count 0
client-snowflake-timeout-count 0
client-http-count 8 client-http-count 8
client-http-ips CA=8 client-http-ips CA=8
client-ampcache-count 0 client-ampcache-count 0
@ -781,6 +789,7 @@ client-denied-count 0
client-restricted-denied-count 0 client-restricted-denied-count 0
client-unrestricted-denied-count 0 client-unrestricted-denied-count 0
client-snowflake-match-count 0 client-snowflake-match-count 0
client-snowflake-timeout-count 0
client-http-count 0 client-http-count 0
client-http-ips client-http-ips
client-ampcache-count 0 client-ampcache-count 0

View file

@ -166,6 +166,7 @@ func TestSQS(t *testing.T) {
client-restricted-denied-count 0 client-restricted-denied-count 0
client-unrestricted-denied-count 0 client-unrestricted-denied-count 0
client-snowflake-match-count 8 client-snowflake-match-count 8
client-snowflake-timeout-count 0
client-http-count 0 client-http-count 0
client-http-ips client-http-ips
client-ampcache-count 0 client-ampcache-count 0