diff --git a/README.md b/README.md index 9f66b14..0f86d4e 100644 --- a/README.md +++ b/README.md @@ -184,6 +184,11 @@ The metrics exposed beyond the default Prometheus metrics are: outgoing HTTP POST to upload the deployment record. * `deptracker_post_record_ok`: the number of successful deployment record uploads. +* `deptracker_post_record_rate_limited`: the number of post attempts + that were rate limited. +* `deptracker_post_record_no_attestation`: the number of attempts + that resulted in no matching attestation for the container digest + (404 "no artifacts found" responses). * `deptracker_post_record_soft_fail`: the number of recoverable failed attempts to upload the deployment record. * `deptracker_post_record_hard_fail`: the number of failures to diff --git a/go.mod b/go.mod index 47393b5..e6bac62 100644 --- a/go.mod +++ b/go.mod @@ -32,6 +32,7 @@ require ( github.com/google/uuid v1.6.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect diff --git a/internal/controller/controller.go b/internal/controller/controller.go index 3749e56..4c44952 100644 --- a/internal/controller/controller.go +++ b/internal/controller/controller.go @@ -471,6 +471,12 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta ) if err := c.apiClient.PostOne(ctx, record); err != nil { + // Return if no artifact is found + var noArtifactErr *deploymentrecord.NoArtifactError + if errors.As(err, &noArtifactErr) { + return nil + } + // Make sure to not retry on client error messages var clientErr *deploymentrecord.ClientError if errors.As(err, &clientErr) { diff --git a/pkg/deploymentrecord/client.go b/pkg/deploymentrecord/client.go index fe50b74..fa6286a 100644 --- a/pkg/deploymentrecord/client.go +++ b/pkg/deploymentrecord/client.go @@ -160,6 +160,19 @@ func (c *ClientError) Unwrap() error { return c.err } +// NoArtifactError represents a 404 client response whose body indicates "no artifacts found". +type NoArtifactError struct { + err error +} + +func (n *NoArtifactError) Error() string { + return fmt.Sprintf("no artifact found: %s", n.err.Error()) +} + +func (n *NoArtifactError) Unwrap() error { + return n.err +} + // PostOne posts a single deployment record to the GitHub deployment // records API. func (c *Client) PostOne(ctx context.Context, record *DeploymentRecord) error { @@ -249,34 +262,64 @@ func (c *Client) PostOne(ctx context.Context, record *DeploymentRecord) error { } // Drain and close response body to enable connection reuse by reading body for error logging - body, _ := io.ReadAll(resp.Body) + respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + _, _ = io.Copy(io.Discard, resp.Body) _ = resp.Body.Close() - lastErr = fmt.Errorf("unexpected status code: %d", resp.StatusCode) - - // Don't retry on client errors (4xx) except for 429 - // (rate limit) - if resp.StatusCode >= 400 && resp.StatusCode < 500 && resp.StatusCode != 429 { + switch { + case resp.StatusCode == 404: + // No artifact found + dtmetrics.PostDeploymentRecordNoAttestation.Inc() + slog.Debug("no artifact attestation found, no record created", + "attempt", attempt, + "status_code", resp.StatusCode, + "container_name", record.Name, + "resp_msg", string(respBody), + "digest", record.Digest, + ) + return &NoArtifactError{err: fmt.Errorf("no attestation found for %s", record.Digest)} + case resp.StatusCode >= 400 && resp.StatusCode < 500: + if resp.Header.Get("retry-after") != "" || resp.Header.Get("x-ratelimit-remaining") == "0" { + // Rate limited — retry with backoff + // Could be 403 or 429 + dtmetrics.PostDeploymentRecordRateLimited.Inc() + slog.Warn("rate limited, retrying", + "attempt", attempt, + "status_code", resp.StatusCode, + "retry_after", resp.Header.Get("Retry-After"), + "container_name", record.Name, + "resp_msg", string(respBody), + ) + lastErr = fmt.Errorf("rate limited, attempt %d", attempt) + continue + } + // Don't retry non rate limiting client errors dtmetrics.PostDeploymentRecordClientError.Inc() slog.Warn("client error, aborting", "attempt", attempt, - "error", lastErr, "status_code", resp.StatusCode, - "msg", string(body), + "container_name", record.Name, + "resp_msg", string(respBody), + ) + return &ClientError{err: fmt.Errorf("unexpected client err with status code %d", resp.StatusCode)} + default: + // Retry with backoff + dtmetrics.PostDeploymentRecordSoftFail.Inc() + slog.Debug("retriable error", + "attempt", attempt, + "status_code", resp.StatusCode, + "container_name", record.Name, + "resp_msg", string(respBody), ) - return &ClientError{err: lastErr} + lastErr = fmt.Errorf("server error, attempt %d", attempt) } - dtmetrics.PostDeploymentRecordSoftFail.Inc() - slog.Debug("retriable server error", - "attempt", attempt, - "status_code", resp.StatusCode, - "msg", string(body), - ) } dtmetrics.PostDeploymentRecordHardFail.Inc() slog.Error("all retries exhausted", "count", c.retries, - "error", lastErr) + "error", lastErr, + "container_name", record.Name, + ) return fmt.Errorf("all retries exhausted: %w", lastErr) } diff --git a/pkg/deploymentrecord/client_test.go b/pkg/deploymentrecord/client_test.go index 97e4bba..d72e54f 100644 --- a/pkg/deploymentrecord/client_test.go +++ b/pkg/deploymentrecord/client_test.go @@ -1,9 +1,18 @@ package deploymentrecord import ( + "context" + "errors" + "net/http" + "net/http/httptest" "strings" + "sync/atomic" "testing" "time" + + "github.com/github/deployment-tracker/pkg/dtmetrics" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" ) func TestNewClient(t *testing.T) { @@ -264,3 +273,334 @@ func TestValidOrgPattern(t *testing.T) { } } } + +// testRecord returns a minimal valid DeploymentRecord for testing. +func testRecord() *DeploymentRecord { + return NewDeploymentRecord( + "ghcr.io/my-org/my-image", + "sha256:abc123", + "v1.0.0", + "production", + "us-east-1", + "cluster-1", + StatusDeployed, + "my-deployment", + nil, + nil, + ) +} + +// allCounters returns all PostDeploymentRecord counters for snapshotting. +func allCounters() []prometheus.Counter { + return []prometheus.Counter{ + dtmetrics.PostDeploymentRecordOk, + dtmetrics.PostDeploymentRecordNoAttestation, + dtmetrics.PostDeploymentRecordRateLimited, + dtmetrics.PostDeploymentRecordSoftFail, + dtmetrics.PostDeploymentRecordHardFail, + dtmetrics.PostDeploymentRecordClientError, + } +} + +func TestPostOne(t *testing.T) { + tests := []struct { + name string + record *DeploymentRecord + retries int + handler http.HandlerFunc + wantErr bool + errType any // expected error type for errors.As + errContain string + wantOk float64 + wantNoAttestation float64 + wantRateLimited float64 + wantSoftFail float64 + wantHardFail float64 + wantClientError float64 + }{ + { + name: "success on 200", + record: testRecord(), + handler: func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + }, + wantOk: 1, + }, + { + name: "success on 201", + record: testRecord(), + handler: func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusCreated) + }, + wantOk: 1, + }, + { + name: "nil record returns error", + record: nil, + wantErr: true, + handler: func(_ http.ResponseWriter, _ *http.Request) { + t.Fatal("server should not be called with nil record") + }, + errContain: "record cannot be nil", + }, + { + name: "404 with no artifacts found returns NoArtifactError", + record: testRecord(), + handler: func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusNotFound) + _, _ = w.Write([]byte(`{"message":"no artifacts found"}`)) + }, + wantErr: true, + errType: &NoArtifactError{}, + errContain: "sha256:abc123", + wantNoAttestation: 1, + }, + { + name: "400 returns ClientError", + record: testRecord(), + handler: func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusBadRequest) + _, _ = w.Write([]byte("bad request")) + }, + wantErr: true, + errType: &ClientError{}, + wantClientError: 1, + }, + { + name: "403 forbidden returns ClientError", + record: testRecord(), + handler: func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusForbidden) + _, _ = w.Write([]byte(`{"message":"forbidden"}`)) + }, + wantErr: true, + errType: &ClientError{}, + wantClientError: 1, + }, + { + name: "422 invalid body returns ClientError", + record: testRecord(), + handler: func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusUnprocessableEntity) + _, _ = w.Write([]byte("invalid body")) + }, + wantErr: true, + errType: &ClientError{}, + wantClientError: 1, + }, + { + name: "429 rate limit retries then fails", + record: testRecord(), + retries: 1, + handler: func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Retry-After", "1") + w.WriteHeader(http.StatusTooManyRequests) + }, + wantErr: true, + errContain: "all retries exhausted", + wantRateLimited: 2, + wantHardFail: 1, + }, + { + name: "403 with Retry-After header retries then fails", + record: testRecord(), + retries: 1, + handler: func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Retry-After", "1") + w.WriteHeader(http.StatusForbidden) + _, _ = w.Write([]byte(`{"message":"rate limit"}`)) + }, + wantErr: true, + errContain: "all retries exhausted", + wantRateLimited: 2, + wantHardFail: 1, + }, + { + name: "403 with x-ratelimit-remaining 0 retries then fails", + record: testRecord(), + retries: 1, + handler: func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("X-Ratelimit-Remaining", "0") + w.WriteHeader(http.StatusForbidden) + }, + wantErr: true, + errContain: "all retries exhausted", + wantRateLimited: 2, + wantHardFail: 1, + }, + { + name: "500 server error retries then fails", + record: testRecord(), + retries: 1, + handler: func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + _, _ = w.Write([]byte("internal error")) + }, + wantErr: true, + errContain: "all retries exhausted", + wantSoftFail: 2, + wantHardFail: 1, + }, + { + name: "500 then 200 succeeds on retry", + record: testRecord(), + retries: 2, + handler: func() http.HandlerFunc { + var count atomic.Int32 + return func(w http.ResponseWriter, _ *http.Request) { + if count.Add(1) == 1 { + w.WriteHeader(http.StatusInternalServerError) + return + } + w.WriteHeader(http.StatusOK) + } + }(), + wantSoftFail: 1, + wantOk: 1, + }, + { + name: "429 then 200 succeeds on retry", + record: testRecord(), + retries: 2, + handler: func() http.HandlerFunc { + var count atomic.Int32 + return func(w http.ResponseWriter, _ *http.Request) { + if count.Add(1) == 1 { + w.Header().Set("Retry-After", "0") + w.WriteHeader(http.StatusTooManyRequests) + return + } + w.WriteHeader(http.StatusOK) + } + }(), + wantRateLimited: 1, + wantOk: 1, + }, + { + name: "403 secondary rate limit then 200 succeeds on retry", + record: testRecord(), + retries: 2, + handler: func() http.HandlerFunc { + var count atomic.Int32 + return func(w http.ResponseWriter, _ *http.Request) { + if count.Add(1) == 1 { + w.Header().Set("Retry-After", "0") + w.WriteHeader(http.StatusForbidden) + _, _ = w.Write([]byte(`{"message":"secondary rate limit"}`)) + return + } + w.WriteHeader(http.StatusOK) + } + }(), + wantRateLimited: 1, + wantOk: 1, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + srv := httptest.NewServer(tt.handler) + t.Cleanup(srv.Close) + + client, err := NewClient(srv.URL, "test-org", WithRetries(tt.retries)) + if err != nil { + t.Fatalf("failed to create client: %v", err) + } + + // Snapshot all counters before the call + counters := allCounters() + snapshots := make([]float64, len(counters)) + for i, c := range counters { + snapshots[i] = testutil.ToFloat64(c) + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + t.Cleanup(cancel) + + err = client.PostOne(ctx, tt.record) + + if tt.wantErr { + if err == nil { + t.Fatal("expected error, got nil") + } + if tt.errType != nil { + target := tt.errType + switch target.(type) { + case *NoArtifactError: + var e *NoArtifactError + if !errors.As(err, &e) { + t.Errorf("expected NoArtifactError, got %T: %v", err, err) + } + case *ClientError: + var e *ClientError + if !errors.As(err, &e) { + t.Errorf("expected ClientError, got %T: %v", err, err) + } + default: + t.Fatalf("unexpected error type in test: %T", target) + } + } + if tt.errContain != "" && !strings.Contains(err.Error(), tt.errContain) { + t.Errorf("error %q should contain %q", err.Error(), tt.errContain) + } + } else if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Assert all metric deltas + wantDeltas := []float64{ + tt.wantOk, + tt.wantNoAttestation, + tt.wantRateLimited, + tt.wantSoftFail, + tt.wantHardFail, + tt.wantClientError, + } + names := []string{ + "PostDeploymentRecordOk", + "PostDeploymentRecordNoAttestation", + "PostDeploymentRecordRateLimited", + "PostDeploymentRecordSoftFail", + "PostDeploymentRecordHardFail", + "PostDeploymentRecordClientError", + } + for i, c := range counters { + got := testutil.ToFloat64(c) - snapshots[i] + if got != wantDeltas[i] { + t.Errorf("%s delta = %v, want %v", names[i], got, wantDeltas[i]) + } + } + }) + } +} + +func TestPostOneSendsCorrectRequest(t *testing.T) { + record := testRecord() + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + t.Errorf("method = %s, want POST", r.Method) + } + if got := r.URL.Path; got != "/orgs/test-org/artifacts/metadata/deployment-record" { + t.Errorf("path = %s, want /orgs/test-org/artifacts/metadata/deployment-record", got) + } + if got := r.Header.Get("Content-Type"); got != "application/json" { + t.Errorf("Content-Type = %s, want application/json", got) + } + if got := r.Header.Get("Authorization"); got != "Bearer test-token" { + t.Errorf("Authorization = %s, want Bearer test-token", got) + } + w.WriteHeader(http.StatusOK) + })) + t.Cleanup(srv.Close) + + client, err := NewClient(srv.URL, "test-org", + WithAPIToken("test-token")) + if err != nil { + t.Fatalf("failed to create client: %v", err) + } + + if err := client.PostOne(context.Background(), record); err != nil { + t.Fatalf("unexpected error: %v", err) + } +} diff --git a/pkg/dtmetrics/prom.go b/pkg/dtmetrics/prom.go index 8e76ff8..79c01eb 100644 --- a/pkg/dtmetrics/prom.go +++ b/pkg/dtmetrics/prom.go @@ -49,6 +49,22 @@ var ( }, ) + //nolint: revive + PostDeploymentRecordNoAttestation = promauto.NewCounter( + prometheus.CounterOpts{ + Name: "deptracker_post_record_no_attestation", + Help: "The total number of post attempts that resulted in no matching attestation for the container digest (404 'no artifacts found' responses)", + }, + ) + + //nolint: revive + PostDeploymentRecordRateLimited = promauto.NewCounter( + prometheus.CounterOpts{ + Name: "deptracker_post_record_rate_limited", + Help: "The total number of post attempts that were rate limited", + }, + ) + //nolint: revive PostDeploymentRecordSoftFail = promauto.NewCounter( prometheus.CounterOpts{