diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 47b99ed..3a71ff9 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -24,3 +24,5 @@ jobs: go-version-file: go.mod - name: golangci-lint uses: golangci/golangci-lint-action@1e7e51e771db61008b38414a730f564565cf7c20 # v9.2.0 + with: + only-new-issues: ${{ github.event_name == 'pull_request' }} diff --git a/.gitignore b/.gitignore index 9fd4deb..ab8defe 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ *~ /deployment-tracker +.idea/ diff --git a/README.md b/README.md index ac39a2c..81c6e28 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,7 @@ deployment records to GitHub's artifact metadata API. - **Real-time tracking**: Sends deployment records when pods are created or deleted - **Graceful shutdown**: Properly drains work queue before terminating +- **Runtime risks**: Tracks runtime risks declared through pod and owner annotations ## How It Works @@ -82,6 +83,13 @@ The `DN_TEMPLATE` supports the following placeholders: - `{{deploymentName}}` - Name of the owning Deployment - `{{containerName}}` - Container name +## Runtime Risks + +You can track runtime risks through annotations. Add the `github.com/runtime-risks` annotation with a comma-separated list of supported runtime risk values; annotations are aggregated from the pod and its owner reference objects (such as the owning ReplicaSet and Deployment). + +The currently supported runtime risk values are listed in the [Create Deployment Record API docs](https://docs.github.com/en/rest/orgs/artifact-metadata?apiVersion=2022-11-28#create-an-artifact-deployment-record). Invalid runtime risk values are ignored. + + ## Kubernetes Deployment A complete deployment manifest is provided in `deploy/manifest.yaml` @@ -89,7 +97,7 @@ which includes: - **Namespace**: `deployment-tracker` - **ServiceAccount**: Identity for the controller pod -- **ClusterRole**: Minimal permissions (`get`, `list`, `watch` on pods) +- **ClusterRole**: Minimal permissions (`get`, `list`, `watch` on pods; `get` on other supported objects) - **ClusterRoleBinding**: Binds the ServiceAccount to the ClusterRole - **Deployment**: Runs the controller with security hardening diff --git a/cmd/deployment-tracker/main.go b/cmd/deployment-tracker/main.go index d4fc349..b447e5c 100644 --- a/cmd/deployment-tracker/main.go +++ b/cmd/deployment-tracker/main.go @@ -13,6 +13,7 @@ import ( "time" "github.com/github/deployment-tracker/internal/controller" + "k8s.io/client-go/metadata" "github.com/prometheus/client_golang/prometheus/promhttp" "k8s.io/client-go/kubernetes" @@ -112,6 +113,14 @@ func main() { os.Exit(1) } + // Create metadata client + metadataClient, err := metadata.NewForConfig(k8sCfg) + if err != nil { + slog.Error("Error creating Kubernetes metadata client", + "error", err) + os.Exit(1) + } + // Start the metrics server var promSrv = &http.Server{ Addr: ":" + metricsPort, @@ -151,7 +160,7 @@ func main() { cancel() }() - cntrl, err := controller.New(clientset, namespace, excludeNamespaces, &cntrlCfg) + cntrl, err := controller.New(clientset, metadataClient, namespace, excludeNamespaces, &cntrlCfg) if err != nil { slog.Error("Failed to create controller", "error", err) diff --git a/deploy/manifest.yaml b/deploy/manifest.yaml index cd86ce9..f5b8e72 100644 --- a/deploy/manifest.yaml +++ b/deploy/manifest.yaml @@ -20,6 +20,9 @@ rules: - apiGroups: ["apps"] resources: ["deployments"] verbs: ["get"] + - apiGroups: ["apps"] + resources: ["replicasets"] + verbs: ["get"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -66,7 +69,7 @@
spec: - name: PHYSICAL_ENVIRONMENT value: "iad-moda1" - name: CLUSTER - value: "kommendorkapten" + value: "test-cluster" - name: BASE_URL value: "http://artifact-registry.artifact-registry.svc.cluster.local:9090" resources: diff --git a/internal/controller/controller.go b/internal/controller/controller.go index 2319110..9ac951b 100644 --- a/internal/controller/controller.go +++ b/internal/controller/controller.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "log/slog" + "slices" "strings" "sync" "time" @@ -12,6 +13,9 @@ import ( "github.com/github/deployment-tracker/pkg/deploymentrecord" "github.com/github/deployment-tracker/pkg/image" "github.com/github/deployment-tracker/pkg/metrics" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/metadata" corev1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" @@ -29,6 +33,8 @@ const ( EventCreated = "CREATED" // EventDeleted indicates that a pod has been deleted. EventDeleted = "DELETED" + // RuntimeRiskAnnotationKey represents the annotation key for runtime risks. + RuntimeRiskAnnotationKey = "github.com/runtime-risks" ) // PodEvent represents a pod event to be processed. @@ -38,13 +44,19 @@ type PodEvent struct { DeletedPod *corev1.Pod // Only populated for delete events } +// AggregatePodMetadata represents combined metadata for a pod and its ownership hierarchy. +type AggregatePodMetadata struct { + RuntimeRisks map[deploymentrecord.RuntimeRisk]bool +} + // Controller is the Kubernetes controller for tracking deployments. type Controller struct { - clientset kubernetes.Interface - podInformer cache.SharedIndexInformer - workqueue workqueue.TypedRateLimitingInterface[PodEvent] - apiClient *deploymentrecord.Client - cfg *Config + clientset kubernetes.Interface + metadataClient metadata.Interface + podInformer cache.SharedIndexInformer + workqueue workqueue.TypedRateLimitingInterface[PodEvent] + apiClient *deploymentrecord.Client + cfg *Config // best effort cache to avoid redundant posts // post requests are idempotent, so if this cache fails due to // restarts or other events, nothing will break. @@ -52,7 +64,7 @@ type Controller struct { } // New creates a new deployment tracker controller. 
-func New(clientset kubernetes.Interface, namespace string, excludeNamespaces string, cfg *Config) (*Controller, error) { +func New(clientset kubernetes.Interface, metadataClient metadata.Interface, namespace string, excludeNamespaces string, cfg *Config) (*Controller, error) { // Create informer factory factory := createInformerFactory(clientset, namespace, excludeNamespaces) @@ -84,11 +96,12 @@ func New(clientset kubernetes.Interface, namespace string, excludeNamespaces str } cntrl := &Controller{ - clientset: clientset, - podInformer: podInformer, - workqueue: queue, - apiClient: apiClient, - cfg: cfg, + clientset: clientset, + metadataClient: metadataClient, + podInformer: podInformer, + workqueue: queue, + apiClient: apiClient, + cfg: cfg, } // Add event handlers to the informer @@ -334,16 +347,26 @@ func (c *Controller) processEvent(ctx context.Context, event PodEvent) error { var lastErr error + // Gather aggregate metadata for adds/updates + var runtimeRisks []deploymentrecord.RuntimeRisk + if status != deploymentrecord.StatusDecommissioned { + aggMetadata := c.aggregateMetadata(ctx, podToPartialMetadata(pod)) + for risk := range aggMetadata.RuntimeRisks { + runtimeRisks = append(runtimeRisks, risk) + } + slices.Sort(runtimeRisks) + } + // Record info for each container in the pod for _, container := range pod.Spec.Containers { - if err := c.recordContainer(ctx, pod, container, status, event.EventType); err != nil { + if err := c.recordContainer(ctx, pod, container, status, event.EventType, runtimeRisks); err != nil { lastErr = err } } // Also record init containers for _, container := range pod.Spec.InitContainers { - if err := c.recordContainer(ctx, pod, container, status, event.EventType); err != nil { + if err := c.recordContainer(ctx, pod, container, status, event.EventType, runtimeRisks); err != nil { lastErr = err } } @@ -371,7 +394,7 @@ func (c *Controller) deploymentExists(ctx context.Context, namespace, name strin } // recordContainer records a single container's deployment info. -func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, container corev1.Container, status, eventType string) error { +func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, container corev1.Container, status, eventType string, runtimeRisks []deploymentrecord.RuntimeRisk) error { dn := getARDeploymentName(pod, container, c.cfg.Template) digest := getContainerDigest(pod, container.Name) @@ -424,6 +447,7 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta c.cfg.Cluster, status, dn, + runtimeRisks, ) if err := c.apiClient.PostOne(ctx, record); err != nil { @@ -457,6 +481,7 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta "name", record.Name, "deployment_name", record.DeploymentName, "status", record.Status, + "runtime_risks", record.RuntimeRisks, "digest", record.Digest, ) @@ -473,6 +498,94 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta return nil } +// aggregateMetadata returns aggregated metadata for a pod and its owners. 
+func (c *Controller) aggregateMetadata(ctx context.Context, obj *metav1.PartialObjectMetadata) AggregatePodMetadata { + aggMetadata := AggregatePodMetadata{ + RuntimeRisks: make(map[deploymentrecord.RuntimeRisk]bool), + } + queue := []*metav1.PartialObjectMetadata{obj} + visited := make(map[types.UID]bool) + + for len(queue) > 0 { + current := queue[0] + queue = queue[1:] + + if visited[current.GetUID()] { + slog.Warn("Already visited object, skipping to avoid cycles", + "UID", current.GetUID(), + "name", current.GetName(), + ) + continue + } + visited[current.GetUID()] = true + + extractMetadataFromObject(current, &aggMetadata) + c.addOwnersToQueue(ctx, current, &queue) + } + + return aggMetadata +} + +// addOwnersToQueue takes a current object and looks up its owners, adding them to the queue for processing +// to collect their metadata. +func (c *Controller) addOwnersToQueue(ctx context.Context, current *metav1.PartialObjectMetadata, queue *[]*metav1.PartialObjectMetadata) { + ownerRefs := current.GetOwnerReferences() + + for _, owner := range ownerRefs { + ownerObj, err := c.getOwnerMetadata(ctx, current.GetNamespace(), owner) + if err != nil { + slog.Warn("Failed to get owner object for metadata collection", + "namespace", current.GetNamespace(), + "owner_kind", owner.Kind, + "owner_name", owner.Name, + "error", err, + ) + continue + } + + if ownerObj == nil { + continue + } + + *queue = append(*queue, ownerObj) + } +} + +// getOwnerMetadata retrieves partial object metadata for an owner ref. +func (c *Controller) getOwnerMetadata(ctx context.Context, namespace string, owner metav1.OwnerReference) (*metav1.PartialObjectMetadata, error) { + gvr := schema.GroupVersionResource{ + Group: "apps", + Version: "v1", + } + + switch owner.Kind { + case "ReplicaSet": + gvr.Resource = "replicasets" + case "Deployment": + gvr.Resource = "deployments" + default: + slog.Debug("Unsupported owner kind for runtime risk collection", + "kind", owner.Kind, + "name", owner.Name, + ) + return nil, nil + } + + obj, err := c.metadataClient.Resource(gvr).Namespace(namespace).Get(ctx, owner.Name, metav1.GetOptions{}) + if err != nil { + if k8serrors.IsNotFound(err) { + slog.Debug("Owner object not found for metadata collection", + "namespace", namespace, + "owner_kind", owner.Kind, + "owner_name", owner.Name, + ) + return nil, nil + } + return nil, err + } + return obj, nil +} + func getCacheKey(dn, digest string) string { return dn + "||" + digest } @@ -580,3 +693,26 @@ func getDeploymentName(pod *corev1.Pod) string { } return "" } + +// extractMetadataFromObject extracts metadata from an object. +func extractMetadataFromObject(obj *metav1.PartialObjectMetadata, aggMetadata *AggregatePodMetadata) { + annotations := obj.GetAnnotations() + if risks, exists := annotations[RuntimeRiskAnnotationKey]; exists { + for _, risk := range strings.Split(risks, ",") { + r := deploymentrecord.ValidateRuntimeRisk(risk) + if r != "" { + aggMetadata.RuntimeRisks[r] = true + } + } + } +} + +func podToPartialMetadata(pod *corev1.Pod) *metav1.PartialObjectMetadata { + return &metav1.PartialObjectMetadata{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Pod", + }, + ObjectMeta: pod.ObjectMeta, + } +} diff --git a/pkg/deploymentrecord/record.go b/pkg/deploymentrecord/record.go index 842242e..2876936 100644 --- a/pkg/deploymentrecord/record.go +++ b/pkg/deploymentrecord/record.go @@ -1,21 +1,46 @@ package deploymentrecord +import ( + "log/slog" + "strings" +) + // Status constants for deployment records. 
const ( StatusDeployed = "deployed" StatusDecommissioned = "decommissioned" ) +// RuntimeRisk is a runtime risk category for a deployment record. +type RuntimeRisk string + +// Valid runtime risks. +const ( + CriticalResource RuntimeRisk = "critical-resource" + InternetExposed RuntimeRisk = "internet-exposed" + LateralMovement RuntimeRisk = "lateral-movement" + SensitiveData RuntimeRisk = "sensitive-data" +) + +// Map of valid runtime risks. +var validRuntimeRisks = map[RuntimeRisk]bool{ + CriticalResource: true, + InternetExposed: true, + LateralMovement: true, + SensitiveData: true, +} + // DeploymentRecord represents a deployment event record. type DeploymentRecord struct { - Name string `json:"name"` - Digest string `json:"digest"` - Version string `json:"version,omitempty"` - LogicalEnvironment string `json:"logical_environment"` - PhysicalEnvironment string `json:"physical_environment"` - Cluster string `json:"cluster"` - Status string `json:"status"` - DeploymentName string `json:"deployment_name"` + Name string `json:"name"` + Digest string `json:"digest"` + Version string `json:"version,omitempty"` + LogicalEnvironment string `json:"logical_environment"` + PhysicalEnvironment string `json:"physical_environment"` + Cluster string `json:"cluster"` + Status string `json:"status"` + DeploymentName string `json:"deployment_name"` + RuntimeRisks []RuntimeRisk `json:"runtime_risks,omitempty"` } // NewDeploymentRecord creates a new DeploymentRecord with the given status. @@ -23,7 +48,7 @@ type DeploymentRecord struct { // //nolint:revive func NewDeploymentRecord(name, digest, version, logicalEnv, physicalEnv, - cluster, status, deploymentName string) *DeploymentRecord { + cluster, status, deploymentName string, runtimeRisks []RuntimeRisk) *DeploymentRecord { // Validate status if status != StatusDeployed && status != StatusDecommissioned { status = StatusDeployed // default to deployed if invalid @@ -38,5 +63,17 @@ func NewDeploymentRecord(name, digest, version, logicalEnv, physicalEnv, Cluster: cluster, Status: status, DeploymentName: deploymentName, + RuntimeRisks: runtimeRisks, + } +} + +// ValidateRuntimeRisk checks whether the given string is a valid runtime risk (input is trimmed and compared case-insensitively) +// and returns the canonical RuntimeRisk constant if it is, or an empty string otherwise. +func ValidateRuntimeRisk(risk string) RuntimeRisk { + r := RuntimeRisk(strings.ToLower(strings.TrimSpace(risk))) + if !validRuntimeRisks[r] { + slog.Debug("Invalid runtime risk", "risk", risk) + return "" } + return r } diff --git a/pkg/deploymentrecord/record_test.go b/pkg/deploymentrecord/record_test.go new file mode 100644 index 0000000..f888e2d --- /dev/null +++ b/pkg/deploymentrecord/record_test.go @@ -0,0 +1,41 @@ +package deploymentrecord + +import "testing" + +func TestValidateRuntimeRisk(t *testing.T) { + tests := []struct { + name string + input string + expected RuntimeRisk + }{ + { + name: "valid runtime risk", + input: "critical-resource", + expected: CriticalResource, + }, + { + name: "valid runtime risk with space", + input: "critical-resource ", + expected: CriticalResource, + }, + { + name: "invalid empty string", + input: "", + expected: "", + }, + { + name: "invalid unknown risk", + input: "unknown-risk", + expected: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := ValidateRuntimeRisk(tt.input) + if result != tt.expected { + t.Errorf("ValidateRuntimeRisk(%q) = %q, want %q", tt.input, result, tt.expected) + } + }) + } +}
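Example usage — a minimal sketch of how a workload could opt in to runtime-risk tracking with this change. The Deployment name, namespace, and image below are hypothetical; the annotation key and the two risk values come from this change (see `pkg/deploymentrecord/record.go`), and unrecognized values in the list are simply ignored by `ValidateRuntimeRisk`.

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: example-api        # hypothetical workload name
  namespace: example       # hypothetical namespace
  # The annotation could also be set here (or on the ReplicaSet), since the
  # controller aggregates annotations from the pod and its owner references.
spec:
  replicas: 1
  selector:
    matchLabels:
      app: example-api
  template:
    metadata:
      labels:
        app: example-api
      annotations:
        # Comma-separated list; entries are trimmed and lower-cased before validation.
        github.com/runtime-risks: "internet-exposed, sensitive-data"
    spec:
      containers:
        - name: example-api
          image: ghcr.io/example/example-api:1.0.0   # hypothetical image
```

With this annotation in place, the deployment records posted for the pod's containers would carry `"runtime_risks": ["internet-exposed", "sensitive-data"]`.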