diff --git a/cmd/internal/storage/influxdb/influxdb.go b/cmd/internal/storage/influxdb/influxdb.go index 5b2bd175..f23630d5 100644 --- a/cmd/internal/storage/influxdb/influxdb.go +++ b/cmd/internal/storage/influxdb/influxdb.go @@ -58,8 +58,22 @@ const ( serLoadAverage string = "load_average" // Memory Usage serMemoryUsage string = "memory_usage" + // Maximum memory usage recorded + serMemoryMaxUsage string = "memory_max_usage" + // Number of bytes of page cache memory + serMemoryCache string = "memory_cache" + // Size of RSS + serMemoryRss string = "memory_rss" + // Container swap usage + serMemorySwap string = "memory_swap" + // Size of memory mapped files in bytes + serMemoryMappedFile string = "memory_mapped_file" // Working set size serMemoryWorkingSet string = "memory_working_set" + // Number of times memory usage hit its limit + serMemoryFailcnt string = "memory_failcnt" + // Cumulative count of page faults (pgfault/pgmajfault), tagged by failure type and scope + serMemoryFailure string = "memory_failure" // Cumulative count of bytes received. serRxBytes string = "rx_bytes" // Cumulative count of receive errors encountered. @@ -72,6 +86,22 @@ const ( serFsLimit string = "fs_limit" // Filesystem usage. 
serFsUsage string = "fs_usage" + // Hugetlb stat - current res_counter usage for hugetlb + setHugetlbUsage = "hugetlb_usage" + // Hugetlb stat - maximum usage ever recorded + setHugetlbMaxUsage = "hugetlb_max_usage" + // Hugetlb stat - number of hugetlb usage allocation failures + setHugetlbFailcnt = "hugetlb_failcnt" + // Perf statistics + serPerfStat = "perf_stat" + // Referenced memory + serReferencedMemory = "referenced_memory" + // Resctrl - Total memory bandwidth + serResctrlMemoryBandwidthTotal = "resctrl_memory_bandwidth_total" + // Resctrl - Local memory bandwidth + serResctrlMemoryBandwidthLocal = "resctrl_memory_bandwidth_local" + // Resctrl - Last level cache usage + serResctrlLLCOccupancy = "resctrl_llc_occupancy" ) func new() (storage.StorageDriver, error) { @@ -194,18 +224,152 @@ func (s *influxdbStorage) containerStatsToPoints( // Load Average points = append(points, makePoint(serLoadAverage, stats.Cpu.LoadAverage)) - // Memory Usage - points = append(points, makePoint(serMemoryUsage, stats.Memory.Usage)) - - // Working Set Size - points = append(points, makePoint(serMemoryWorkingSet, stats.Memory.WorkingSet)) - // Network Stats points = append(points, makePoint(serRxBytes, stats.Network.RxBytes)) points = append(points, makePoint(serRxErrors, stats.Network.RxErrors)) points = append(points, makePoint(serTxBytes, stats.Network.TxBytes)) points = append(points, makePoint(serTxErrors, stats.Network.TxErrors)) + // Referenced Memory + points = append(points, makePoint(serReferencedMemory, stats.ReferencedMemory)) + + s.tagPoints(cInfo, stats, points) + + return points +} + +func (s *influxdbStorage) memoryStatsToPoints( + cInfo *info.ContainerInfo, + stats *info.ContainerStats, +) (points []*influxdb.Point) { + // Memory Usage + points = append(points, makePoint(serMemoryUsage, stats.Memory.Usage)) + // Maximum memory usage recorded + points = append(points, makePoint(serMemoryMaxUsage, stats.Memory.MaxUsage)) + // Number of bytes of page cache 
memory + points = append(points, makePoint(serMemoryCache, stats.Memory.Cache)) + // Size of RSS + points = append(points, makePoint(serMemoryRss, stats.Memory.RSS)) + // Container swap usage + points = append(points, makePoint(serMemorySwap, stats.Memory.Swap)) + // Size of memory mapped files in bytes + points = append(points, makePoint(serMemoryMappedFile, stats.Memory.MappedFile)) + // Working Set Size + points = append(points, makePoint(serMemoryWorkingSet, stats.Memory.WorkingSet)) + // Number of times memory usage hit its limit + points = append(points, makePoint(serMemoryFailcnt, stats.Memory.Failcnt)) + + // Page-fault counters, one point per (failure_type, scope) pair; each point gets its own tag map + memoryFailuresTags := map[string]string{ + "failure_type": "pgfault", + "scope": "container", + } + memoryFailurePoint := makePoint(serMemoryFailure, stats.Memory.ContainerData.Pgfault) + addTagsToPoint(memoryFailurePoint, memoryFailuresTags) + points = append(points, memoryFailurePoint) + + memoryFailuresTags = map[string]string{"failure_type": "pgmajfault", "scope": "container"} + memoryFailurePoint = makePoint(serMemoryFailure, stats.Memory.ContainerData.Pgmajfault) + addTagsToPoint(memoryFailurePoint, memoryFailuresTags) + points = append(points, memoryFailurePoint) + + // Fresh map, not an in-place update: a point may keep a reference to the map passed to addTagsToPoint + memoryFailuresTags = map[string]string{"failure_type": "pgfault", "scope": "hierarchical"} + memoryFailurePoint = makePoint(serMemoryFailure, stats.Memory.HierarchicalData.Pgfault) + addTagsToPoint(memoryFailurePoint, memoryFailuresTags) + points = append(points, memoryFailurePoint) + + memoryFailuresTags = map[string]string{"failure_type": "pgmajfault", "scope": "hierarchical"} + memoryFailurePoint = makePoint(serMemoryFailure, stats.Memory.HierarchicalData.Pgmajfault) + addTagsToPoint(memoryFailurePoint, memoryFailuresTags) + points = append(points, memoryFailurePoint) + + s.tagPoints(cInfo, stats, points) + + return points +} + +func (s *influxdbStorage) hugetlbStatsToPoints( + cInfo *info.ContainerInfo, + stats *info.ContainerStats, +) (points []*influxdb.Point) { + + for pageSize, hugetlbStat := range stats.Hugetlb { 
+ tags := map[string]string{ + "page_size": pageSize, + } + + // Hugepage usage + point := makePoint(setHugetlbUsage, hugetlbStat.Usage) + addTagsToPoint(point, tags) + points = append(points, point) + + // Maximum hugepage usage recorded + point = makePoint(setHugetlbMaxUsage, hugetlbStat.MaxUsage) + addTagsToPoint(point, tags) + points = append(points, point) + + // Number of times hugepage usage hit its limit + point = makePoint(setHugetlbFailcnt, hugetlbStat.Failcnt) + addTagsToPoint(point, tags) + points = append(points, point) + } + + s.tagPoints(cInfo, stats, points) + + return points +} + +func (s *influxdbStorage) perfStatsToPoints( + cInfo *info.ContainerInfo, + stats *info.ContainerStats, +) (points []*influxdb.Point) { + + for _, perfStat := range stats.PerfStats { + point := makePoint(serPerfStat, perfStat.Value) + tags := map[string]string{ + "cpu": fmt.Sprintf("%v", perfStat.Cpu), + "name": perfStat.Name, + "scaling_ratio": fmt.Sprintf("%v", perfStat.ScalingRatio), + } + addTagsToPoint(point, tags) + points = append(points, point) + } + + s.tagPoints(cInfo, stats, points) + + return points +} + +func (s *influxdbStorage) resctrlStatsToPoints( + cInfo *info.ContainerInfo, + stats *info.ContainerStats, +) (points []*influxdb.Point) { + + // Memory bandwidth + for nodeID, rdtMemoryBandwidth := range stats.Resctrl.MemoryBandwidth { + tags := map[string]string{ + "node_id": fmt.Sprintf("%v", nodeID), + } + point := makePoint(serResctrlMemoryBandwidthTotal, rdtMemoryBandwidth.TotalBytes) + addTagsToPoint(point, tags) + points = append(points, point) + + point = makePoint(serResctrlMemoryBandwidthLocal, rdtMemoryBandwidth.LocalBytes) + addTagsToPoint(point, tags) + points = append(points, point) + } + + // Cache + for nodeID, rdtCache := range stats.Resctrl.Cache { + tags := map[string]string{ + "node_id": fmt.Sprintf("%v", nodeID), + } + point := makePoint(serResctrlLLCOccupancy, rdtCache.LLCOccupancy) + addTagsToPoint(point, tags) + points = append(points, point) 
+ } + s.tagPoints(cInfo, stats, points) return points @@ -230,6 +394,10 @@ func (s *influxdbStorage) AddStats(cInfo *info.ContainerInfo, stats *info.Contai defer s.lock.Unlock() s.points = append(s.points, s.containerStatsToPoints(cInfo, stats)...) + s.points = append(s.points, s.memoryStatsToPoints(cInfo, stats)...) + s.points = append(s.points, s.hugetlbStatsToPoints(cInfo, stats)...) + s.points = append(s.points, s.perfStatsToPoints(cInfo, stats)...) + s.points = append(s.points, s.resctrlStatsToPoints(cInfo, stats)...) s.points = append(s.points, s.containerFilesystemStatsToPoints(cInfo, stats)...) if s.readyToFlush() { pointsToFlush = s.points diff --git a/cmd/internal/storage/influxdb/influxdb_test.go b/cmd/internal/storage/influxdb/influxdb_test.go index 01aa5882..9e8a58f9 100644 --- a/cmd/internal/storage/influxdb/influxdb_test.go +++ b/cmd/internal/storage/influxdb/influxdb_test.go @@ -199,11 +199,14 @@ func TestContainerFileSystemStatsToPoints(t *testing.T) { false, 2*time.Minute) assert.Nil(err) - ref := info.ContainerReference{ - Name: "containerName", + cInfo := &info.ContainerInfo{ + ContainerReference: info.ContainerReference{ + Name: "containerName", + }, } + stats := &info.ContainerStats{} - points := storage.containerFilesystemStatsToPoints(ref, stats) + points := storage.containerFilesystemStatsToPoints(cInfo, stats) // stats.Filesystem is always nil, not sure why assert.Nil(points) @@ -215,30 +218,75 @@ func TestContainerStatsToPoints(t *testing.T) { require.Nil(t, err) require.NotNil(t, storage) - ref, stats := createTestStats() + cInfo, stats := createTestStats() require.Nil(t, err) require.NotNil(t, stats) // When - points := storage.containerStatsToPoints(*ref, stats) + points := storage.containerStatsToPoints(cInfo, stats) + points = append(points, storage.memoryStatsToPoints(cInfo, stats)...) + points = append(points, storage.hugetlbStatsToPoints(cInfo, stats)...) + points = append(points, storage.perfStatsToPoints(cInfo, stats)...) 
+ points = append(points, storage.resctrlStatsToPoints(cInfo, stats)...) // Then assert.NotEmpty(t, points) - assert.Len(t, points, 10+len(stats.Cpu.Usage.PerCpu)) + assert.Len(t, points, 34+len(stats.Cpu.Usage.PerCpu)) + // CPU stats assertContainsPointWithValue(t, points, serCpuUsageTotal, stats.Cpu.Usage.Total) assertContainsPointWithValue(t, points, serCpuUsageSystem, stats.Cpu.Usage.System) assertContainsPointWithValue(t, points, serCpuUsageUser, stats.Cpu.Usage.User) - assertContainsPointWithValue(t, points, serMemoryUsage, stats.Memory.Usage) assertContainsPointWithValue(t, points, serLoadAverage, stats.Cpu.LoadAverage) + for _, cpu_usage := range stats.Cpu.Usage.PerCpu { + assertContainsPointWithValue(t, points, serCpuUsagePerCpu, cpu_usage) + } + + // Memory stats + assertContainsPointWithValue(t, points, serMemoryUsage, stats.Memory.Usage) + assertContainsPointWithValue(t, points, serMemoryMaxUsage, stats.Memory.MaxUsage) + assertContainsPointWithValue(t, points, serMemoryCache, stats.Memory.Cache) + assertContainsPointWithValue(t, points, serMemoryRss, stats.Memory.RSS) + assertContainsPointWithValue(t, points, serMemorySwap, stats.Memory.Swap) + assertContainsPointWithValue(t, points, serMemoryMappedFile, stats.Memory.MappedFile) + assertContainsPointWithValue(t, points, serMemoryUsage, stats.Memory.Usage) assertContainsPointWithValue(t, points, serMemoryWorkingSet, stats.Memory.WorkingSet) + assertContainsPointWithValue(t, points, serMemoryFailcnt, stats.Memory.Failcnt) + assertContainsPointWithValue(t, points, serMemoryFailure, stats.Memory.ContainerData.Pgfault) + assertContainsPointWithValue(t, points, serMemoryFailure, stats.Memory.ContainerData.Pgmajfault) + assertContainsPointWithValue(t, points, serMemoryFailure, stats.Memory.HierarchicalData.Pgfault) + assertContainsPointWithValue(t, points, serMemoryFailure, stats.Memory.HierarchicalData.Pgmajfault) + + // Hugetlb stats + for _, hugetlbStat := range stats.Hugetlb { + 
assertContainsPointWithValue(t, points, setHugetlbUsage, hugetlbStat.Usage) + assertContainsPointWithValue(t, points, setHugetlbMaxUsage, hugetlbStat.MaxUsage) + assertContainsPointWithValue(t, points, setHugetlbFailcnt, hugetlbStat.Failcnt) + } + + // Network stats assertContainsPointWithValue(t, points, serRxBytes, stats.Network.RxBytes) assertContainsPointWithValue(t, points, serRxErrors, stats.Network.RxErrors) assertContainsPointWithValue(t, points, serTxBytes, stats.Network.TxBytes) - assertContainsPointWithValue(t, points, serTxBytes, stats.Network.TxErrors) + assertContainsPointWithValue(t, points, serTxErrors, stats.Network.TxErrors) - for _, cpu_usage := range stats.Cpu.Usage.PerCpu { - assertContainsPointWithValue(t, points, serCpuUsagePerCpu, cpu_usage) + // Perf stats + for _, perfStat := range stats.PerfStats { + assertContainsPointWithValue(t, points, serPerfStat, perfStat.Value) + } + + // Referenced memory + assertContainsPointWithValue(t, points, serReferencedMemory, stats.ReferencedMemory) + + // Resource Control stats - memory bandwidth + for _, rdtMemoryBandwidth := range stats.Resctrl.MemoryBandwidth { + assertContainsPointWithValue(t, points, serResctrlMemoryBandwidthTotal, rdtMemoryBandwidth.TotalBytes) + assertContainsPointWithValue(t, points, serResctrlMemoryBandwidthLocal, rdtMemoryBandwidth.LocalBytes) + } + + // Resource Control stats - cache + for _, rdtCache := range stats.Resctrl.Cache { + assertContainsPointWithValue(t, points, serResctrlLLCOccupancy, rdtCache.LLCOccupancy) } } @@ -274,10 +322,12 @@ func createTestStorage() (*influxdbStorage, error) { return storage, err } -func createTestStats() (*info.ContainerReference, *info.ContainerStats) { - ref := &info.ContainerReference{ - Name: "testContainername", - Aliases: []string{"testContainerAlias1", "testContainerAlias2"}, +func createTestStats() (*info.ContainerInfo, *info.ContainerStats) { + cInfo := &info.ContainerInfo{ + ContainerReference: info.ContainerReference{ + Name: "testContainername", + Aliases: []string{"testContainerAlias1", 
"testContainerAlias2"}, + }, } cpuUsage := info.CpuUsage{ @@ -293,6 +343,34 @@ func createTestStats() (*info.ContainerReference, *info.ContainerStats) { Usage: cpuUsage, LoadAverage: int32(rand.Intn(1000)), }, + Memory: info.MemoryStats{ + Usage: 26767396864, + MaxUsage: 30429605888, + Cache: 7837376512, + RSS: 18930020352, + Swap: 1024, + MappedFile: 1025327104, + WorkingSet: 23630012416, + Failcnt: 1, + ContainerData: info.MemoryStatsMemoryData{Pgfault: 100328455, Pgmajfault: 97}, + HierarchicalData: info.MemoryStatsMemoryData{Pgfault: 100328454, Pgmajfault: 96}, + }, + Hugetlb: map[string]info.HugetlbStats{ + "1GB": {Usage: 1234, MaxUsage: 5678, Failcnt: 9}, + "2GB": {Usage: 9876, MaxUsage: 5432, Failcnt: 1}, + }, + ReferencedMemory: 12345, + PerfStats: []info.PerfStat{{Cpu: 1, Name: "cycles", ScalingRatio: 1.5, Value: 4589}}, + Resctrl: info.ResctrlStats{ + MemoryBandwidth: []info.MemoryBandwidthStats{ + {TotalBytes: 11234, LocalBytes: 4567}, + {TotalBytes: 55678, LocalBytes: 9876}, + }, + Cache: []info.CacheStats{ + {LLCOccupancy: 3}, + {LLCOccupancy: 5}, + }, + }, } - return ref, stats + return cInfo, stats }