Skip to content

Commit 7441665

Browse files
sam6134movenceaditya-purang
authored
Add Neuron Scraper for scraping neuron monitor metrics (#184)
* Add Neuron monitor scraper to scrape metrics from NeuronMonitor --------- Co-authored-by: Hyunsoo Kim <[email protected]> Co-authored-by: Aditya Purang <[email protected]>
1 parent 34bd73a commit 7441665

23 files changed

+1323
-396
lines changed

exporter/awsemfexporter/internal/appsignals/useragent.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// Copyright The OpenTelemetry Authors
22
// SPDX-License-Identifier: Apache-2.0
33

4-
package appsignals
4+
package appsignals // import "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsemfexporter/internal/appsignals"
55

66
import (
77
"context"

exporter/awsemfexporter/metric_translator.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,8 @@ func (mt metricTranslator) translateOTelToGroupedMetric(rm pmetric.ResourceMetri
132132

133133
if serviceName, ok := rm.Resource().Attributes().Get("service.name"); ok {
134134
if strings.HasPrefix(serviceName.Str(), "containerInsightsKubeAPIServerScraper") ||
135-
strings.HasPrefix(serviceName.Str(), "containerInsightsDCGMExporterScraper") {
135+
strings.HasPrefix(serviceName.Str(), "containerInsightsDCGMExporterScraper") ||
136+
strings.HasPrefix(serviceName.Str(), "containerInsightsNeuronMonitorScraper") {
136137
// the prometheus metrics that come from the container insight receiver need to be clearly tagged as coming from container insights
137138
metricReceiver = containerInsightsReceiver
138139
}

exporter/awsemfexporter/metric_translator_test.go

+16
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,8 @@ func TestTranslateOtToGroupedMetric(t *testing.T) {
282282
containerInsightMetric.Resource().Attributes().PutStr(conventions.AttributeServiceName, "containerInsightsKubeAPIServerScraper")
283283
gpuMetric := createTestResourceMetricsHelper(defaultNumberOfTestMetrics + 1)
284284
gpuMetric.Resource().Attributes().PutStr(conventions.AttributeServiceName, "containerInsightsDCGMExporterScraper")
285+
neuronMetric := createTestResourceMetricsHelper(defaultNumberOfTestMetrics + 1)
286+
neuronMetric.Resource().Attributes().PutStr(conventions.AttributeServiceName, "containerInsightsNeuronMonitorScraper")
285287

286288
counterSumMetrics := map[string]*metricInfo{
287289
"spanCounter": {
@@ -390,6 +392,20 @@ func TestTranslateOtToGroupedMetric(t *testing.T) {
390392
"myServiceNS/containerInsightsDCGMExporterScraper",
391393
containerInsightsReceiver,
392394
},
395+
{
396+
"neuron monitor receiver",
397+
neuronMetric,
398+
map[string]string{
399+
"isItAnError": "false",
400+
"spanName": "testSpan",
401+
},
402+
map[string]string{
403+
oTellibDimensionKey: "cloudwatch-lib",
404+
"spanName": "testSpan",
405+
},
406+
"myServiceNS/containerInsightsNeuronMonitorScraper",
407+
containerInsightsReceiver,
408+
},
393409
}
394410

395411
for _, tc := range testCases {

internal/aws/containerinsight/const.go

+6-5
Original file line numberDiff line numberDiff line change
@@ -157,11 +157,12 @@ const (
157157
TypeContainerDiskIO = "ContainerDiskIO"
158158
// Special type for pause container
159159
// because containerd does not set container name pause container name to POD like docker does.
160-
TypeInfraContainer = "InfraContainer"
161-
TypeGpuContainer = "ContainerGPU"
162-
TypeGpuPod = "PodGPU"
163-
TypeGpuNode = "NodeGPU"
164-
TypeGpuCluster = "ClusterGPU"
160+
TypeInfraContainer = "InfraContainer"
161+
TypeGpuContainer = "ContainerGPU"
162+
TypeGpuPod = "PodGPU"
163+
TypeGpuNode = "NodeGPU"
164+
TypeGpuCluster = "ClusterGPU"
165+
TypeNeuronContainer = "ContainerNeuron"
165166

166167
// unit
167168
UnitBytes = "Bytes"

receiver/awscontainerinsightreceiver/internal/cadvisor/extractors/extractorhelpers.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// Copyright The OpenTelemetry Authors
22
// SPDX-License-Identifier: Apache-2.0
33

4-
package extractors
4+
package extractors // import "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver/internal/cadvisor/extractors"
55

66
import (
77
"fmt"
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,9 @@
11
// Copyright The OpenTelemetry Authors
22
// SPDX-License-Identifier: Apache-2.0
33

4-
package gpu
4+
package gpu // import "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver/internal/gpu"
55

66
import (
7-
"context"
8-
"errors"
9-
"fmt"
107
"time"
118

129
configutil "github.com/prometheus/common/config"
@@ -15,14 +12,8 @@ import (
1512
"github.com/prometheus/prometheus/discovery"
1613
"github.com/prometheus/prometheus/discovery/kubernetes"
1714
"github.com/prometheus/prometheus/model/relabel"
18-
"go.opentelemetry.io/collector/component"
19-
"go.opentelemetry.io/collector/consumer"
20-
"go.opentelemetry.io/collector/pdata/pmetric"
21-
"go.opentelemetry.io/collector/receiver"
22-
"go.uber.org/zap"
2315

2416
ci "github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/containerinsight"
25-
"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/prometheusreceiver"
2617
)
2718

2819
const (
@@ -33,77 +24,13 @@ const (
3324
scraperK8sServiceSelector = "k8s-app=dcgm-exporter-service"
3425
)
3526

36-
type DcgmScraper struct {
37-
ctx context.Context
38-
settings component.TelemetrySettings
39-
host component.Host
40-
hostInfoProvider hostInfoProvider
41-
prometheusReceiver receiver.Metrics
42-
k8sDecorator Decorator
43-
running bool
44-
}
45-
46-
type DcgmScraperOpts struct {
47-
Ctx context.Context
48-
TelemetrySettings component.TelemetrySettings
49-
Consumer consumer.Metrics
50-
Host component.Host
51-
HostInfoProvider hostInfoProvider
52-
K8sDecorator Decorator
53-
Logger *zap.Logger
54-
}
55-
5627
type hostInfoProvider interface {
5728
GetClusterName() string
5829
GetInstanceID() string
5930
GetInstanceType() string
6031
}
6132

62-
func NewDcgmScraper(opts DcgmScraperOpts) (*DcgmScraper, error) {
63-
if opts.Consumer == nil {
64-
return nil, errors.New("consumer cannot be nil")
65-
}
66-
if opts.Host == nil {
67-
return nil, errors.New("host cannot be nil")
68-
}
69-
if opts.HostInfoProvider == nil {
70-
return nil, errors.New("cluster name provider cannot be nil")
71-
}
72-
73-
promConfig := prometheusreceiver.Config{
74-
PrometheusConfig: &config.Config{
75-
ScrapeConfigs: []*config.ScrapeConfig{getScraperConfig(opts.HostInfoProvider)},
76-
},
77-
}
78-
79-
params := receiver.CreateSettings{
80-
TelemetrySettings: opts.TelemetrySettings,
81-
}
82-
83-
decoConsumer := decorateConsumer{
84-
containerOrchestrator: ci.EKS,
85-
nextConsumer: opts.Consumer,
86-
k8sDecorator: opts.K8sDecorator,
87-
logger: opts.Logger,
88-
}
89-
90-
promFactory := prometheusreceiver.NewFactory()
91-
promReceiver, err := promFactory.CreateMetricsReceiver(opts.Ctx, params, &promConfig, &decoConsumer)
92-
if err != nil {
93-
return nil, fmt.Errorf("failed to create prometheus receiver: %w", err)
94-
}
95-
96-
return &DcgmScraper{
97-
ctx: opts.Ctx,
98-
settings: opts.TelemetrySettings,
99-
host: opts.Host,
100-
hostInfoProvider: opts.HostInfoProvider,
101-
prometheusReceiver: promReceiver,
102-
k8sDecorator: opts.K8sDecorator,
103-
}, nil
104-
}
105-
106-
func getScraperConfig(hostInfoProvider hostInfoProvider) *config.ScrapeConfig {
33+
func GetScraperConfig(hostInfoProvider hostInfoProvider) *config.ScrapeConfig {
10734
return &config.ScrapeConfig{
10835
HTTPClientConfig: configutil.HTTPClientConfig{
10936
TLSConfig: configutil.TLSConfig{
@@ -209,35 +136,3 @@ func getMetricRelabelConfig(hostInfoProvider hostInfoProvider) []*relabel.Config
209136
},
210137
}
211138
}
212-
213-
func (ds *DcgmScraper) GetMetrics() []pmetric.Metrics {
214-
// This method will never return metrics because the metrics are collected by the scraper.
215-
// This method will ensure the scraper is running
216-
if !ds.running {
217-
ds.settings.Logger.Info("The scraper is not running, starting up the scraper")
218-
err := ds.prometheusReceiver.Start(ds.ctx, ds.host)
219-
if err != nil {
220-
ds.settings.Logger.Error("Unable to start PrometheusReceiver", zap.Error(err))
221-
}
222-
ds.running = err == nil
223-
}
224-
225-
return nil
226-
}
227-
228-
func (ds *DcgmScraper) Shutdown() {
229-
if ds.running {
230-
err := ds.prometheusReceiver.Shutdown(ds.ctx)
231-
if err != nil {
232-
ds.settings.Logger.Error("Unable to shutdown PrometheusReceiver", zap.Error(err))
233-
}
234-
ds.running = false
235-
}
236-
237-
if ds.k8sDecorator != nil {
238-
err := ds.k8sDecorator.Shutdown()
239-
if err != nil {
240-
ds.settings.Logger.Error("Unable to shutdown K8sDecorator", zap.Error(err))
241-
}
242-
}
243-
}

receiver/awscontainerinsightreceiver/internal/gpu/dcgmscraper_test.go

+9-42
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121

2222
ci "github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/containerinsight"
2323
"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver/internal/mocks"
24+
"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver/internal/prometheusscraper"
2425
"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver/internal/stores"
2526
"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/prometheusreceiver"
2627
)
@@ -108,42 +109,6 @@ func (m mockConsumer) ConsumeMetrics(_ context.Context, md pmetric.Metrics) erro
108109
return nil
109110
}
110111

111-
func TestNewDcgmScraperBadInputs(t *testing.T) {
112-
settings := componenttest.NewNopTelemetrySettings()
113-
settings.Logger, _ = zap.NewDevelopment()
114-
115-
tests := []DcgmScraperOpts{
116-
{
117-
Ctx: context.TODO(),
118-
TelemetrySettings: settings,
119-
Consumer: nil,
120-
Host: componenttest.NewNopHost(),
121-
HostInfoProvider: mockHostInfoProvider{},
122-
},
123-
{
124-
Ctx: context.TODO(),
125-
TelemetrySettings: settings,
126-
Consumer: mockConsumer{},
127-
Host: nil,
128-
HostInfoProvider: mockHostInfoProvider{},
129-
},
130-
{
131-
Ctx: context.TODO(),
132-
TelemetrySettings: settings,
133-
Consumer: mockConsumer{},
134-
Host: componenttest.NewNopHost(),
135-
HostInfoProvider: nil,
136-
},
137-
}
138-
139-
for _, tt := range tests {
140-
scraper, err := NewDcgmScraper(tt)
141-
142-
assert.Error(t, err)
143-
assert.Nil(t, scraper)
144-
}
145-
}
146-
147112
func TestNewDcgmScraperEndToEnd(t *testing.T) {
148113
expected := map[string]struct {
149114
value float64
@@ -189,16 +154,17 @@ func TestNewDcgmScraperEndToEnd(t *testing.T) {
189154
settings := componenttest.NewNopTelemetrySettings()
190155
settings.Logger, _ = zap.NewDevelopment()
191156

192-
scraper, err := NewDcgmScraper(DcgmScraperOpts{
157+
scraper, err := prometheusscraper.NewSimplePrometheusScraper(prometheusscraper.SimplePrometheusScraperOpts{
193158
Ctx: context.TODO(),
194159
TelemetrySettings: settings,
195160
Consumer: consumer,
196161
Host: componenttest.NewNopHost(),
197162
HostInfoProvider: mockHostInfoProvider{},
198-
K8sDecorator: mockDecorator{},
163+
ScraperConfigs: GetScraperConfig(mockHostInfoProvider{}),
164+
Logger: settings.Logger,
199165
})
200166
assert.NoError(t, err)
201-
assert.Equal(t, mockHostInfoProvider{}, scraper.hostInfoProvider)
167+
assert.Equal(t, mockHostInfoProvider{}, scraper.HostInfoProvider)
202168

203169
// build up a new PR
204170
promFactory := prometheusreceiver.NewFactory()
@@ -214,7 +180,7 @@ func TestNewDcgmScraperEndToEnd(t *testing.T) {
214180
mp, cfg, err := mocks.SetupMockPrometheus(targets...)
215181
assert.NoError(t, err)
216182

217-
scrapeConfig := getScraperConfig(scraper.hostInfoProvider)
183+
scrapeConfig := scraper.ScraperConfigs
218184
scrapeConfig.ScrapeInterval = cfg.ScrapeConfigs[0].ScrapeInterval
219185
scrapeConfig.ScrapeTimeout = cfg.ScrapeConfigs[0].ScrapeInterval
220186
scrapeConfig.Scheme = "http"
@@ -245,9 +211,10 @@ func TestNewDcgmScraperEndToEnd(t *testing.T) {
245211

246212
// replace the prom receiver
247213
params := receiver.CreateSettings{
248-
TelemetrySettings: scraper.settings,
214+
TelemetrySettings: scraper.Settings,
249215
}
250-
scraper.prometheusReceiver, err = promFactory.CreateMetricsReceiver(scraper.ctx, params, &promConfig, consumer)
216+
scraper.PrometheusReceiver, err = promFactory.CreateMetricsReceiver(scraper.Ctx, params, &promConfig, consumer)
217+
251218
assert.NoError(t, err)
252219
assert.NotNil(t, mp)
253220
defer mp.Close()

0 commit comments

Comments
 (0)