Skip to content

Commit

Permalink
Rebase GPU preview branch (#1330)
Browse files Browse the repository at this point in the history
  • Loading branch information
LujieDuan authored Aug 15, 2023
1 parent 3da5be0 commit 3136314
Show file tree
Hide file tree
Showing 1,843 changed files with 169,441 additions and 16 deletions.
70 changes: 70 additions & 0 deletions apps/dcgm.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package apps

import (
"context"

"github.com/GoogleCloudPlatform/ops-agent/confgenerator"
"github.com/GoogleCloudPlatform/ops-agent/confgenerator/otel"
"github.com/GoogleCloudPlatform/ops-agent/internal/platform"
)

type MetricsReceiverDcgm struct {
confgenerator.ConfigComponent `yaml:",inline"`
confgenerator.MetricsReceiverShared `yaml:",inline"`

Endpoint string `yaml:"endpoint" validate:"omitempty,hostname_port"`
}

const defaultDcgmEndpoint = "localhost:5555"

func (r MetricsReceiverDcgm) Type() string {
return "dcgm"
}

func (r MetricsReceiverDcgm) Pipelines(_ context.Context) []otel.ReceiverPipeline {
if r.Endpoint == "" {
r.Endpoint = defaultDcgmEndpoint
}

return []otel.ReceiverPipeline{{
Receiver: otel.Component{
Type: "dcgm",
Config: map[string]interface{}{
"collection_interval": r.CollectionIntervalString(),
"endpoint": r.Endpoint,
"metrics": map[string]interface{}{
"dcgm.gpu.utilization": map[string]bool{
"enabled": false,
},
"dcgm.gpu.memory.bytes_used": map[string]bool{
"enabled": false,
},
},
},
},
Processors: map[string][]otel.Component{"metrics": {
otel.MetricsTransform(
otel.AddPrefix("workload.googleapis.com"),
),
otel.ModifyInstrumentationScope(r.Type(), "1.0"),
}},
}}
}

func init() {
confgenerator.MetricsReceiverTypes.RegisterType(func() confgenerator.MetricsReceiver { return &MetricsReceiverDcgm{} }, platform.Linux)
}
41 changes: 40 additions & 1 deletion apps/hostmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ func (r MetricsReceiverHostmetrics) Pipelines(ctx context.Context) []otel.Receiv
)
}
transforms = append(transforms, otel.AddPrefix("agent.googleapis.com"))
return []otel.ReceiverPipeline{{
pipelines := []otel.ReceiverPipeline{{
Receiver: otel.Component{
Type: "hostmetrics",
Config: map[string]interface{}{
Expand Down Expand Up @@ -326,6 +326,45 @@ func (r MetricsReceiverHostmetrics) Pipelines(ctx context.Context) []otel.Receiv
otel.MetricsTransform(transforms...),
}},
}}

if p.HasNvidiaGpu {
pipelines = append(pipelines, otel.ReceiverPipeline{
Receiver: otel.Component{
Type: "nvml",
Config: map[string]interface{}{
"collection_interval": r.CollectionIntervalString(),
},
},
ExporterTypes: map[string]otel.ExporterType{
"metrics": otel.System,
},
Processors: map[string][]otel.Component{"metrics": {
otel.MetricsTransform(
otel.RenameMetric(
"nvml.gpu.utilization",
"gpu/utilization",
otel.ScaleValue(100),
),
otel.RenameMetric(
"nvml.gpu.memory.bytes_used",
"gpu/memory/bytes_used",
),
otel.RenameMetric(
"nvml.gpu.processes.utilization",
"gpu/processes/utilization",
otel.ScaleValue(100),
),
otel.RenameMetric(
"nvml.gpu.processes.max_bytes_used",
"gpu/processes/max_bytes_used",
),
otel.AddPrefix("agent.googleapis.com"),
),
}},
})
}

return pipelines
}

func init() {
Expand Down
14 changes: 14 additions & 0 deletions confgenerator/confgenerator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,20 @@ var (
},
},
},
{
name: "linux-gpu",
defaultLogsDir: "/var/log/google-cloud-ops-agent",
defaultStateDir: "/var/lib/google-cloud-ops-agent/fluent-bit",
platform: platform.Platform{
Type: platform.Linux,
HostInfo: &host.InfoStat{
OS: "linux",
Platform: "linux_platform",
PlatformVersion: "linux_platform_version",
},
HasNvidiaGpu: true,
},
},
{
name: "windows",
defaultLogsDir: `C:\ProgramData\Google\Cloud Operations\Ops Agent\log`,
Expand Down
1 change: 1 addition & 0 deletions confgenerator/testdata/feature/golden.csv
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ App,Field,Override,
*apps.MetricsReceiverCassandra,confgenerator.MetricsReceiverSharedCollectJVM.CollectJVMMetrics,
*apps.MetricsReceiverCouchbase,confgenerator.ConfigComponent.Type,
*apps.MetricsReceiverCouchdb,confgenerator.ConfigComponent.Type,
*apps.MetricsReceiverDcgm,confgenerator.ConfigComponent.Type,
*apps.MetricsReceiverElasticsearch,confgenerator.ConfigComponent.Type,
*apps.MetricsReceiverElasticsearch,confgenerator.MetricsReceiverSharedCluster.CollectClusterMetrics,
*apps.MetricsReceiverElasticsearch,confgenerator.MetricsReceiverSharedCollectJVM.CollectJVMMetrics,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@

function process(tag, timestamp, record)
local __field_0 = (function()
return record["severity"]
end)();
(function(value)
record["severity"] = value
end)(nil);
local v = __field_0;
if v == "debug" then v = "DEBUG"
elseif v == "error" then v = "ERROR"
elseif v == "info" then v = "INFO"
elseif v == "warn" then v = "WARNING"
end
(function(value)
record["logging.googleapis.com/severity"] = value
end)(v)
return 2, timestamp, record
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@

function shallow_merge(record, parsedRecord)
-- If no exiting record exists
if (record == nil) then
return parsedRecord
end

for k, v in pairs(parsedRecord) do
record[k] = v
end

return record
end

function merge(record, parsedRecord)
-- If no exiting record exists
if record == nil then
return parsedRecord
end

-- Potentially overwrite or merge the original records.
for k, v in pairs(parsedRecord) do
-- If there is no conflict
if k == "logging.googleapis.com/logName" then
-- Ignore the parsed payload since the logName is controlled
-- by the OpsAgent.
elseif k == "logging.googleapis.com/labels" then
-- LogEntry.labels are basically a map[string]string and so only require a
-- shallow merge (one level deep merge).
record[k] = shallow_merge(record[k], v)
else
record[k] = v
end
end

return record
end

function parser_merge_record(tag, timestamp, record)
originalPayload = record["logging.googleapis.com/__tmp"]
if originalPayload == nil then
return 0, timestamp, record
end

-- Remove original payload
record["logging.googleapis.com/__tmp"] = nil
record = merge(originalPayload, record)
return 2, timestamp, record
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@

function process(tag, timestamp, record)
local __field_0 = (function()
return record["message"]
end)();
local v = __field_0;
if v == "LogParseErr" then v = "Ops Agent failed to parse logs, Code: LogParseErr, Documentation: https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/troubleshoot-find-info"
elseif v == "LogPipelineErr" then v = "Ops Agent logging pipeline failed, Code: LogPipelineErr, Documentation: https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/troubleshoot-find-info"
end
(function(value)
record["message"] = value
end)(v)
local v = "ops-agent";
(function(value)
if record["logging.googleapis.com/labels"] == nil
then
record["logging.googleapis.com/labels"] = {}
end
record["logging.googleapis.com/labels"]["agent.googleapis.com/health/agentKind"] = value
end)(v)
local v = "latest";
(function(value)
if record["logging.googleapis.com/labels"] == nil
then
record["logging.googleapis.com/labels"] = {}
end
record["logging.googleapis.com/labels"]["agent.googleapis.com/health/agentVersion"] = value
end)(v)
local v = "v1";
(function(value)
if record["logging.googleapis.com/labels"] == nil
then
record["logging.googleapis.com/labels"] = {}
end
record["logging.googleapis.com/labels"]["agent.googleapis.com/health/schemaVersion"] = value
end)(v)
return 2, timestamp, record
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@

function parser_nest(tag, timestamp, record)
local nestedRecord = {}
local parseKey = "message"
for k, v in pairs(record) do
if k ~= parseKey then
nestedRecord[k] = v
end
end

local result = {}
result[parseKey] = record[parseKey]
result["logging.googleapis.com/__tmp"] = nestedRecord

return 2, timestamp, result
end

Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@

function process(tag, timestamp, record)
local __field_0 = (function()
return record["agent.googleapis.com/log_file_path"]
end)();
local __field_1 = (function()
if record["logging.googleapis.com/labels"] == nil
then
return nil
end
return record["logging.googleapis.com/labels"]["compute.googleapis.com/resource_name"]
end)();
local __field_2 = (function()
return record["logging.googleapis.com/logName"]
end)();
(function(value)
record["agent.googleapis.com/log_file_path"] = value
end)(nil);
local v = __field_0;
(function(value)
if record["logging.googleapis.com/labels"] == nil
then
record["logging.googleapis.com/labels"] = {}
end
record["logging.googleapis.com/labels"]["agent.googleapis.com/log_file_path"] = value
end)(v)
local v = __field_1;
if v == nil then v = "" end;
(function(value)
if record["logging.googleapis.com/labels"] == nil
then
record["logging.googleapis.com/labels"] = {}
end
record["logging.googleapis.com/labels"]["compute.googleapis.com/resource_name"] = value
end)(v)
local v = __field_2;
if v == nil then v = "syslog" end;
(function(value)
record["logging.googleapis.com/logName"] = value
end)(v)
return 2, timestamp, record
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
- module: logging
feature: service:pipelines
key: default_pipeline_overridden
value: "true"
- module: metrics
feature: service:pipelines
key: default_pipeline_overridden
value: "true"
- module: metrics
feature: receivers:hostmetrics
key: "[0].enabled"
value: "true"
- module: logging
feature: receivers:files
key: "[0].enabled"
value: "true"
- module: logging
feature: receivers:files
key: "[0].include_paths.__length"
value: "2"
Loading

0 comments on commit 3136314

Please sign in to comment.