From 0de3b6547d60b5a7040f5258fa4a1156b5792333 Mon Sep 17 00:00:00 2001 From: Greg Linton Date: Fri, 20 Oct 2023 13:18:31 -0400 Subject: [PATCH] Basis for scaleable cloud/container running --- .dockerignore | 15 +++ docker/cloud/Dockerfile | 25 ++++ docker/cloud/dd_all.conf | 35 ++++++ docker/cloud/docker-compose.yml | 29 +++++ docker/cloud/telegraf.conf | 207 ++++++++++++++++++++++++++++++++ 5 files changed, 311 insertions(+) create mode 100644 .dockerignore create mode 100644 docker/cloud/Dockerfile create mode 100644 docker/cloud/dd_all.conf create mode 100644 docker/cloud/docker-compose.yml create mode 100644 docker/cloud/telegraf.conf diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..e03b71767 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,15 @@ +cache +.github +.pytest_cache +.vscode +data +debian +!debian/changelog +docker +docs +obsolete +s3-publisher +tests +tools +travis +vagrant diff --git a/docker/cloud/Dockerfile b/docker/cloud/Dockerfile new file mode 100644 index 000000000..71b84c311 --- /dev/null +++ b/docker/cloud/Dockerfile @@ -0,0 +1,25 @@ +FROM ubuntu:latest + +LABEL org.opencontainers.image.authors="Peter.Silva@ssc-spc.gc.ca" + +ENV TZ="Etc/UTC" \ + DEBIAN_FRONTEND="noninteractive" \ + BUILD_PACKAGES="build-essential" + +RUN apt-get update &&\ + apt-get install -y python3-amqp python3-appdirs python3-dateparser python3-humanfriendly \ + python3-humanize python3-jsonpickle python3-netifaces python3-paramiko python3-pip \ + python3-psutil python3-watchdog python3-magic + +# need version >= 1.5.1 to get MQTT v5 support, not in repos of 20.04 ... so get from pip. +RUN pip3 install paho-mqtt redis python-redis-lock + +WORKDIR /src + +COPY . 
/src + +RUN python3 setup.py install + +WORKDIR /root + +CMD sr3 foreground --config $SR3_CONFIG --logStdout --no $SR3_INSTANCE \ No newline at end of file diff --git a/docker/cloud/dd_all.conf b/docker/cloud/dd_all.conf new file mode 100644 index 000000000..62bda2adb --- /dev/null +++ b/docker/cloud/dd_all.conf @@ -0,0 +1,35 @@ +# this fetches everything on the server. + +# a good first test if you need to validate parameters. +# if the server is working at all this will download a lot. +# recommend using with -n (discard) so that you only see the notices. +# +broker amqps://dd.weather.gc.ca/ +topicPrefix v02.post + +#instances is the number of downloaders to run at once. Defaults to 1, but likely need more. +#increase if you see high "lag" times in download logs. +instances 1 + +#expire, in operations should be longer than longest expected interruption. +expire 10m + +subtopic # + +directory /tmp/dd_all + +discard true +queueName q_anonymous_subscribe.dd_all.40026876.58034545 + +retry_driver redis +redisqueue_serverurl redis://:SuperSecure@redis:6379/0 +nodupe_driver redis +nodupe_redis_serverurl redis://:SuperSecure@redis:6379/0 + +nodupe_ttl 1000d + +#It would help to get metrics out of the instances if we could set the metricsFilename +#Currently it's statically derived from pidFilename (which also isn't configurable) +#metricsFilename /tmp/metrics_subscribe.dd_all.${SR_INSTANCE} + +#debug true diff --git a/docker/cloud/docker-compose.yml b/docker/cloud/docker-compose.yml new file mode 100644 index 000000000..bed2f4155 --- /dev/null +++ b/docker/cloud/docker-compose.yml @@ -0,0 +1,29 @@ +version: "3.8" + +services: + ddall: + build: ../../ + deploy: + mode: replicated + replicas: 0 + environment: + - SR3_INSTANCE={{.Task.Slot}} + - SR3_CONFIG=subscribe/dd_all.conf + volumes: + - ./dd_all.conf:/root/.config/sr3/subscribe/dd_all.conf + #- ./metrics/subscribe_dd_all_{{.Task.Slot | printf 
"%02s"}}.metrics:/root/.cache/sr3/subscribe/dd_all/subscribe_dd_all_{{.Task.Slot | printf "%02s"}}.metrics + + redis: + restart: unless-stopped + image: redis + deploy: + mode: replicated + replicas: 0 + command: redis-server --requirepass SuperSecure + ports: + - 16379:6379 + volumes: + - redis:/data + +volumes: + redis: \ No newline at end of file diff --git a/docker/cloud/telegraf.conf b/docker/cloud/telegraf.conf new file mode 100644 index 000000000..143b2009b --- /dev/null +++ b/docker/cloud/telegraf.conf @@ -0,0 +1,207 @@ +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply prepend +# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), +# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) + + +# Global tags can be specified here in key="value" format. +[global_tags] + # dc = "us-east-1" # will tag all metrics with dc=us-east-1 + # rack = "1a" + ## Environment variables can be used as tags, and throughout the config file + # user = "$USER" +# platform = "linux" +# role = "server" + + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "5s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. 
+ metric_batch_size = 1000 + + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + ## This buffer only fills when writes fail to output plugin(s). + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. Maximum flush_interval will be + ## flush_interval + flush_jitter + flush_interval = "5s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Logging configuration: + ## Run telegraf with debug log messages. + debug = false + ## Run telegraf in quiet mode (error log messages only). + quiet = false + ## Specify the log file name. The empty string means to log to stderr. + logfile = "" + + ## Override default hostname, if empty use os.Hostname() + hostname = "sr3" + ## If set to true, do not set the "host" tag in the telegraf agent. 
+ omit_hostname = true + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +## Configuration for Graphite server to send metrics to +#[[outputs.graphite]] +# ## TCP endpoint for your graphite instance. +# ## If multiple endpoints are configured, output will be load balanced. +# ## Only one of the endpoints will be written to with each iteration. +# servers = ["swarm.int.thelintons.ca:2003"] +# ## Prefix metrics name +# prefix = "servers.lin." +# ## Graphite output template +# ## see https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# template = "host.measurement.tags.field" +# +# ## Enable Graphite tags support +# graphite_tag_support = false +# +# ## timeout in seconds for the write connection to graphite +# timeout = 2 +# tagexclude = [ "platform", "role" ] + +#[[outputs.http]] +# ## URL is the address to send metrics to +# url = "http://swarm.int.thelintons.ca:2007" +# data_format = "json" + + +[[outputs.file]] + files = [ "stdout" ] +### data_format = "graphite" +### graphite_tag_support = true +### graphite_tag_sanitize_mode = "compatible" +### #tagexclude = [ "platform", "role" ] + +############################################################################### +# INPUT PLUGINS # +############################################################################### + +# Collect statistics about itself +[[inputs.internal]] + ## If true, collect telegraf memory stats. + # collect_memstats = true + name_override = "telegraf" + tagexclude = [ "version", "go_version" ] + +# Parse a complete file each interval +[[inputs.file]] + #alias = "sr3metrics" + #name_suffix = "_sr3metrics" + name_override = "sr3metrics" + ## Files to parse each interval. Accept standard unix glob matching rules, + ## as well as ** to match recursive files and directories. 
+ files = ["/home/me/metrics/*.metrics"] + + ## Character encoding to use when interpreting the file contents. Invalid + ## characters are replaced using the unicode replacement character. When set + ## to the empty string the data is not decoded to text. + ## ex: character_encoding = "utf-8" + ## character_encoding = "utf-16le" + ## character_encoding = "utf-16be" + ## character_encoding = "" + # character_encoding = "" + + ## Name a tag containing the name of the file the data was parsed from. Leave empty + ## to disable. Cautious when file name variation is high, this can increase the cardinality + ## significantly. Read more about cardinality here: + ## https://docs.influxdata.com/influxdb/cloud/reference/glossary/#series-cardinality + file_tag = "filename" + + ## Data format to consume. + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "json" + fielddrop = ["flow_current_sleep", "flow_last_housekeeping", "flow_next_housekeeping", "flow_stop_requested", + "flow_transferConnected", "flow_transferConnectStart", + "gather.message_connected", "gather.message_disconnectLast", "gather.message_disconnectTime" + ] + + + #data_format = "json_v2" + # [[inputs.file.json_v2]] + # [[inputs.file.json_v2.object]] + # #measurement_name = "sr3_flow" + # excluded_keys = ["current_sleep", "next_housekeeping", "stop_requested", "last_housekeeping", "transferConnected"] + # path = "flow" + # #disable_prepend_keys = true + # [[inputs.file.json_v2.object]] + # #measurement_name = "sr3_gather" + # excluded_keys = ["connected", "disconnectLast", "disconnectTime"] + # path = "gather\\.message" + # #disable_prepend_keys = true + + + +[[processors.regex]] + #namepass = ["file"] + [[processors.regex.tags]] + key = "filename" + # subscribe_dd_all_01.metrics + pattern = 
'^(?P<component>cpost|cpump|flow|poll|post|report|sarra|sender|shovel|subscribe|watch|winnow)_(?P<config>.*)_(?P<instance>[0-9]{2})\.metrics$' + #pattern = '^(cpost|cpump|flow|poll|post|report|sarra|sender|shovel|subscribe|watch|winnow)_(.*)_([0-9]{2})\.metrics$' + replacement = "${1}" + result_key = "component" + + [[processors.regex.tags]] + key = "filename" + # subscribe_dd_all_01.metrics + pattern = '^(?P<component>cpost|cpump|flow|poll|post|report|sarra|sender|shovel|subscribe|watch|winnow)_(?P<config>.*)_(?P<instance>[0-9]{2})\.metrics$' + #pattern = '^(cpost|cpump|flow|poll|post|report|sarra|sender|shovel|subscribe|watch|winnow)_(.*)_([0-9]{2})\.metrics$' + replacement = "${2}" + result_key = "config" + + [[processors.regex.tags]] + key = "filename" + # subscribe_dd_all_01.metrics + pattern = '^(?P<component>cpost|cpump|flow|poll|post|report|sarra|sender|shovel|subscribe|watch|winnow)_(?P<config>.*)_(?P<instance>[0-9]{2})\.metrics$' + #pattern = '^(cpost|cpump|flow|poll|post|report|sarra|sender|shovel|subscribe|watch|winnow)_(.*)_([0-9]{2})\.metrics$' + replacement = "${3}" + result_key = "instance" + + +