Skip to content

Commit

Permalink
Basis for scalable cloud/container running
Browse files Browse the repository at this point in the history
  • Loading branch information
gcglinton committed Oct 20, 2023
1 parent dedb8d2 commit 0de3b65
Show file tree
Hide file tree
Showing 5 changed files with 311 additions and 0 deletions.
15 changes: 15 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Paths excluded from the Docker build context (smaller context, faster builds).
cache
.github
.pytest_cache
.vscode
data
debian
# Re-include the changelog from the otherwise-excluded debian/ directory
# (presumably the build reads version information from it — verify).
!debian/changelog
docker
docs
obsolete
s3-publisher
tests
tools
travis
vagrant
25 changes: 25 additions & 0 deletions docker/cloud/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
FROM ubuntu:latest

LABEL org.opencontainers.image.authors="[email protected]"

# TZ + DEBIAN_FRONTEND keep apt from prompting interactively during the build.
# NOTE(review): BUILD_PACKAGES is declared but never referenced below —
# confirm whether it is still needed, or wire it into the apt-get step.
ENV TZ="Etc/UTC" \
    DEBIAN_FRONTEND="noninteractive" \
    BUILD_PACKAGES="build-essential"

# Install runtime dependencies from the distro repos. The apt lists are
# removed in the same layer so the package index does not bloat the image.
RUN apt-get update &&\
    apt-get install -y python3-amqp python3-appdirs python3-dateparser python3-humanfriendly \
        python3-humanize python3-jsonpickle python3-netifaces python3-paramiko python3-pip \
        python3-psutil python3-watchdog python3-magic &&\
    rm -rf /var/lib/apt/lists/*

# need version >= 1.5.1 to get MQTT v5 support, not in repos of 20.04 ... so get from pip.
# --no-cache-dir: do not keep the pip download cache in the image layer.
RUN pip3 install --no-cache-dir paho-mqtt redis python-redis-lock

WORKDIR /src

COPY . /src

RUN python3 setup.py install

WORKDIR /root

# Shell form is required here so $SR3_CONFIG and $SR3_INSTANCE (supplied by
# the compose service environment) are expanded at container start time.
CMD sr3 foreground --config $SR3_CONFIG --logStdout --no $SR3_INSTANCE
35 changes: 35 additions & 0 deletions docker/cloud/dd_all.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# this fetches everything on the server.

# a good first test if you need to validate parameters.
# if the server is working at all this will download a lot.
# recommend using with -n (discard) so that you only see the notices.
#
broker amqps://dd.weather.gc.ca/
topicPrefix v02.post

# instances is the number of downloaders to run at once. Defaults to 1, but likely need more.
# increase if you see high "lag" times in download logs.
instances 1

# expire, in operations should be longer than longest expected interruption.
expire 10m

# subscribe to every topic below topicPrefix ("#" is the multi-level wildcard).
subtopic #

directory /tmp/dd_all

# downloaded files are thrown away; only the notices/logs matter for this test.
discard true
queueName q_anonymous_subscribe.dd_all.40026876.58034545

# Redis-backed retry queue and duplicate suppression — presumably so the
# replicated containers (see docker-compose.yml) share this state; verify.
# NOTE(review): the password embedded in these URLs must match the one passed
# to redis-server in docker-compose.yml — consider injecting it rather than
# hard-coding it in two places.
retry_driver redis
redisqueue_serverurl redis://:SuperSecure@redis:6379/0
nodupe_driver redis
nodupe_redis_serverurl redis://:SuperSecure@redis:6379/0

nodupe_ttl 1000d

#It would help to get metrics out of the instances if we could set the metricsFilename
#Currently it's statically derived from pidFilename (which also isn't configurable)
#metricsFilename /tmp/metrics_subscribe.dd_all.${SR_INSTANCE}

#debug true
29 changes: 29 additions & 0 deletions docker/cloud/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
version: "3.8"

services:
  ddall:
    build: ../../
    deploy:
      mode: replicated
      # Start scaled to zero; scale up explicitly when needed, e.g.:
      #   docker compose up -d --scale ddall=10
      replicas: 0
    environment:
      # {{.Task.Slot}} is a Swarm service template expanded to the replica
      # number, giving each container a distinct sr3 instance number.
      - SR3_INSTANCE={{.Task.Slot}}
      - SR3_CONFIG=subscribe/dd_all.conf
    volumes:
      - ./dd_all.conf:/root/.config/sr3/subscribe/dd_all.conf
      #- ./metrics/subscribe_dd_all_{{.Task.Slot | printf "%02s"}}.metrics:/root/.cache/sr3/subscribe/dd_all/subscribe_dd_all_{{.Task.Slot | printf "%02s"}}.metrics

  redis:
    restart: unless-stopped
    image: redis
    deploy:
      mode: replicated
      replicas: 0
    # NOTE(review): password is hard-coded here and in dd_all.conf — keep the
    # two in sync, or move it to a secret/env file.
    command: redis-server --requirepass SuperSecure
    ports:
      - 16379:6379
    volumes:
      - redis:/data

volumes:
  redis:
207 changes: 207 additions & 0 deletions docker/cloud/telegraf.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
# Telegraf Configuration
#
# Telegraf is entirely plugin driven. All metrics are gathered from the
# declared inputs, and sent to the declared outputs.
#
# Plugins must be declared in here to be active.
# To deactivate a plugin, comment out the name and any variables.
#
# Use 'telegraf -config telegraf.conf -test' to see what metrics a config
# file would generate.
#
# Environment variables can be used anywhere in this config file, simply prepend
# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"),
# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR)


# Global tags can be specified here in key="value" format.
# All entries are commented out: no extra tags are applied to emitted metrics.
[global_tags]
# dc = "us-east-1" # will tag all metrics with dc=us-east-1
# rack = "1a"
## Environment variables can be used as tags, and throughout the config file
# user = "$USER"
# platform = "linux"
# role = "server"


# Configuration for telegraf agent
[agent]
## Default data collection interval for all inputs
interval = "5s"
## Rounds collection interval to 'interval'
## ie, if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true

## Telegraf will send metrics to outputs in batches of at most
## metric_batch_size metrics.
## This controls the size of writes that Telegraf sends to output plugins.
metric_batch_size = 1000

## For failed writes, telegraf will cache metric_buffer_limit metrics for each
## output, and will flush this buffer on a successful write. Oldest metrics
## are dropped first when this buffer fills.
## This buffer only fills when writes fail to output plugin(s).
metric_buffer_limit = 10000

## Collection jitter is used to jitter the collection by a random amount.
## Each plugin will sleep for a random time within jitter before collecting.
## This can be used to avoid many plugins querying things like sysfs at the
## same time, which can have a measurable effect on the system.
collection_jitter = "0s"

## Default flushing interval for all outputs. Maximum flush_interval will be
## flush_interval + flush_jitter
flush_interval = "5s"
## Jitter the flush interval by a random amount. This is primarily to avoid
## large write spikes for users running a large number of telegraf instances.
## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
flush_jitter = "0s"

## By default or when set to "0s", precision will be set to the same
## timestamp order as the collection interval, with the maximum being 1s.
## ie, when interval = "10s", precision will be "1s"
## when interval = "250ms", precision will be "1ms"
## Precision will NOT be used for service inputs. It is up to each individual
## service input to set the timestamp at the appropriate precision.
## Valid time units are "ns", "us" (or "µs"), "ms", "s".
precision = ""

## Logging configuration:
## Run telegraf with debug log messages.
debug = false
## Run telegraf in quiet mode (error log messages only).
quiet = false
## Specify the log file name. The empty string means to log to stderr.
logfile = ""

## Override default hostname, if empty use os.Hostname()
## NOTE(review): forced to "sr3" with the host tag omitted below — presumably
## so per-container hostnames don't fragment the metric series; confirm.
hostname = "sr3"
## If set to true, do not set the "host" tag in the telegraf agent.
omit_hostname = true


###############################################################################
# OUTPUT PLUGINS #
###############################################################################

## Configuration for Graphite server to send metrics to
#[[outputs.graphite]]
# ## TCP endpoint for your graphite instance.
# ## If multiple endpoints are configured, output will be load balanced.
# ## Only one of the endpoints will be written to with each iteration.
# servers = ["swarm.int.thelintons.ca:2003"]
# ## Prefix metrics name
# prefix = "servers.lin."
# ## Graphite output template
# ## see https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md
# template = "host.measurement.tags.field"
#
# ## Enable Graphite tags support
# graphite_tag_support = false
#
# ## timeout in seconds for the write connection to graphite
# timeout = 2
# tagexclude = [ "platform", "role" ]

#[[outputs.http]]
# ## URL is the address to send metrics to
# url = "http://swarm.int.thelintons.ca:2007"
# data_format = "json"


# The only active output: write metrics to stdout — presumably so they are
# visible via the container log stream; confirm before relying on it.
[[outputs.file]]
files = [ "stdout" ]
### data_format = "graphite"
### graphite_tag_support = true
### graphite_tag_sanitize_mode = "compatible"
### #tagexclude = [ "platform", "role" ]

###############################################################################
# INPUT PLUGINS #
###############################################################################

# Collect statistics about itself
[[inputs.internal]]
## If true, collect telegraf memory stats.
# collect_memstats = true
name_override = "telegraf"
tagexclude = [ "version", "go_version" ]

# Parse a complete file each interval
[[inputs.file]]
#alias = "sr3metrics"
#name_suffix = "_sr3metrics"
name_override = "sr3metrics"
## Files to parse each interval. Accept standard unix glob matching rules,
## as well as ** to match recursive files and directories.
## NOTE(review): /home/me/... looks like a developer-machine path; the
## commented compose volume mounts metrics under /root/.cache/sr3/... —
## confirm which path telegraf should actually watch.
files = ["/home/me/metrics/*.metrics"]

## Character encoding to use when interpreting the file contents. Invalid
## characters are replaced using the unicode replacement character. When set
## to the empty string the data is not decoded to text.
## ex: character_encoding = "utf-8"
## character_encoding = "utf-16le"
## character_encoding = "utf-16be"
## character_encoding = ""
# character_encoding = ""

## Name a tag containing the name of the file the data was parsed from. Leave empty
## to disable. Cautious when file name variation is high, this can increase the cardinality
## significantly. Read more about cardinality here:
## https://docs.influxdata.com/influxdb/cloud/reference/glossary/#series-cardinality
## The filename tag feeds the [[processors.regex]] stanzas below.
file_tag = "filename"

## Data format to consume.
## Each data format has its own unique set of configuration options, read
## more about them here:
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
data_format = "json"
## Drop non-numeric / non-metric fields from the sr3 JSON.
## NOTE(review): newer Telegraf releases rename fieldpass/fielddrop to
## fieldinclude/fieldexclude — verify against the deployed version.
fielddrop = ["flow_current_sleep", "flow_last_housekeeping", "flow_next_housekeeping", "flow_stop_requested",
"flow_transferConnected", "flow_transferConnectStart",
"gather.message_connected", "gather.message_disconnectLast", "gather.message_disconnectTime"
]


#data_format = "json_v2"
# [[inputs.file.json_v2]]
# [[inputs.file.json_v2.object]]
# #measurement_name = "sr3_flow"
# excluded_keys = ["current_sleep", "next_housekeeping", "stop_requested", "last_housekeeping", "transferConnected"]
# path = "flow"
# #disable_prepend_keys = true
# [[inputs.file.json_v2.object]]
# #measurement_name = "sr3_gather"
# excluded_keys = ["connected", "disconnectLast", "disconnectTime"]
# path = "gather\\.message"
# #disable_prepend_keys = true



# Derive component/config/instance tags from the metrics filename
# (e.g. subscribe_dd_all_01.metrics -> component=subscribe, config=dd_all,
# instance=01). Each [[processors.regex.tags]] stanza writes a single
# result_key, so the same pattern is applied three times, once per
# captured group.
[[processors.regex]]
#namepass = ["file"]
[[processors.regex.tags]]
key = "filename"
# subscribe_dd_all_01.metrics
pattern = '^(?P<component>cpost|cpump|flow|poll|post|report|sarra|sender|shovel|subscribe|watch|winnow)_(?P<config>.*)_(?P<instance>[0-9]{2})\.metrics$'
#pattern = '^(cpost|cpump|flow|poll|post|report|sarra|sender|shovel|subscribe|watch|winnow)_(.*)_([0-9]{2})\.metrics$'
# group 1: the sr3 component name
replacement = "${1}"
result_key = "component"

[[processors.regex.tags]]
key = "filename"
# subscribe_dd_all_01.metrics
pattern = '^(?P<component>cpost|cpump|flow|poll|post|report|sarra|sender|shovel|subscribe|watch|winnow)_(?P<config>.*)_(?P<instance>[0-9]{2})\.metrics$'
#pattern = '^(cpost|cpump|flow|poll|post|report|sarra|sender|shovel|subscribe|watch|winnow)_(.*)_([0-9]{2})\.metrics$'
# group 2: the config name
replacement = "${2}"
result_key = "config"

[[processors.regex.tags]]
key = "filename"
# subscribe_dd_all_01.metrics
pattern = '^(?P<component>cpost|cpump|flow|poll|post|report|sarra|sender|shovel|subscribe|watch|winnow)_(?P<config>.*)_(?P<instance>[0-9]{2})\.metrics$'
#pattern = '^(cpost|cpump|flow|poll|post|report|sarra|sender|shovel|subscribe|watch|winnow)_(.*)_([0-9]{2})\.metrics$'
# group 3: the two-digit instance number
replacement = "${3}"
result_key = "instance"



0 comments on commit 0de3b65

Please sign in to comment.