Update TF plan and use history-server rev33
Update the TF plan to integrate grafana-agent and integration-hub via the
logging endpoint.
Update the history server to rev33.

Signed-off-by: Robert Gildein <[email protected]>
rgildein committed Nov 19, 2024
1 parent 95ec4b6 commit 786c769
Showing 9 changed files with 262 additions and 274 deletions.
144 changes: 72 additions & 72 deletions python/tests/integration/test_spark_job.py
@@ -295,75 +295,75 @@ async def test_spark_logforwaring_to_loki(
assert len(query["data"]["result"]) != 0, "no logs was found"


# @pytest.mark.abort_on_fail
# @pytest.mark.asyncio
# async def test_history_server_metrics_in_cos(ops_test: OpsTest, cos):
#     if not cos:
#         pytest.skip("Not possible to test without cos")

#     cos_model_name = cos
#     # Prometheus data is being published by the app
#     assert await all_prometheus_exporters_data(
#         ops_test, check_field="jmx_scrape_duration_seconds", app_name=HISTORY_SERVER
#     )

#     # We should leave time for Prometheus data to be published
#     for attempt in Retrying(stop=stop_after_attempt(5), wait=wait_fixed(30)):
#         with attempt:

#             cos_address = await get_cos_address(ops_test, cos_model_name=cos_model_name)
#             assert published_prometheus_data(
#                 ops_test, cos_model_name, cos_address, "jmx_scrape_duration_seconds"
#             )

#             # Alerts got published to Prometheus
#             alerts_data = published_prometheus_alerts(
#                 ops_test, cos_model_name, cos_address
#             )
#             logger.info(f"Alerts data: {alerts_data}")

#             logger.info("Rules: ")
#             for group in alerts_data["data"]["groups"]:
#                 for rule in group["rules"]:
#                     logger.info(f"Rule: {rule['name']}")
#             logger.info("End of rules.")

#             for alert in [
#                 "Spark History Server Missing",
#                 "Spark History Server Threads Dead Locked",
#             ]:
#                 assert any(
#                     rule["name"] == alert
#                     for group in alerts_data["data"]["groups"]
#                     for rule in group["rules"]
#                 )

#             # Grafana dashboard got published
#             dashboards_info = await published_grafana_dashboards(
#                 ops_test, cos_model_name
#             )
#             logger.info(f"Dashboard info {dashboards_info}")
#             assert any(
#                 board["title"] == "Spark History Server JMX Dashboard"
#                 for board in dashboards_info
#             )

#             # Loki logs are ingested
#             logs = await published_loki_logs(
#                 ops_test,
#                 cos_model_name,
#                 cos_address,
#                 "juju_application",
#                 HISTORY_SERVER,
#             )
#             logger.info(f"Retrieved logs: {logs}")

#             # check for non empty logs
#             assert len(logs) > 0
#             # check if startup messages are there
#             c = 0
#             for timestamp, message in logs.items():
#                 if "INFO HistoryServer" in message:
#                     c = c + 1
#             logger.info(f"Number of line found: {c}")
#             assert c > 0
@pytest.mark.abort_on_fail
@pytest.mark.asyncio
async def test_history_server_metrics_in_cos(ops_test: OpsTest, cos):
    if not cos:
        pytest.skip("Not possible to test without cos")

    cos_model_name = cos
    # Prometheus data is being published by the app
    assert await all_prometheus_exporters_data(
        ops_test, check_field="jmx_scrape_duration_seconds", app_name=HISTORY_SERVER
    )

    # We should leave time for Prometheus data to be published
    for attempt in Retrying(stop=stop_after_attempt(5), wait=wait_fixed(30)):
        with attempt:

            cos_address = await get_cos_address(ops_test, cos_model_name=cos_model_name)
            assert published_prometheus_data(
                ops_test, cos_model_name, cos_address, "jmx_scrape_duration_seconds"
            )

            # Alerts got published to Prometheus
            alerts_data = published_prometheus_alerts(
                ops_test, cos_model_name, cos_address
            )
            logger.info(f"Alerts data: {alerts_data}")

            logger.info("Rules: ")
            for group in alerts_data["data"]["groups"]:
                for rule in group["rules"]:
                    logger.info(f"Rule: {rule['name']}")
            logger.info("End of rules.")

            for alert in [
                "Spark History Server Missing",
                "Spark History Server Threads Dead Locked",
            ]:
                assert any(
                    rule["name"] == alert
                    for group in alerts_data["data"]["groups"]
                    for rule in group["rules"]
                )

            # Grafana dashboard got published
            dashboards_info = await published_grafana_dashboards(
                ops_test, cos_model_name
            )
            logger.info(f"Dashboard info {dashboards_info}")
            assert any(
                board["title"] == "Spark History Server JMX Dashboard"
                for board in dashboards_info
            )

            # Loki logs are ingested
            logs = await published_loki_logs(
                ops_test,
                cos_model_name,
                cos_address,
                "juju_application",
                HISTORY_SERVER,
            )
            logger.info(f"Retrieved logs: {logs}")

            # check for non empty logs
            assert len(logs) > 0
            # check if startup messages are there
            c = 0
            for timestamp, message in logs.items():
                if "INFO HistoryServer" in message:
                    c = c + 1
            logger.info(f"Number of line found: {c}")
            assert c > 0
122 changes: 51 additions & 71 deletions releases/3.4/terraform/base/applications.tf
@@ -2,159 +2,139 @@
# See LICENSE file for licensing details.

resource "juju_application" "history_server" {
name = "history-server"

model = var.model
name = "history-server"
model = var.model

charm {
name = "spark-history-server-k8s"
channel = "3.4/edge"
revision = 30
name = "spark-history-server-k8s"
channel = "3.4/edge"
revision = 33
}

resources = {
spark-history-server-image = 17 # 3.4.2
spark-history-server-image = 17 # 3.4.2
}

units = 1

units = 1
constraints = "arch=amd64"

}

resource "juju_application" "s3" {
name = "s3"

model = var.model
name = "s3"
model = var.model

charm {
name = "s3-integrator"
channel = "latest/edge"
name = "s3-integrator"
channel = "latest/edge"
revision = 17
}

config = {
path = "spark-events"
bucket = var.s3.bucket
endpoint = var.s3.endpoint
path = "spark-events"
bucket = var.s3.bucket
endpoint = var.s3.endpoint
}

units = 1

units = 1
constraints = "arch=amd64"

}


resource "juju_application" "kyuubi" {

name = "kyuubi"

model = var.model
name = "kyuubi"
model = var.model

charm {
name = "kyuubi-k8s"
channel = "latest/edge"
name = "kyuubi-k8s"
channel = "latest/edge"
revision = 27
}

resources = {
kyuubi-image = "ghcr.io/canonical/charmed-spark-kyuubi@sha256:9268d19a6eef91914e874734b320fab64908faf0f7adb8856be809bc60ecd1d0"
kyuubi-image = "ghcr.io/canonical/charmed-spark-kyuubi@sha256:9268d19a6eef91914e874734b320fab64908faf0f7adb8856be809bc60ecd1d0"
}

config = {
namespace = var.model
namespace = var.model
service-account = var.kyuubi_user
}

units = 3
trust = true

units = 3
trust = true
constraints = "arch=amd64"
}

resource "juju_application" "zookeeper" {

name = "zookeeper"

model = var.model
name = "zookeeper"
model = var.model

charm {
name = "zookeeper-k8s"
channel = "3/edge"
name = "zookeeper-k8s"
channel = "3/edge"
revision = 59
}

resources = {
zookeeper-image = 31
}

units = 3
units = 3
constraints = "arch=amd64"
}

resource "juju_application" "kyuubi_users" {
name = "kyuubi-users"

model = var.model
name = "kyuubi-users"
model = var.model

charm {
name = "postgresql-k8s"
channel = "14/stable"
name = "postgresql-k8s"
channel = "14/stable"
revision = 281
}

resources = {
postgresql-image = 159
postgresql-image = 159
}

units = 1
trust = true

units = 1
trust = true
constraints = "arch=amd64"

}

resource "juju_application" "metastore" {
name = "metastore"

model = var.model
name = "metastore"
model = var.model

charm {
name = "postgresql-k8s"
channel = "14/stable"
name = "postgresql-k8s"
channel = "14/stable"
revision = 281
}

resources = {
postgresql-image = 159
postgresql-image = 159
}

units = 1
trust = true

units = 1
trust = true
constraints = "arch=amd64"

}

resource "juju_application" "hub" {
name = "integration-hub"

model = var.model
name = "integration-hub"
model = var.model

charm {
name = "spark-integration-hub-k8s"
channel = "latest/edge"
revision = 20
name = "spark-integration-hub-k8s"
channel = "latest/edge"
revision = 22
}

resources = {
integration-hub-image = 3
integration-hub-image = 3
}

units = 1
trust = true

units = 1
trust = true
constraints = "arch=amd64"

}
}
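
Note: the juju_integration resources that actually relate grafana-agent and the integration hub over the logging endpoint live in other files of this plan; only two of the nine changed files are shown in this excerpt. As a minimal sketch of what such a relation could look like with the Juju Terraform provider (assuming a grafana_agent application resource and the endpoint names "logging" and "logging-provider", none of which appear in this diff):

resource "juju_integration" "hub_logging" {
  model = var.model

  # Requirer side: the integration hub's log-forwarding endpoint (assumed name).
  application {
    name     = juju_application.hub.name
    endpoint = "logging"
  }

  # Provider side: a grafana-agent application assumed to be defined elsewhere in the plan.
  application {
    name     = juju_application.grafana_agent.name
    endpoint = "logging-provider"
  }
}

An analogous block would relate the history server to grafana-agent for log forwarding.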