-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[DPE-4799] Fix upgrade bugs when switching charms #268
Changes from all commits
b1e241b
b7e628e
5589420
4c80b2c
1c914cf
dfdf44a
d2b8b91
2d2cf85
a95f137
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Defer early exit events that may trigger cfg reload and dir creation during upgrade |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,7 +25,7 @@ | |
from charms.tempo_k8s.v2.tracing import TracingEndpointRequirer | ||
from jinja2 import Template | ||
from ops import JujuVersion | ||
from ops.charm import CharmBase | ||
from ops.charm import CharmBase, StartEvent | ||
from ops.main import main | ||
from ops.model import ( | ||
ActiveStatus, | ||
|
@@ -396,11 +396,17 @@ def update_config(self) -> bool: | |
|
||
return True | ||
|
||
def _on_start(self, _) -> None: | ||
def _on_start(self, event: StartEvent) -> None: | ||
"""On Start hook. | ||
|
||
Runs pgbouncer through systemd (configured in src/pgbouncer.service) | ||
""" | ||
# Safeguard against starting while upgrading. | ||
if not self.upgrade.idle: | ||
logger.debug("Defer on_start: Cluster is upgrading") | ||
event.defer() | ||
return | ||
|
||
# Done first to instantiate the snap's private tmp | ||
self.unit.set_workload_version(self.version) | ||
|
||
|
@@ -481,6 +487,10 @@ def _on_update_status(self, _) -> None: | |
Sets BlockedStatus if we have no backend database; if we can't connect to a backend, this | ||
charm serves no purpose. | ||
""" | ||
if not self.upgrade.idle: | ||
logger.debug("Early exit on_update_status: Cluster is upgrading") | ||
return | ||
|
||
Comment on lines
+490
to
+493
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Early exit update status, since it will retrigger eventually. Not strictly necessary, but if upgrade is stuck or failed it will overwrite the status. |
||
self.update_status() | ||
|
||
self.peers.update_leader() | ||
|
@@ -508,6 +518,11 @@ def _on_config_changed(self, event) -> None: | |
Reads charm config values, generates derivative values, writes new pgbouncer config, and | ||
restarts pgbouncer to apply changes. | ||
""" | ||
if not self.upgrade.idle: | ||
logger.debug("Defer on_config_changed: Cluster is upgrading") | ||
event.defer() | ||
return | ||
|
||
old_port = self.peers.app_databag.get("current_port") | ||
port_changed = old_port != str(self.config["listen_port"]) | ||
if port_changed and self._is_exposed: | ||
|
@@ -529,6 +544,7 @@ def _on_config_changed(self, event) -> None: | |
logger.warning("Deferring on_config_changed: cannot set secret label") | ||
event.defer() | ||
return | ||
|
||
if self.backend.postgres: | ||
self.render_prometheus_service() | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -152,7 +152,7 @@ async def test_fail_and_rollback(ops_test, continuous_writes, pgb_charm_jammy) - | |
application = ops_test.model.applications[PGB] | ||
|
||
logger.info("Refresh the charm") | ||
await application.refresh(path=pgb_charm_jammy) | ||
await application.refresh(path=fault_charm) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We weren't using the correct charm and hitting other blocked statuses. |
||
|
||
logger.info("Wait for upgrade to fail") | ||
async with ops_test.fast_forward("60s"): | ||
|
@@ -171,6 +171,12 @@ async def test_fail_and_rollback(ops_test, continuous_writes, pgb_charm_jammy) - | |
logger.info("Re-refresh the charm") | ||
await application.refresh(path=pgb_charm_jammy) | ||
|
||
logger.info("Wait for upgrade to start") | ||
await ops_test.model.block_until( | ||
lambda: "waiting" in {unit.workload_status for unit in application.units}, | ||
timeout=TIMEOUT, | ||
) | ||
|
||
logger.info("Wait for application to recover") | ||
async with ops_test.fast_forward("60s"): | ||
await ops_test.model.wait_for_idle(apps=[PGB], status="active", timeout=TIMEOUT) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
#!/usr/bin/env python3 | ||
# Copyright 2024 Canonical Ltd. | ||
# See LICENSE file for licensing details. | ||
import asyncio | ||
import logging | ||
|
||
import pytest | ||
from pytest_operator.plugin import OpsTest | ||
|
||
from constants import BACKEND_RELATION_NAME | ||
|
||
from .helpers.helpers import ( | ||
PG, | ||
PGB, | ||
) | ||
from .helpers.postgresql_helpers import get_leader_unit | ||
from .relations.pgbouncer_provider.helpers import ( | ||
check_exposed_connection, | ||
fetch_action_get_credentials, | ||
) | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
TIMEOUT = 600 | ||
DATA_INTEGRATOR_APP_NAME = "data-integrator" | ||
|
||
|
||
@pytest.mark.group(1) | ||
@pytest.mark.abort_on_fail | ||
async def test_deploy_stable(ops_test: OpsTest, pgb_charm_jammy) -> None: | ||
"""Simple test to ensure that the PostgreSQL and application charms get deployed.""" | ||
await asyncio.gather( | ||
ops_test.model.deploy( | ||
PG, | ||
num_units=3, | ||
channel="14/edge", | ||
config={"profile": "testing"}, | ||
), | ||
ops_test.model.deploy( | ||
PGB, | ||
channel="1/stable", | ||
num_units=None, | ||
), | ||
ops_test.model.deploy( | ||
DATA_INTEGRATOR_APP_NAME, | ||
num_units=2, | ||
channel="latest/edge", | ||
config={"database-name": "test-database"}, | ||
), | ||
) | ||
logger.info("Wait for applications to become active") | ||
|
||
await ops_test.model.add_relation(f"{PGB}:{BACKEND_RELATION_NAME}", f"{PG}:database") | ||
await ops_test.model.add_relation(DATA_INTEGRATOR_APP_NAME, PGB) | ||
async with ops_test.fast_forward(): | ||
await ops_test.model.wait_for_idle( | ||
apps=[PG, PGB, DATA_INTEGRATOR_APP_NAME], status="active", timeout=1500 | ||
) | ||
assert len(ops_test.model.applications[PG].units) == 3 | ||
assert len(ops_test.model.applications[PGB].units) == 2 | ||
|
||
|
||
@pytest.mark.group(1) | ||
@pytest.mark.abort_on_fail | ||
async def test_pre_upgrade_check(ops_test: OpsTest) -> None: | ||
"""Test that the pre-upgrade-check action runs successfully.""" | ||
logger.info("Get leader unit") | ||
leader_unit = await get_leader_unit(ops_test, PGB) | ||
assert leader_unit is not None, "No leader unit found" | ||
|
||
logger.info("Run pre-upgrade-check action") | ||
action = await leader_unit.run_action("pre-upgrade-check") | ||
await action.wait() | ||
|
||
|
||
@pytest.mark.group(1) | ||
@pytest.mark.abort_on_fail | ||
async def test_upgrade_from_stable(ops_test: OpsTest, pgb_charm_jammy): | ||
"""Test updating from stable channel.""" | ||
credentials = await fetch_action_get_credentials( | ||
ops_test.model.applications[DATA_INTEGRATOR_APP_NAME].units[0] | ||
) | ||
check_exposed_connection(credentials, False) | ||
|
||
application = ops_test.model.applications[PGB] | ||
actions = await application.get_actions() | ||
|
||
logger.info("Refresh the charm") | ||
await application.refresh(path=pgb_charm_jammy) | ||
|
||
logger.info("Wait for upgrade to start") | ||
await ops_test.model.block_until( | ||
lambda: ("waiting" if "pre-upgrade-check" in actions else "maintenance") | ||
in {unit.workload_status for unit in application.units}, | ||
timeout=TIMEOUT, | ||
) | ||
Comment on lines
+91
to
+96
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lambda didn't work for me locally, when upgrading a single unit. |
||
|
||
logger.info("Wait for upgrade to complete") | ||
async with ops_test.fast_forward("60s"): | ||
await ops_test.model.wait_for_idle( | ||
apps=[PGB], status="active", idle_period=30, timeout=TIMEOUT | ||
) | ||
|
||
credentials = await fetch_action_get_credentials( | ||
ops_test.model.applications[DATA_INTEGRATOR_APP_NAME].units[0] | ||
) | ||
check_exposed_connection(credentials, False) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Planned units don't work on subordinate charms
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I do not see PR for LIBPATCH = 18: https://github.com/canonical/data-platform-libs/pulls
You can shoot your leg.
We need subordinate test on dpe-lib... as LIBPATCH=17 should notice this regression.
CC: @marceloneppel : canonical/data-platform-libs#176
Proposal: copy tests from data-integrator to dpe-libs? Yeah, yeah... shared tests...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's make sure the changes work here first. Rollback doesn't seem to work anymore in this PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Started a draft for the lib changes: canonical/data-platform-libs#179
However, there are issues with CI there and it will take me time to fix them.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I executed some tests. All passed, as I commented on canonical/data-platform-libs#179 (review).