Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

loosen version upgrade checks to allow for lts version upgrades #31079

Merged
merged 2 commits into from
Jan 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion ci/nightly/pipeline.template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -876,6 +876,18 @@ steps:
composition: platform-checks
args: [--scenario=UpgradeEntireMz, "--seed=$BUILDKITE_JOB_ID"]

- id: checks-lts-upgrade
label: "Checks LTS upgrade, whole-Mz restart"
depends_on: build-aarch64
timeout_in_minutes: 180
parallelism: 2
agents:
queue: hetzner-aarch64-16cpu-32gb
plugins:
- ./ci/plugins/mzcompose:
composition: platform-checks
args: [--scenario=UpgradeEntireMzFromLatestLTS, "--seed=$BUILDKITE_JOB_ID"]

- id: checks-preflight-check-rollback
label: "Checks preflight-check and roll back upgrade"
depends_on: build-aarch64
Expand Down Expand Up @@ -1676,7 +1688,7 @@ steps:
plugins:
- ./ci/plugins/mzcompose:
composition: legacy-upgrade
args: ["--versions-source=docs"]
args: ["--versions-source=docs", "--lts-upgrade"]
agents:
queue: hetzner-aarch64-4cpu-8gb

Expand Down
36 changes: 35 additions & 1 deletion misc/python/materialize/checks/scenarios_upgrade.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from materialize.checks.scenarios import Scenario
from materialize.mz_version import MzVersion
from materialize.mzcompose.services.materialized import LEADER_STATUS_HEALTHCHECK
from materialize.version_list import get_published_minor_mz_versions
from materialize.version_list import LTS_VERSIONS, get_published_minor_mz_versions

# late initialization
_minor_versions: list[MzVersion] | None = None
Expand Down Expand Up @@ -83,6 +83,40 @@ def start_mz_read_only(
)


class UpgradeEntireMzFromLatestLTS(Scenario):
"""Upgrade the entire Mz instance from the last LTS version without any intermediate steps. This makes sure our LTS releases for self-managed Materialize stay upgradable."""

def base_version(self) -> MzVersion:
return LTS_VERSIONS[-1]

def actions(self) -> list[Action]:
print(f"Upgrading from tag {self.base_version()}")
return [
StartMz(
self,
tag=self.base_version(),
),
Initialize(self),
Manipulate(self, phase=1),
KillMz(
capture_logs=True
), # We always use True here otherwise docker-compose will lose the pre-upgrade logs
StartMz(
self,
tag=None,
),
Manipulate(self, phase=2),
Validate(self),
# A second restart while already on the new version
KillMz(capture_logs=True),
StartMz(
self,
tag=None,
),
Validate(self),
]


class UpgradeEntireMz(Scenario):
"""Upgrade the entire Mz instance from the last released version."""

Expand Down
5 changes: 5 additions & 0 deletions misc/python/materialize/version_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@

MZ_ROOT = Path(os.environ["MZ_ROOT"])

LTS_VERSIONS = [
MzVersion.parse_mz("v0.130.1"), # v25.1.0
# Put new versions at the bottom
]

# not released on Docker
INVALID_VERSIONS = {
MzVersion.parse_mz("v0.52.1"),
Expand Down
48 changes: 47 additions & 1 deletion src/persist-client/src/cfg.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ use crate::operators::STORAGE_SOURCE_DECODE_FUEL;
use crate::project::OPTIMIZE_IGNORED_DATA_DECODE;
use crate::read::READER_LEASE_DURATION;

const LTS_VERSIONS: &[Version] = &[
// 25.1
Version::new(0, 130, 0),
Copy link
Contributor

@def- def- Jan 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the patch number matter here? (It shouldn't because we might have to bump it a few times)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no, patch version shouldn't matter.

];

/// The tunable knobs for persist.
///
/// Tuning inputs:
Expand Down Expand Up @@ -609,6 +614,10 @@ impl BlobKnobs for PersistConfig {
}
}

pub fn check_data_version(code_version: &Version, data_version: &Version) -> Result<(), String> {
check_data_version_with_lts_versions(code_version, data_version, LTS_VERSIONS)
}

// If persist gets some encoded ProtoState from the future (e.g. two versions of
// code are running simultaneously against the same shard), it might have a
// field that the current code doesn't know about. This would be silently
Expand All @@ -633,7 +642,44 @@ impl BlobKnobs for PersistConfig {
// data we read is going to be because we fetched it using a pointer stored in
// some persist state. If we can handle the state, we can handle the blobs it
// references, too.
pub fn check_data_version(code_version: &Version, data_version: &Version) -> Result<(), String> {
pub(crate) fn check_data_version_with_lts_versions(
code_version: &Version,
data_version: &Version,
lts_versions: &[Version],
) -> Result<(), String> {
// Allow upgrades specifically between consecutive LTS releases.
let base_code_version = Version {
patch: 0,
..code_version.clone()
};
let base_data_version = Version {
patch: 0,
..data_version.clone()
};
if data_version >= code_version {
for window in lts_versions.windows(2) {
if base_code_version == window[0] && base_data_version <= window[1] {
return Ok(());
}
}

if let Some(last) = lts_versions.last() {
bkirwi marked this conversation as resolved.
Show resolved Hide resolved
if base_code_version == *last
// kind of arbitrary, but just ensure we don't accidentally
// upgrade too far (the previous check should ensure that a
// new version won't take over from a too-old previous
// version, but we want to make sure the other side also
// doesn't get confused)
&& base_data_version
.minor
.saturating_sub(base_code_version.minor)
< 40
{
return Ok(());
}
}
}

// Allow one minor version of forward compatibility. We could avoid the
// clone with some nested comparisons of the semver fields, but this code
// isn't particularly performance sensitive and I find this impl easier to
Expand Down
146 changes: 146 additions & 0 deletions src/persist-client/src/internal/encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2061,6 +2061,152 @@ mod tests {
proptest!(|(state in any_state::<u64>(0..3))| testcase(state));
}

#[mz_ore::test]
fn check_data_versions_with_lts_versions() {
#[track_caller]
fn testcase(code: &str, data: &str, lts_versions: &[Version], expected: Result<(), ()>) {
let code = Version::parse(code).unwrap();
let data = Version::parse(data).unwrap();
let actual = cfg::check_data_version_with_lts_versions(&code, &data, lts_versions)
.map_err(|_| ());
assert_eq!(actual, expected);
}

let none = [];
let one = [Version::new(0, 130, 0)];
let two = [Version::new(0, 130, 0), Version::new(0, 140, 0)];
let three = [
Version::new(0, 130, 0),
Version::new(0, 140, 0),
Version::new(0, 150, 0),
];

testcase("0.130.0", "0.128.0", &none, Ok(()));
testcase("0.130.0", "0.129.0", &none, Ok(()));
testcase("0.130.0", "0.130.0", &none, Ok(()));
testcase("0.130.0", "0.130.1", &none, Ok(()));
testcase("0.130.1", "0.130.0", &none, Ok(()));
testcase("0.130.0", "0.131.0", &none, Ok(()));
testcase("0.130.0", "0.132.0", &none, Err(()));

testcase("0.129.0", "0.127.0", &none, Ok(()));
testcase("0.129.0", "0.128.0", &none, Ok(()));
testcase("0.129.0", "0.129.0", &none, Ok(()));
testcase("0.129.0", "0.129.1", &none, Ok(()));
testcase("0.129.1", "0.129.0", &none, Ok(()));
testcase("0.129.0", "0.130.0", &none, Ok(()));
testcase("0.129.0", "0.131.0", &none, Err(()));

testcase("0.130.0", "0.128.0", &one, Ok(()));
testcase("0.130.0", "0.129.0", &one, Ok(()));
testcase("0.130.0", "0.130.0", &one, Ok(()));
testcase("0.130.0", "0.130.1", &one, Ok(()));
testcase("0.130.1", "0.130.0", &one, Ok(()));
testcase("0.130.0", "0.131.0", &one, Ok(()));
testcase("0.130.0", "0.132.0", &one, Ok(()));

testcase("0.129.0", "0.127.0", &one, Ok(()));
testcase("0.129.0", "0.128.0", &one, Ok(()));
testcase("0.129.0", "0.129.0", &one, Ok(()));
testcase("0.129.0", "0.129.1", &one, Ok(()));
testcase("0.129.1", "0.129.0", &one, Ok(()));
testcase("0.129.0", "0.130.0", &one, Ok(()));
testcase("0.129.0", "0.131.0", &one, Err(()));

testcase("0.131.0", "0.129.0", &one, Ok(()));
testcase("0.131.0", "0.130.0", &one, Ok(()));
testcase("0.131.0", "0.131.0", &one, Ok(()));
testcase("0.131.0", "0.131.1", &one, Ok(()));
testcase("0.131.1", "0.131.0", &one, Ok(()));
testcase("0.131.0", "0.132.0", &one, Ok(()));
testcase("0.131.0", "0.133.0", &one, Err(()));

testcase("0.130.0", "0.128.0", &two, Ok(()));
testcase("0.130.0", "0.129.0", &two, Ok(()));
testcase("0.130.0", "0.130.0", &two, Ok(()));
testcase("0.130.0", "0.130.1", &two, Ok(()));
testcase("0.130.1", "0.130.0", &two, Ok(()));
testcase("0.130.0", "0.131.0", &two, Ok(()));
testcase("0.130.0", "0.132.0", &two, Ok(()));
testcase("0.130.0", "0.135.0", &two, Ok(()));
testcase("0.130.0", "0.138.0", &two, Ok(()));
testcase("0.130.0", "0.139.0", &two, Ok(()));
testcase("0.130.0", "0.140.0", &two, Ok(()));
testcase("0.130.9", "0.140.0", &two, Ok(()));
testcase("0.130.0", "0.140.1", &two, Ok(()));
testcase("0.130.3", "0.140.1", &two, Ok(()));
testcase("0.130.3", "0.140.9", &two, Ok(()));
testcase("0.130.0", "0.141.0", &two, Err(()));
testcase("0.129.0", "0.133.0", &two, Err(()));
testcase("0.129.0", "0.140.0", &two, Err(()));
testcase("0.131.0", "0.133.0", &two, Err(()));
testcase("0.131.0", "0.140.0", &two, Err(()));

testcase("0.130.0", "0.128.0", &three, Ok(()));
testcase("0.130.0", "0.129.0", &three, Ok(()));
testcase("0.130.0", "0.130.0", &three, Ok(()));
testcase("0.130.0", "0.130.1", &three, Ok(()));
testcase("0.130.1", "0.130.0", &three, Ok(()));
testcase("0.130.0", "0.131.0", &three, Ok(()));
testcase("0.130.0", "0.132.0", &three, Ok(()));
testcase("0.130.0", "0.135.0", &three, Ok(()));
testcase("0.130.0", "0.138.0", &three, Ok(()));
testcase("0.130.0", "0.139.0", &three, Ok(()));
testcase("0.130.0", "0.140.0", &three, Ok(()));
testcase("0.130.9", "0.140.0", &three, Ok(()));
testcase("0.130.0", "0.140.1", &three, Ok(()));
testcase("0.130.3", "0.140.1", &three, Ok(()));
testcase("0.130.3", "0.140.9", &three, Ok(()));
testcase("0.130.0", "0.141.0", &three, Err(()));
testcase("0.129.0", "0.133.0", &three, Err(()));
testcase("0.129.0", "0.140.0", &three, Err(()));
testcase("0.131.0", "0.133.0", &three, Err(()));
testcase("0.131.0", "0.140.0", &three, Err(()));
testcase("0.130.0", "0.150.0", &three, Err(()));

testcase("0.140.0", "0.138.0", &three, Ok(()));
testcase("0.140.0", "0.139.0", &three, Ok(()));
testcase("0.140.0", "0.140.0", &three, Ok(()));
testcase("0.140.0", "0.140.1", &three, Ok(()));
testcase("0.140.1", "0.140.0", &three, Ok(()));
testcase("0.140.0", "0.141.0", &three, Ok(()));
testcase("0.140.0", "0.142.0", &three, Ok(()));
testcase("0.140.0", "0.145.0", &three, Ok(()));
testcase("0.140.0", "0.148.0", &three, Ok(()));
testcase("0.140.0", "0.149.0", &three, Ok(()));
testcase("0.140.0", "0.150.0", &three, Ok(()));
testcase("0.140.9", "0.150.0", &three, Ok(()));
testcase("0.140.0", "0.150.1", &three, Ok(()));
testcase("0.140.3", "0.150.1", &three, Ok(()));
testcase("0.140.3", "0.150.9", &three, Ok(()));
testcase("0.140.0", "0.151.0", &three, Err(()));
testcase("0.139.0", "0.143.0", &three, Err(()));
testcase("0.139.0", "0.150.0", &three, Err(()));
testcase("0.141.0", "0.143.0", &three, Err(()));
testcase("0.141.0", "0.150.0", &three, Err(()));

testcase("0.150.0", "0.148.0", &three, Ok(()));
testcase("0.150.0", "0.149.0", &three, Ok(()));
testcase("0.150.0", "0.150.0", &three, Ok(()));
testcase("0.150.0", "0.150.1", &three, Ok(()));
testcase("0.150.1", "0.150.0", &three, Ok(()));
testcase("0.150.0", "0.151.0", &three, Ok(()));
testcase("0.150.0", "0.152.0", &three, Ok(()));
testcase("0.150.0", "0.155.0", &three, Ok(()));
testcase("0.150.0", "0.158.0", &three, Ok(()));
testcase("0.150.0", "0.159.0", &three, Ok(()));
testcase("0.150.0", "0.160.0", &three, Ok(()));
testcase("0.150.9", "0.160.0", &three, Ok(()));
testcase("0.150.0", "0.160.1", &three, Ok(()));
testcase("0.150.3", "0.160.1", &three, Ok(()));
testcase("0.150.3", "0.160.9", &three, Ok(()));
testcase("0.150.0", "0.161.0", &three, Ok(()));
testcase("0.149.0", "0.153.0", &three, Err(()));
testcase("0.149.0", "0.160.0", &three, Err(()));
testcase("0.151.0", "0.153.0", &three, Err(()));
testcase("0.151.0", "0.160.0", &three, Err(()));
}

#[mz_ore::test]
fn check_data_versions() {
#[track_caller]
Expand Down
17 changes: 16 additions & 1 deletion test/legacy-upgrade/mzcompose.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from materialize.mzcompose.services.testdrive import Testdrive
from materialize.mzcompose.services.zookeeper import Zookeeper
from materialize.version_list import (
LTS_VERSIONS,
VersionsFromDocs,
get_all_published_mz_versions,
get_published_minor_mz_versions,
Expand Down Expand Up @@ -80,6 +81,7 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None:
help="from what source to fetch the versions",
)
parser.add_argument("--ignore-missing-version", action="store_true")
parser.add_argument("--lts-upgrade", action="store_true")
args = parser.parse_args()

parallelism_index = buildkite.get_parallelism_index()
Expand Down Expand Up @@ -131,6 +133,18 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None:
test_upgrade_from_version(
c, "current_source", priors=[], filter=args.filter, zero_downtime=False
)
if args.lts_upgrade:
# Direct upgrade from latest LTS version without any inbetween versions
version = LTS_VERSIONS[-1]
priors = [v for v in all_versions if v <= version]
test_upgrade_from_version(
c,
f"{version}",
priors,
filter=args.filter,
zero_downtime=False,
lts_upgrade=True,
)


def get_all_and_latest_two_minor_mz_versions(
Expand All @@ -152,6 +166,7 @@ def test_upgrade_from_version(
priors: list[MzVersion],
filter: str,
zero_downtime: bool,
lts_upgrade: bool = False,
) -> None:
print(
f"+++ Testing {'0dt upgrade' if zero_downtime else 'regular upgrade'} from Materialize {from_version} to current_source."
Expand Down Expand Up @@ -241,7 +256,7 @@ def test_upgrade_from_version(
c.kill(mz_service)
c.rm(mz_service, "testdrive")

if from_version != "current_source":
if from_version != "current_source" and not lts_upgrade:
# We can't skip in-between minor versions anymore, so go through all of them
for version in get_published_minor_mz_versions(newest_first=False):
if version <= from_version:
Expand Down
Loading