Fix network fully occupied by storage-network
albinsun committed Aug 1, 2024
1 parent 478921e commit 854330d
Showing 4 changed files with 96 additions and 74 deletions.
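The fix in a nutshell: instead of handing the whole routed CIDR to the storage-network setting, the tests now increment the prefix length by one, so storage-network claims only half of the network and the other half stays allocatable for VMs and cluster traffic. A minimal sketch of the arithmetic with Python's standard ipaddress module (the /24 route below is hypothetical):

```python
from ipaddress import ip_network

route_cidr = "192.168.100.0/24"  # hypothetical CIDR learned from the VLAN route
ip_addr, mask_bits = route_cidr.split("/")
snet_cidr = f"{ip_addr}/{int(mask_bits) + 1}"  # use half of the network

# A /25 carved out of a /24 leaves the other half free for VMs and the cluster
assert ip_network(snet_cidr).num_addresses == ip_network(route_cidr).num_addresses // 2
print(snet_cidr)  # -> 192.168.100.0/25
```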
harvester_e2e_tests/fixtures/settings.py (49 changes: 31 additions & 18 deletions)
@@ -11,49 +11,62 @@ class SettingChecker:
     def __init__(self):
         self.settings = api_client.settings
 
-    @wait_until(wait_timeout, sleep_timeout)
-    def wait_storage_net_configured(self):
-        code, data = api_client.settings.get('storage-network')
+    def _storage_net_configured(self):
+        code, data = self.settings.get('storage-network')
         if (cs := data.get('status', {}).get('conditions')):
             if 'True' == cs[-1].get('status') and 'Completed' == cs[-1].get('reason'):
                 return True, (code, data)
         return False, (code, data)
 
+    @wait_until(wait_timeout, sleep_timeout)
+    def wait_storage_net_enabled(self):
+        snet_configured, (code, data) = self._storage_net_configured()
+        if snet_configured and data.get('value'):
+            return True, (code, data)
+        return False, (code, data)
+
+    @wait_until(wait_timeout, sleep_timeout)
+    def wait_storage_net_disabled(self):
+        snet_configured, (code, data) = self._storage_net_configured()
+        if snet_configured and not data.get('value'):
+            return True, (code, data)
+        return False, (code, data)
+
     @wait_until(wait_timeout, sleep_timeout)
     def wait_lh_instance_mgr_running(self):
         code, data = api_client.get_pods(namespace='longhorn-system')
         if not (code == 200):
             return False, (code, data)
 
-        lh_im_pods = [p for p in data['data'] if 'instance-manager' in p['id']]
-        for p in lh_im_pods:
-            if 'Running' != p['status']['phase']:
-                return False, (f"Pod {p['id']} is NOT Running", p)
+        lh_instance_mgrs = [pod for pod in data['data'] if 'instance-manager' in pod['id']]
+        for imgr in lh_instance_mgrs:
+            if 'Running' != imgr['status']['phase']:
+                return False, (f"Pod {imgr['id']} is NOT Running", imgr)
 
         return True, (None, None)
 
     @wait_until(wait_timeout, sleep_timeout)
-    def wait_lh_storage_net_ready(self, snet_cidr):
-        ip_range = ip_network(snet_cidr)
+    def wait_lh_storage_net_enabled(self, snet_cidr):
         code, data = api_client.get_pods(namespace='longhorn-system')
         if not (code == 200):
             return False, (code, data)
 
-        lh_im_pods = [p for p in data['data'] if 'instance-manager' in p['id']]
-        for p in lh_im_pods:
-            # has dedicated network
+        lh_instance_mgrs = [pod for pod in data['data'] if 'instance-manager' in pod['id']]
+        for imgr in lh_instance_mgrs:
+            # has storage-network dedicated network
+            nets = json.loads(
+                imgr['metadata']['annotations']['k8s.v1.cni.cncf.io/network-status']
+            )
             try:
-                nets = json.loads(
-                    p['metadata']['annotations']['k8s.v1.cni.cncf.io/network-status']
-                )
                 dedicated_net = next(n for n in nets if 'lhnet1' == n.get('interface'))
             except StopIteration:
-                return False, (f"Pod {p['id']} has NO 'lhnet1' interface", p)
+                return False, (f"Pod {imgr['id']} has NO 'lhnet1' interface", imgr)
 
-            # network IP is in expected range
+            # network IP is in expected CIDR range
+            ip_range = ip_network(snet_cidr)
             dedicated_ips = dedicated_net.get('ips', ['::1'])
             if not all(ip_address(ip) in ip_range for ip in dedicated_ips):
-                return False, (f"Dedicated IPs {dedicated_ips} does NOT fits {ip_range}", p)
+                return False, (f"Dedicated IPs {dedicated_ips} do NOT fit {ip_range}", imgr)
 
         return True, (None, None)

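The wait_until decorator these checkers rely on is defined elsewhere in the fixtures and not shown in this diff. A minimal sketch of the retry semantics it appears to provide, assuming it polls the wrapped checker until the first element of its (done, payload) result is truthy or the timeout elapses:

```python
import time
from functools import wraps

def wait_until(timeout, interval):
    """Retry the wrapped checker until it succeeds or `timeout` elapses.

    The checker returns (done, payload); the last result is returned either
    way, so callers can assert on the failure payload after a timeout.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            deadline = time.time() + timeout
            while True:
                done, payload = func(*args, **kwargs)
                if done or time.time() >= deadline:
                    return done, payload
                time.sleep(interval)
        return wrapper
    return decorator
```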
harvester_e2e_tests/integrations/test_0_storage_network.py (9 changes: 6 additions & 3 deletions)
@@ -126,14 +126,17 @@ def test_storage_network(api_client, cluster_network, vlan_id, unique_name, wait_timeout
         f"API Status({code}): {data}"
     )
     _ = api_client.networks.delete(unique_name)
-    cidr = route['cidr']
 
     # Create storage-network
+    ip_addr, mask_bits = route['cidr'].split("/")
+    mask_bits = str(int(mask_bits) + 1)  # Use half to avoid occupying the whole network
+    snet_cidr = "/".join([ip_addr, mask_bits])
+
     code, data = api_client.settings.get('storage-network')
     assert 200 == code, (code, data)
     origin_spec = api_client.settings.Spec.from_dict(data)
     spec = api_client.settings.StorageNetworkSpec.enable_with(
-        vlan_id, cluster_network, cidr
+        vlan_id, cluster_network, snet_cidr
     )
     code, data = api_client.settings.update('storage-network', spec)
     assert 200 == code, (code, data)
@@ -153,7 +156,7 @@ def test_storage_network(api_client, cluster_network, vlan_id, unique_name, wait_timeout
     )
 
     # Verify Longhorn status
-    done, ip_range = [], ip_network(cidr)
+    done, ip_range = [], ip_network(snet_cidr)
     endtime = datetime.now() + timedelta(seconds=wait_timeout)
     while endtime > datetime.now():
         code, data = api_client.get_pods(namespace='longhorn-system')
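The Longhorn verification above (and the checker in fixtures/settings.py) reads the k8s.v1.cni.cncf.io/network-status annotation that Multus publishes on each instance-manager pod, looks for the lhnet1 interface that Longhorn attaches for the storage network, and confirms its IPs fall inside the configured range. A self-contained sketch against a hypothetical annotation payload:

```python
import json
from ipaddress import ip_address, ip_network

# Hypothetical network-status annotation as Multus would publish it
annotation = json.dumps([
    {"name": "k8s-pod-network", "interface": "eth0", "ips": ["10.52.0.15"]},
    {"name": "harvester-system/storagenetwork-sample", "interface": "lhnet1",
     "ips": ["192.168.100.9"]},
])

nets = json.loads(annotation)
dedicated_net = next(n for n in nets if n.get('interface') == 'lhnet1')

snet_cidr = "192.168.100.0/25"  # hypothetical halved storage-network range
ip_range = ip_network(snet_cidr)
# Defaulting to ['::1'] guarantees a mismatch when the annotation carries no IPs
assert all(ip_address(ip) in ip_range for ip in dedicated_net.get('ips', ['::1']))
```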
harvester_e2e_tests/integrations/test_1_images.py (31 changes: 20 additions & 11 deletions)
@@ -163,31 +163,40 @@ def vlan_cidr(api_client, cluster_network, vlan_id, wait_timeout, sleep_timeout)
 
 @pytest.fixture(scope="class")
 def storage_network(api_client, cluster_network, vlan_id, vlan_cidr, setting_checker):
-    code, data = api_client.settings.get('storage-network')
-    assert 200 == code, (code, data)
-    orig_spec = api_client.settings.Spec.from_dict(data)
+    '''
+    Note:
+    1. The storage-network CIDR is NOT allocatable by other resources.
+    2. Avoid occupying the whole network if the interface is NOT dedicated to
+       storage-network, e.g. it is also used by VMs or the cluster.
+    Ref. https://docs.harvesterhci.io/v1.3/advanced/storagenetwork/#configuration-example
+    '''
+    ip_addr, mask_bits = vlan_cidr.split("/")
+    mask_bits = str(int(mask_bits) + 1)  # Use half to avoid occupying the whole network
+    snet_cidr = "/".join([ip_addr, mask_bits])
 
     enable_spec = api_client.settings.StorageNetworkSpec.enable_with(
-        vlan_id, cluster_network, vlan_cidr
+        vlan_id, cluster_network, snet_cidr
     )
     code, data = api_client.settings.update('storage-network', enable_spec)
     assert 200 == code, (code, data)
 
-    snet_configured, (code, data) = setting_checker.wait_storage_net_configured()
-    assert snet_configured and data.get('value'), (code, data)
+    snet_enabled, (code, data) = setting_checker.wait_storage_net_enabled()
+    assert snet_enabled, (code, data)
     lh_imgr_running, (code, data) = setting_checker.wait_lh_instance_mgr_running()
     assert lh_imgr_running, (code, data)
-    lh_snet_ok, (code, data) = setting_checker.wait_lh_storage_net_ready(vlan_cidr)
-    assert lh_snet_ok, (code, data)
+    lh_snet_enabled, (code, data) = setting_checker.wait_lh_storage_net_enabled(snet_cidr)
+    assert lh_snet_enabled, (code, data)
 
     yield
 
     # Teardown
-    code, data = api_client.settings.update('storage-network', orig_spec)
+    disable_spec = api_client.settings.StorageNetworkSpec.disable()
+    code, data = api_client.settings.update('storage-network', disable_spec)
     assert 200 == code, (code, data)
 
-    snet_configured, (code, data) = setting_checker.wait_storage_net_configured()
-    assert snet_configured, (code, data)
+    snet_disabled, (code, data) = setting_checker.wait_storage_net_disabled()
+    assert snet_disabled, (code, data)
     lh_imgr_running, (code, data) = setting_checker.wait_lh_instance_mgr_running()
     assert lh_imgr_running, (code, data)

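StorageNetworkSpec.enable_with and StorageNetworkSpec.disable are helpers from the suite's API client and are not shown in this diff. Per the Harvester storage-network documentation, the setting value they manage is a small JSON document along these lines (all field values hypothetical):

```python
import json

# Enabled: bind storage-network to a VLAN on a cluster network,
# handing it the halved CIDR as its IP range
enable_value = json.dumps({
    "vlan": 100,                     # hypothetical VLAN ID
    "clusterNetwork": "storage-cn",  # hypothetical cluster network name
    "range": "192.168.100.0/25",     # the halved CIDR from above
})

# Disabled: clearing the setting's value turns the feature off
disable_value = ""
```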
harvester_e2e_tests/integrations/test_3_vm.py (81 changes: 39 additions & 42 deletions)
@@ -147,11 +147,16 @@ def minimal_vm(api_client, gen_unique_name, ubuntu_image, ssh_keypair, vm_checker
 
 @pytest.fixture
 def storage_network(api_client, cluster_network, vm_network, setting_checker):
-    # Restrict subnet range to reduce affection on network
-    # mask bits 26 satisfies 5 nodes, 2 disks per node, and 10 simultaneously uploading
-    # Ref. https://docs.harvesterhci.io/v1.1/advanced/storagenetwork/#configuration-example
+    '''
+    Note:
+    1. The storage-network CIDR is NOT allocatable by other resources.
+    2. Avoid occupying the whole network if the interface is NOT dedicated to
+       storage-network, e.g. it is also used by VMs or the cluster.
+    Ref. https://docs.harvesterhci.io/v1.3/advanced/storagenetwork/#configuration-example
+    '''
     ip_addr, mask_bits = vm_network.cidr.split("/")
-    mask_bits = '26' if mask_bits < '26' else mask_bits
+    mask_bits = str(int(mask_bits) + 1)  # Use half to avoid occupying the whole network
     snet_cidr = "/".join([ip_addr, mask_bits])
 
     yield SimpleNamespace(**{
@@ -168,8 +173,8 @@ def storage_network(api_client, cluster_network, vm_network, setting_checker):
     code, data = api_client.settings.update('storage-network', disable_spec)
     assert 200 == code, (code, data)
 
-    snet_configured, (code, data) = setting_checker.wait_storage_net_configured()
-    assert snet_configured and not data.get('value'), (code, data)
+    snet_disabled, (code, data) = setting_checker.wait_storage_net_disabled()
+    assert snet_disabled, (code, data)
     lh_imgr_running, (code, data) = setting_checker.wait_lh_instance_mgr_running()
     assert lh_imgr_running, (code, data)

Expand Down Expand Up @@ -204,6 +209,8 @@ def test_multiple_migrations(
break
sleep(5)
else:
for vm_name in vm_names:
api_client.vms.delete(vm_name)
raise AssertionError(
f"Can't find VM {vm_name} with {wait_timeout} timed out\n"
f"Got error: {code}, {data}"
@@ -238,6 +245,8 @@ def test_multiple_migrations(
             break
         sleep(5)
     else:
+        for vm_name in vm_names:
+            api_client.vms.delete(vm_name)
         raise AssertionError("\n".join(fails))
 
     # teardown
@@ -270,7 +279,7 @@ def test_multiple_migrations(
 @pytest.mark.p0
 @pytest.mark.virtualmachines
 def test_migrate_vm_with_user_data(
-    api_client, unique_name, ubuntu_image, wait_timeout, available_node_names
+    api_client, unique_name, ubuntu_image, wait_timeout, available_node_names, vm_checker
 ):
     vm_spec = api_client.vms.Spec(1, 1)
     vm_spec.add_image('disk-0', ubuntu_image.id)
@@ -296,6 +305,7 @@ def test_migrate_vm_with_user_data(
             break
         sleep(5)
     else:
+        vm_checker.wait_deleted(unique_name)
         raise AssertionError(
             f"Can't find VM {unique_name} with {wait_timeout} timed out\n"
             f"Got error: {code}, {data}"
@@ -320,24 +330,15 @@ def test_migrate_vm_with_user_data(
             break
         sleep(5)
     else:
+        vm_checker.wait_deleted(unique_name)
         raise AssertionError(
             f"The migration of VM {unique_name} is not completed with {wait_timeout} timed out"
             f"Got error: {code}, {data}"
         )
 
     # teardown
     api_client.vms.delete(unique_name)
-    endtime = datetime.now() + timedelta(seconds=wait_timeout)
-    while endtime > datetime.now():
-        code, data = api_client.vms.get_status(unique_name)
-        if code == 404:
-            break
-        sleep(5)
-    else:
-        raise AssertionError(
-            f"VM {unique_name} can't be deleted with {wait_timeout} timed out"
-            f"Got error: {code}, {data}"
-        )
+    vm_deleted, (code, data) = vm_checker.wait_deleted(unique_name)
+    assert vm_deleted, (code, data)
 
     for vol in api_client.vms.Spec.from_dict(vm_data).volumes:
         if vol['volume'].get('persistentVolumeClaim', {}).get('claimName', "") != "":
@@ -347,10 +348,10 @@ def test_migrate_vm_with_user_data(
 @pytest.mark.p0
 @pytest.mark.virtualmachines
 def test_migrate_vm_with_multiple_volumes(
-    api_client, unique_name, ubuntu_image, wait_timeout, available_node_names
+    api_client, unique_name, ubuntu_image, wait_timeout, available_node_names, vm_checker
 ):
     vm_spec = api_client.vms.Spec(1, 1)
-    vm_spec.add_image('disk-0', ubuntu_image['id'])
+    vm_spec.add_image('disk-0', ubuntu_image.id)
     vm_spec.add_volume('disk-1', 1)
     code, vm_data = api_client.vms.create(unique_name, vm_spec)
     assert code == 201, (
@@ -368,6 +369,7 @@ def test_migrate_vm_with_multiple_volumes(
             break
         sleep(5)
     else:
+        vm_checker.wait_deleted(unique_name)
         raise AssertionError(
             f"Can't find VM {unique_name} with {wait_timeout} timed out\n"
             f"Got error: {code}, {data}"
@@ -392,33 +394,24 @@ def test_migrate_vm_with_multiple_volumes(
             break
         sleep(5)
     else:
+        vm_checker.wait_deleted(unique_name)
         raise AssertionError(
             f"The migration of VM {unique_name} is not completed with {wait_timeout} timed out"
             f"Got error: {code}, {data}"
         )
 
     # teardown
     api_client.vms.delete(unique_name)
-    endtime = datetime.now() + timedelta(seconds=wait_timeout)
-    while endtime > datetime.now():
-        code, data = api_client.vms.get_status(unique_name)
-        if code == 404:
-            break
-        sleep(5)
-    else:
-        raise AssertionError(
-            f"VM {unique_name} can't be deleted with {wait_timeout} timed out"
-            f"Got error: {code}, {data}"
-        )
+    vm_deleted, (code, data) = vm_checker.wait_deleted(unique_name)
+    assert vm_deleted, (code, data)
 
     for vol in api_client.vms.Spec.from_dict(vm_data).volumes:
         if vol['volume'].get('persistentVolumeClaim', {}).get('claimName', "") != "":
             api_client.volumes.delete(vol['volume']['persistentVolumeClaim']['claimName'])
 
 
 @pytest.mark.p0
-@pytest.mark.settings
 @pytest.mark.networks
+@pytest.mark.settings
 @pytest.mark.virtualmachines
 @pytest.mark.skip_version_if("< v1.0.3")
 class TestVMWithStorageNetwork:
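vm_checker is a fixture defined elsewhere in the suite; the hand-rolled deletion loops above are replaced with its wait_deleted helper. A minimal sketch of what such a helper might look like, assuming it polls the VM status endpoint until a 404 confirms deletion or the timeout passes:

```python
from datetime import datetime, timedelta
from time import sleep

class VMChecker:
    def __init__(self, vms, wait_timeout, sleep_timeout=5):
        self.vms = vms                    # API client's VM endpoint wrapper
        self.wait_timeout = wait_timeout
        self.sleep_timeout = sleep_timeout

    def wait_deleted(self, name):
        """Poll until the VM is gone; return (deleted, (code, data))."""
        code, data = None, None
        endtime = datetime.now() + timedelta(seconds=self.wait_timeout)
        while endtime > datetime.now():
            code, data = self.vms.get_status(name)
            if code == 404:
                return True, (code, data)
            sleep(self.sleep_timeout)
        return False, (code, data)
```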
@@ -451,12 +444,14 @@ def test_enable_storage_network_with_api_stopped_vm(
         code, data = api_client.settings.update('storage-network', storage_network.enable_spec)
         assert 200 == code, (code, data)
 
-        snet_configured, (code, data) = setting_checker.wait_storage_net_configured()
-        assert snet_configured and data.get('value'), (code, data)
+        snet_enabled, (code, data) = setting_checker.wait_storage_net_enabled()
+        assert snet_enabled, (code, data)
         lh_imgr_running, (code, data) = setting_checker.wait_lh_instance_mgr_running()
         assert lh_imgr_running, (code, data)
-        lh_snet_ok, (code, data) = setting_checker.wait_lh_storage_net_ready(storage_network.cidr)
-        assert lh_snet_ok, (code, data)
+        lh_snet_enabled, (code, data) = setting_checker.wait_lh_storage_net_enabled(
+            storage_network.cidr
+        )
+        assert lh_snet_enabled, (code, data)
 
     def test_enable_storage_network_with_cli_stopped_vm(
         self, api_client, ssh_keypair, minimal_vm, storage_network, setting_checker,
@@ -501,9 +496,11 @@ def test_enable_storage_network_with_cli_stopped_vm(
         code, data = api_client.settings.update('storage-network', storage_network.enable_spec)
         assert 200 == code, (code, data)
 
-        snet_configured, (code, data) = setting_checker.wait_storage_net_configured()
-        assert snet_configured and data.get('value'), (code, data)
+        snet_enabled, (code, data) = setting_checker.wait_storage_net_enabled()
+        assert snet_enabled, (code, data)
         lh_imgr_running, (code, data) = setting_checker.wait_lh_instance_mgr_running()
         assert lh_imgr_running, (code, data)
-        lh_snet_ok, (code, data) = setting_checker.wait_lh_storage_net_ready(storage_network.cidr)
-        assert lh_snet_ok, (code, data)
+        lh_snet_enabled, (code, data) = setting_checker.wait_lh_storage_net_enabled(
+            storage_network.cidr
+        )
+        assert lh_snet_enabled, (code, data)