diff --git a/doc/source/admin/pci-passthrough.rst b/doc/source/admin/pci-passthrough.rst index efe4915fb11..c8d94172973 100644 --- a/doc/source/admin/pci-passthrough.rst +++ b/doc/source/admin/pci-passthrough.rst @@ -436,6 +436,12 @@ be added to the resource provider representing the matching PCI devices. (Zed) the nova-compute service will refuse to start with such configuration. It is suggested to use the PCI address of the device instead. +.. important:: + While nova supported configuring :oslo.config:option:`pci.alias` where an + alias name is repeated and therefore associated to multiple alias + specifications, such configuration is not supported when PCI tracking in + Placement is enabled. + The nova-compute service makes sure that existing instances with PCI allocations in the nova DB will have a corresponding PCI allocation in placement. This allocation healing also acts on any new instances regardless of @@ -494,7 +500,9 @@ configuration option supports requesting devices by Placement resource class name via the ``resource_class`` field and also support requesting traits to be present on the selected devices via the ``traits`` field in the alias. If the ``resource_class`` field is not specified in the alias then it is defaulted -by nova to ``CUSTOM_PCI__``. +by nova to ``CUSTOM_PCI__``. Either the ``product_id`` +and ``vendor_id`` or the ``resource_class`` field must be provided in each +alias. For deeper technical details please read the `nova specification. `_ diff --git a/nova/api/openstack/wsgi_app.py b/nova/api/openstack/wsgi_app.py index d6f1030f835..648d529e5f1 100644 --- a/nova/api/openstack/wsgi_app.py +++ b/nova/api/openstack/wsgi_app.py @@ -26,6 +26,7 @@ from nova import context from nova import exception from nova import objects +from nova.pci import request from nova import service from nova import utils from nova import version @@ -51,6 +52,11 @@ def _get_config_files(env=None): def _setup_service(host, name): + + # NOTE(gibi): validate the [pci]alias config early to avoid late failures + # at instance creation due to config errors. + request.get_alias_from_config() + try: utils.raise_if_old_compute() except exception.TooOldComputeService as e: diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 61808588fcf..eb209f1da1e 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -1615,6 +1615,10 @@ def init_host(self, service_ref): # if the configuration is wrong. whitelist.Whitelist(CONF.pci.device_spec) + # NOTE(gibi): validate the [pci]alias config early to avoid late + # failures at instance lifecycle operations due to config errors. + pci_req_module.get_alias_from_config() + nova.conf.neutron.register_dynamic_opts(CONF) # Even if only libvirt uses them, make it available for all drivers nova.conf.devices.register_dynamic_opts(CONF) diff --git a/nova/conf/pci.py b/nova/conf/pci.py index ed9e7d4eb00..fd6e4ce012e 100644 --- a/nova/conf/pci.py +++ b/nova/conf/pci.py @@ -70,7 +70,8 @@ alias = { "name": "A16_16A", "device_type": "type-VF", - resource_class: "CUSTOM_A16_16A", + "resource_class": "GPU_VF", + "traits": "blue, big" } Valid key values are : @@ -108,6 +109,8 @@ in the alias is matched against the ``resource_class`` defined in the ``[pci]device_spec``. This field can only be used only if ``[filter_scheduler]pci_in_placement`` is enabled. + Either the product_id and vendor_id or the resource_class field must be + provided in each alias. ``traits`` An optional comma separated list of Placement trait names requested to be diff --git a/nova/pci/request.py b/nova/pci/request.py index fe12bb50fd7..0e61be9a1ea 100644 --- a/nova/pci/request.py +++ b/nova/pci/request.py @@ -37,7 +37,7 @@ These two aliases define a device request meaning: vendor_id is "8086" and product_id is "0442" or "0443". """ - +import functools import typing as ty import jsonschema @@ -121,7 +121,60 @@ } -def _get_alias_from_config() -> Alias: +def _validate_multispec(aliases): + if CONF.filter_scheduler.pci_in_placement: + alias_with_multiple_specs = [ + name for name, spec in aliases.items() if len(spec[1]) > 1] + if alias_with_multiple_specs: + raise exception.PciInvalidAlias( + "The PCI alias(es) %s have multiple specs but " + "[filter_scheduler]pci_in_placement is True. The PCI in " + "Placement feature only supports one spec per alias. You can " + "assign the same resource_class to multiple [pci]device_spec " + "matchers to allow using different devices for the same alias." + % ",".join(alias_with_multiple_specs)) + + +def _validate_required_ids(aliases): + if CONF.filter_scheduler.pci_in_placement: + alias_without_ids_or_rc = set() + for name, alias in aliases.items(): + for spec in alias[1]: + ids = "vendor_id" in spec and "product_id" in spec + rc = "resource_class" in spec + if not ids and not rc: + alias_without_ids_or_rc.add(name) + + if alias_without_ids_or_rc: + raise exception.PciInvalidAlias( + "The PCI alias(es) %s does not have vendor_id and product_id " + "fields set or resource_class field set." + % ",".join(sorted(alias_without_ids_or_rc))) + else: + alias_without_ids = set() + for name, alias in aliases.items(): + for spec in alias[1]: + ids = "vendor_id" in spec and "product_id" in spec + if not ids: + alias_without_ids.add(name) + + if alias_without_ids: + raise exception.PciInvalidAlias( + "The PCI alias(es) %s does not have vendor_id and product_id " + "fields set." + % ",".join(sorted(alias_without_ids))) + + +def _validate_aliases(aliases): + """Checks the parsed aliases for common mistakes and raise easy to parse + error messages + """ + _validate_multispec(aliases) + _validate_required_ids(aliases) + + +@functools.cache +def get_alias_from_config() -> Alias: """Parse and validate PCI aliases from the nova config. :returns: A dictionary where the keys are alias names and the values are @@ -177,6 +230,7 @@ def _get_alias_from_config() -> Alias: except Exception as exc: raise exception.PciInvalidAlias(reason=str(exc)) + _validate_aliases(aliases) return aliases @@ -184,7 +238,7 @@ def _translate_alias_to_requests( alias_spec: str, affinity_policy: ty.Optional[str] = None, ) -> ty.List['objects.InstancePCIRequest']: """Generate complete pci requests from pci aliases in extra_spec.""" - pci_aliases = _get_alias_from_config() + pci_aliases = get_alias_from_config() pci_requests: ty.List[objects.InstancePCIRequest] = [] for name, count in [spec.split(':') for spec in alias_spec.split(',')]: diff --git a/nova/test.py b/nova/test.py index bd2a646141f..02275cdab85 100644 --- a/nova/test.py +++ b/nova/test.py @@ -61,6 +61,7 @@ from nova import exception from nova import objects from nova.objects import base as objects_base +from nova.pci import request from nova import quota from nova.scheduler.client import report from nova.scheduler import utils as scheduler_utils @@ -189,6 +190,10 @@ def setUp(self): self.useFixture( nova_fixtures.PropagateTestCaseIdToChildEventlets(self.id())) + # Ensure that the pci alias is reset between test cases running in + # the same process + request.get_alias_from_config.cache_clear() + # How many of which service we've started. {$service-name: $count} self._service_fixture_count = collections.defaultdict(int) @@ -425,6 +430,10 @@ def flags(self, **kw): group = kw.pop('group', None) for k, v in kw.items(): CONF.set_override(k, v, group) + # loading and validating alias is cached so if it is reconfigured + # we need to reset the cache + if k == 'alias' and group == 'pci': + request.get_alias_from_config.cache_clear() def reset_flags(self, *k, **kw): """Reset flag variables for a test.""" diff --git a/nova/tests/functional/regressions/test_bug_2102038.py b/nova/tests/functional/regressions/test_bug_2102038.py new file mode 100644 index 00000000000..f1f1d13770f --- /dev/null +++ b/nova/tests/functional/regressions/test_bug_2102038.py @@ -0,0 +1,59 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from nova.tests.fixtures import libvirt as fakelibvirt +from nova.tests.functional.api import client +from nova.tests.functional.libvirt import test_pci_in_placement as base + + +class MultipleSpecPerAliasWithPCIInPlacementTest( + base.PlacementPCIReportingTests +): + + def test_alias_with_multiple_specs_not_supported(self): + self.flags(group='filter_scheduler', pci_in_placement=True) + + pci_alias = [ + { + "device_type": "type-VF", + "vendor_id": fakelibvirt.PCI_VEND_ID, + "product_id": "f000", + "name": "a-vf", + }, + { + "device_type": "type-VF", + "vendor_id": fakelibvirt.PCI_VEND_ID, + "product_id": "f001", + "name": "a-vf", + } + ] + self.flags( + group="pci", + alias=self._to_list_of_json_str(pci_alias), + ) + extra_spec = {"pci_passthrough:alias": "a-vf:1"} + flavor_id = self._create_flavor(extra_spec=extra_spec) + + exc = self.assertRaises( + client.OpenStackApiException, + self._create_server, + flavor_id=flavor_id, + networks=[], + ) + self.assertEqual(400, exc.response.status_code) + self.assertIn( + "The PCI alias(es) a-vf have multiple specs but " + "[filter_scheduler]pci_in_placement is True. The PCI in Placement " + "feature only supports one spec per alias. You can assign the " + "same resource_class to multiple [pci]device_spec matchers to " + "allow using different devices for the same alias.", + exc.response.text) diff --git a/nova/tests/functional/regressions/test_bug_2111440.py b/nova/tests/functional/regressions/test_bug_2111440.py new file mode 100644 index 00000000000..44b858724e7 --- /dev/null +++ b/nova/tests/functional/regressions/test_bug_2111440.py @@ -0,0 +1,48 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from nova.tests.functional.api import client +from nova.tests.functional.libvirt import test_pci_in_placement as base + + +class MissingRCAndIdAliasWithPCIInPlacementTest( + base.PlacementPCIReportingTests +): + + def test_alias_without_rc_or_vendor_product_id_is_not_supported(self): + self.flags(group='filter_scheduler', pci_in_placement=True) + + pci_alias = [ + { + "device_type": "type-VF", + "name": "a-vf", + "traits": "foo" + }, + ] + self.flags( + group="pci", + alias=self._to_list_of_json_str(pci_alias), + ) + extra_spec = {"pci_passthrough:alias": "a-vf:1"} + flavor_id = self._create_flavor(extra_spec=extra_spec) + + exc = self.assertRaises( + client.OpenStackApiException, + self._create_server, + flavor_id=flavor_id, + networks=[], + ) + self.assertEqual(400, exc.response.status_code) + self.assertIn( + "The PCI alias(es) a-vf does not have vendor_id and product_id " + "fields set or resource_class field set.", + exc.response.text) diff --git a/nova/tests/unit/api/openstack/test_wsgi_app.py b/nova/tests/unit/api/openstack/test_wsgi_app.py index 7fa1eacc623..b744e4e2694 100644 --- a/nova/tests/unit/api/openstack/test_wsgi_app.py +++ b/nova/tests/unit/api/openstack/test_wsgi_app.py @@ -15,6 +15,7 @@ import fixtures from oslo_config import fixture as config_fixture +from oslo_serialization import jsonutils from oslotest import base from nova.api.openstack import wsgi_app @@ -127,6 +128,14 @@ def test_setup_service_version_workaround(self, mock_check_old, mock_get): group='workarounds') wsgi_app._setup_service('myhost', 'api') + def test_setup_service_pci_alias_validation(self): + wsgi_app.CONF.set_override( + 'alias', jsonutils.dumps({'name': 'foo'}), + group='pci') + self.assertRaises( + exception.PciInvalidAlias, + wsgi_app._setup_service, 'myhost', 'api') + def test__get_config_files_empty_env(self): env = {} result = wsgi_app._get_config_files(env) diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 6206fa5bb35..bd26c4dd22c 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -7025,6 +7025,18 @@ def test_init_host_pci_device_spec_validation_failure(self): self.assertRaises(exception.PciDeviceInvalidDeviceName, self.compute.init_host, None) + def test_init_host_pci_alias_validation_failure(self): + # Tests that we fail init_host if the pci.alias is + # configured incorrectly. + self.flags( + alias=[ + jsonutils.dumps({'name': 'foo'}) + ], + group='pci' + ) + self.assertRaises( + exception.PciInvalidAlias, self.compute.init_host, None) + @mock.patch('nova.compute.manager.ComputeManager._instance_update') def test_error_out_instance_on_exception_not_implemented_err(self, inst_update_mock): diff --git a/nova/tests/unit/pci/test_request.py b/nova/tests/unit/pci/test_request.py index be9c6be877e..f1cfc477f68 100644 --- a/nova/tests/unit/pci/test_request.py +++ b/nova/tests/unit/pci/test_request.py @@ -81,7 +81,7 @@ def setUp(self): def test_get_alias_from_config_valid(self): self.flags(alias=[_fake_alias1], group='pci') - result = request._get_alias_from_config() + result = request.get_alias_from_config() expected_result = ( 'legacy', [{ @@ -103,7 +103,7 @@ def test_get_alias_from_config_valid_multispec(self): }) self.flags(alias=[_fake_alias1, _fake_alias], group='pci') - result = request._get_alias_from_config() + result = request.get_alias_from_config() expected_result = ( 'legacy', [{ @@ -120,11 +120,33 @@ def test_get_alias_from_config_valid_multispec(self): }]) self.assertEqual(expected_result, result['QuickAssist']) + def test_get_alias_from_config_multispec_rejected_pci_in_placement(self): + _fake_alias = jsonutils.dumps({ + "name": "QuickAssist", + "capability_type": "pci", + "product_id": "4444", + "vendor_id": "8086", + "device_type": "type-PCI", + }) + + self.flags(pci_in_placement=True, group='filter_scheduler') + self.flags(alias=[_fake_alias1, _fake_alias], group='pci') + + ex = self.assertRaises( + exception.PciInvalidAlias, request.get_alias_from_config) + self.assertEqual( + "The PCI alias(es) QuickAssist have multiple specs but " + "[filter_scheduler]pci_in_placement is True. The PCI in Placement " + "feature only supports one spec per alias. You can assign the " + "same resource_class to multiple [pci]device_spec matchers to " + "allow using different devices for the same alias.", + str(ex)) + def _test_get_alias_from_config_invalid(self, alias): self.flags(alias=[alias], group='pci') self.assertRaises( exception.PciInvalidAlias, - request._get_alias_from_config) + request.get_alias_from_config) def test_get_alias_from_config_invalid_device_type(self): fake_alias = jsonutils.dumps({ @@ -193,7 +215,7 @@ def test_get_alias_from_config_valid_numa_policy(self): "numa_policy": policy, }) self.flags(alias=[fake_alias], group='pci') - aliases = request._get_alias_from_config() + aliases = request.get_alias_from_config() self.assertIsNotNone(aliases) self.assertIn("xxx", aliases) self.assertEqual(policy, aliases["xxx"][0]) @@ -204,8 +226,9 @@ def test_get_alias_from_config_valid_rc_and_traits(self): "resource_class": "foo", "traits": "bar,baz", }) + self.flags(pci_in_placement=True, group='filter_scheduler') self.flags(alias=[fake_alias], group='pci') - aliases = request._get_alias_from_config() + aliases = request.get_alias_from_config() self.assertIsNotNone(aliases) self.assertIn("xxx", aliases) self.assertEqual( @@ -233,7 +256,7 @@ def test_get_alias_from_config_conflicting_device_type(self): self.flags(alias=[fake_alias_a, fake_alias_b], group='pci') self.assertRaises( exception.PciInvalidAlias, - request._get_alias_from_config) + request.get_alias_from_config) def test_get_alias_from_config_conflicting_numa_policy(self): """Check behavior when numa_policy conflicts occur.""" @@ -254,7 +277,87 @@ def test_get_alias_from_config_conflicting_numa_policy(self): self.flags(alias=[fake_alias_a, fake_alias_b], group='pci') self.assertRaises( exception.PciInvalidAlias, - request._get_alias_from_config) + request.get_alias_from_config) + + def test_get_alias_from_config_missing_ids(self): + a1 = jsonutils.dumps({ + "name": "a1", + "product_id": "4444", + }) + a2 = jsonutils.dumps({ + "name": "a2", + "vendor_id": "4444", + }) + a3 = jsonutils.dumps({ + "name": "a3", + }) + a4 = jsonutils.dumps({ + "name": "a4", + # ignored as PCI in Placement is not enabled + "resource_class": "foo", + }) + a5 = jsonutils.dumps({ + "name": "a5", + "vendor_id": "4444", + "product_id": "4444", + }) + self.flags(alias=[a1, a2, a3, a4, a5], group='pci') + + ex = self.assertRaises( + exception.PciInvalidAlias, request.get_alias_from_config) + self.assertEqual( + "The PCI alias(es) a1,a2,a3,a4 does not have vendor_id and " + "product_id fields set.", + str(ex)) + + def test_get_alias_from_config_missing_ids_or_rc_pci_in_placement(self): + a1 = jsonutils.dumps({ + "name": "a1", + "product_id": "4444", + }) + a2 = jsonutils.dumps({ + "name": "a2", + "vendor_id": "4444", + }) + a3 = jsonutils.dumps({ + "name": "a3", + }) + a4 = jsonutils.dumps({ + "name": "a4", + "resource_class": "foo", + }) + a5 = jsonutils.dumps({ + "name": "a5", + "vendor_id": "4444", + "product_id": "4444", + }) + + self.flags(pci_in_placement=True, group='filter_scheduler') + self.flags(alias=[a1, a2, a3, a4, a5], group='pci') + + ex = self.assertRaises( + exception.PciInvalidAlias, request.get_alias_from_config) + self.assertEqual( + "The PCI alias(es) a1,a2,a3 does not have vendor_id and " + "product_id fields set or resource_class field set.", + str(ex)) + + def test_get_alias_from_config_cached(self): + alias = jsonutils.dumps({ + "name": "a5", + "vendor_id": "4444", + "product_id": "4444", + }) + self.flags(alias=[alias], group='pci') + + origi_loads = jsonutils.loads + + with mock.patch('oslo_serialization.jsonutils.loads') as mock_loads: + mock_loads.side_effect = origi_loads + request.get_alias_from_config() + request.get_alias_from_config() + + mock_loads.assert_called_once() def _verify_result(self, expected, real): exp_real = zip(expected, real) diff --git a/nova/tests/unit/virt/libvirt/test_guest.py b/nova/tests/unit/virt/libvirt/test_guest.py index 3034ff0a9d2..5de5021a65c 100644 --- a/nova/tests/unit/virt/libvirt/test_guest.py +++ b/nova/tests/unit/virt/libvirt/test_guest.py @@ -351,6 +351,20 @@ def test_get_device_by_alias(self): self.assertIsNone(self.guest.get_device_by_alias('nope')) + def test_get_device_by_alias_from_persistent_config(self): + with mock.patch.object(self.guest, 'get_all_devices') as mock_get_all: + mock_get_all.return_value = [] + + self.assertIsNone(self.guest.get_device_by_alias( + 'qemu-disk1', + devtype=vconfig.LibvirtConfigGuestDisk, + from_persistent_config=True, + )) + + mock_get_all.assert_called_once_with( + # check if we're querying the persistent config + vconfig.LibvirtConfigGuestDisk, True) + def test_get_devices(self): xml = """ diff --git a/nova/virt/libvirt/guest.py b/nova/virt/libvirt/guest.py index 065181d89e6..309b266b47d 100644 --- a/nova/virt/libvirt/guest.py +++ b/nova/virt/libvirt/guest.py @@ -434,7 +434,7 @@ def get_all_disks(self): def get_device_by_alias(self, devalias, devtype=None, from_persistent_config=False): - for dev in self.get_all_devices(devtype): + for dev in self.get_all_devices(devtype, from_persistent_config): if hasattr(dev, 'alias') and dev.alias == devalias: return dev