From 982b261b87f8d956f6df7506bcfc209ac2fdc665 Mon Sep 17 00:00:00 2001
From: Jay Vosburgh
Date: Fri, 1 Nov 2019 21:56:42 -0700
Subject: [PATCH 001/155] bonding: fix state transition issue in link monitoring

[ Upstream commit 1899bb325149e481de31a4f32b59ea6f24e176ea ]

Since de77ecd4ef02 ("bonding: improve link-status update in
mii-monitoring"), the bonding driver has utilized two separate variables
to indicate the next link state a particular slave should transition to.
Each is used to communicate to a different portion of the link state
change commit logic; one to the bond_miimon_commit function itself, and
another to the state transition logic.

Unfortunately, the two variables can become unsynchronized, resulting in
incorrect link state transitions within bonding. This can cause slaves
to become stuck in an incorrect link state until a subsequent carrier
state transition.

The issue occurs when a special case in bond_slave_netdev_event sets
slave->link directly to BOND_LINK_FAIL. On the next pass through
bond_miimon_inspect after the slave goes carrier up, the BOND_LINK_FAIL
case will set the proposed next state (link_new_state) to BOND_LINK_UP,
but the new_link to BOND_LINK_DOWN. The setting of the final link state
from new_link comes after that from link_new_state, and so the slave
will end up incorrectly in _DOWN state.

Resolve this by combining the two variables into one.

Reported-by: Aleksei Zakharov
Reported-by: Sha Zhang
Cc: Mahesh Bandewar
Fixes: de77ecd4ef02 ("bonding: improve link-status update in mii-monitoring")
Signed-off-by: Jay Vosburgh
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 drivers/net/bonding/bond_main.c | 41 ++++++++++++++++-----------------
 include/net/bonding.h           |  3 +--
 2 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 1cc4c99aa504b7..cf8385a22de59b 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2055,8 +2055,7 @@ static int bond_miimon_inspect(struct bonding *bond)
 	ignore_updelay = !rcu_dereference(bond->curr_active_slave);

 	bond_for_each_slave_rcu(bond, slave, iter) {
-		slave->new_link = BOND_LINK_NOCHANGE;
-		slave->link_new_state = slave->link;
+		bond_propose_link_state(slave, BOND_LINK_NOCHANGE);

 		link_state = bond_check_dev_link(bond, slave->dev, 0);

@@ -2092,7 +2091,7 @@
 			}

 			if (slave->delay <= 0) {
-				slave->new_link = BOND_LINK_DOWN;
+				bond_propose_link_state(slave, BOND_LINK_DOWN);
 				commit++;
 				continue;
 			}
@@ -2131,7 +2130,7 @@
 				slave->delay = 0;

 			if (slave->delay <= 0) {
-				slave->new_link = BOND_LINK_UP;
+				bond_propose_link_state(slave, BOND_LINK_UP);
 				commit++;
 				ignore_updelay = false;
 				continue;
@@ -2151,7 +2150,7 @@ static void bond_miimon_commit(struct bonding *bond)
 	struct slave *slave, *primary;

 	bond_for_each_slave(bond, slave, iter) {
-		switch (slave->new_link) {
+		switch (slave->link_new_state) {
 		case BOND_LINK_NOCHANGE:
 			/* For 802.3ad mode, check current slave speed and
 			 * duplex again in case its port was disabled after
@@ -2244,8 +2243,8 @@ static void bond_miimon_commit(struct bonding *bond)

 		default:
 			netdev_err(bond->dev, "invalid new link %d on slave %s\n",
-				   slave->new_link, slave->dev->name);
-			slave->new_link = BOND_LINK_NOCHANGE;
+				   slave->link_new_state, slave->dev->name);
+			bond_propose_link_state(slave, BOND_LINK_NOCHANGE);

 			continue;
 		}
@@ -2644,13 +2643,13 @@ static void bond_loadbalance_arp_mon(struct bonding *bond)
 	bond_for_each_slave_rcu(bond, slave, iter) {
 		unsigned long trans_start = dev_trans_start(slave->dev);

-		slave->new_link = BOND_LINK_NOCHANGE;
+		bond_propose_link_state(slave, BOND_LINK_NOCHANGE);

 		if (slave->link != BOND_LINK_UP) {
 			if (bond_time_in_interval(bond, trans_start, 1) &&
 			    bond_time_in_interval(bond, slave->last_rx, 1)) {

-				slave->new_link = BOND_LINK_UP;
+				bond_propose_link_state(slave, BOND_LINK_UP);
 				slave_state_changed = 1;

 				/* primary_slave has no meaning in round-robin
@@ -2677,7 +2676,7 @@ static void bond_loadbalance_arp_mon(struct bonding *bond)
 			if (!bond_time_in_interval(bond, trans_start, 2) ||
 			    !bond_time_in_interval(bond, slave->last_rx, 2)) {

-				slave->new_link = BOND_LINK_DOWN;
+				bond_propose_link_state(slave, BOND_LINK_DOWN);
 				slave_state_changed = 1;

 				if (slave->link_failure_count < UINT_MAX)
@@ -2709,8 +2708,8 @@ static void bond_loadbalance_arp_mon(struct bonding *bond)
 			goto re_arm;

 		bond_for_each_slave(bond, slave, iter) {
-			if (slave->new_link != BOND_LINK_NOCHANGE)
-				slave->link = slave->new_link;
+			if (slave->link_new_state != BOND_LINK_NOCHANGE)
+				slave->link = slave->link_new_state;
 		}

 		if (slave_state_changed) {
@@ -2733,9 +2732,9 @@ static void bond_loadbalance_arp_mon(struct bonding *bond)
 }

 /* Called to inspect slaves for active-backup mode ARP monitor link state
- * changes.  Sets new_link in slaves to specify what action should take
- * place for the slave.  Returns 0 if no changes are found, >0 if changes
- * to link states must be committed.
+ * changes.  Sets proposed link state in slaves to specify what action
+ * should take place for the slave.  Returns 0 if no changes are found, >0
+ * if changes to link states must be committed.
  *
  * Called with rcu_read_lock held.
  */
@@ -2747,12 +2746,12 @@ static int bond_ab_arp_inspect(struct bonding *bond)
 	int commit = 0;

 	bond_for_each_slave_rcu(bond, slave, iter) {
-		slave->new_link = BOND_LINK_NOCHANGE;
+		bond_propose_link_state(slave, BOND_LINK_NOCHANGE);
 		last_rx = slave_last_rx(bond, slave);

 		if (slave->link != BOND_LINK_UP) {
 			if (bond_time_in_interval(bond, last_rx, 1)) {
-				slave->new_link = BOND_LINK_UP;
+				bond_propose_link_state(slave, BOND_LINK_UP);
 				commit++;
 			}
 			continue;
@@ -2780,7 +2779,7 @@ static int bond_ab_arp_inspect(struct bonding *bond)
 		if (!bond_is_active_slave(slave) &&
 		    !rcu_access_pointer(bond->current_arp_slave) &&
 		    !bond_time_in_interval(bond, last_rx, 3)) {
-			slave->new_link = BOND_LINK_DOWN;
+			bond_propose_link_state(slave, BOND_LINK_DOWN);
 			commit++;
 		}

@@ -2793,7 +2792,7 @@ static int bond_ab_arp_inspect(struct bonding *bond)
 		if (bond_is_active_slave(slave) &&
 		    (!bond_time_in_interval(bond, trans_start, 2) ||
 		     !bond_time_in_interval(bond, last_rx, 2))) {
-			slave->new_link = BOND_LINK_DOWN;
+			bond_propose_link_state(slave, BOND_LINK_DOWN);
 			commit++;
 		}
 	}
@@ -2813,7 +2812,7 @@ static void bond_ab_arp_commit(struct bonding *bond)
 	struct slave *slave;

 	bond_for_each_slave(bond, slave, iter) {
-		switch (slave->new_link) {
+		switch (slave->link_new_state) {
 		case BOND_LINK_NOCHANGE:
 			continue;

@@ -2866,7 +2865,7 @@ static void bond_ab_arp_commit(struct bonding *bond)

 		default:
 			netdev_err(bond->dev, "impossible: new_link %d on slave %s\n",
-				   slave->new_link, slave->dev->name);
+				   slave->link_new_state, slave->dev->name);
 			continue;
 		}

diff --git a/include/net/bonding.h b/include/net/bonding.h
index 04008209506a12..b0f20bc0fd4ab1 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -149,7 +149,6 @@ struct slave {
 	unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS];
 	s8     link;            /* one of BOND_LINK_XXXX */
 	s8     link_new_state;  /* one of BOND_LINK_XXXX */
-	s8     new_link;
 	u8     backup:1,   /* indicates backup slave. Value corresponds with
 			      BOND_STATE_ACTIVE and BOND_STATE_BACKUP */
 	       inactive:1, /* indicates inactive slave */
@@ -523,7 +522,7 @@ static inline void bond_propose_link_state(struct slave *slave, int state)

 static inline void bond_commit_link_state(struct slave *slave, bool notify)
 {
-	if (slave->link == slave->link_new_state)
+	if (slave->link_new_state == BOND_LINK_NOCHANGE)
 		return;

 	slave->link = slave->link_new_state;
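[A minimal sketch of the single-variable propose/commit scheme this fix converges on; illustrative only, the names mirror the helpers referenced in the diff but the types are simplified, not copied from the kernel source:]

	enum { LINK_NOCHANGE = -1, LINK_UP, LINK_FAIL, LINK_DOWN };

	struct fake_slave {
		int link;		/* committed link state */
		int link_new_state;	/* proposed link state */
	};

	static void propose_link_state(struct fake_slave *s, int state)
	{
		s->link_new_state = state;
	}

	static void commit_link_state(struct fake_slave *s)
	{
		if (s->link_new_state == LINK_NOCHANGE)
			return;			/* nothing proposed this pass */
		s->link = s->link_new_state;
	}

With one variable, the inspect pass and the commit pass can no longer disagree about the next state, which is exactly the desynchronization the patch removes.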
From 0a94a5d5bb078cbdbf1b596ce4dedbba4ea67fcd Mon Sep 17 00:00:00 2001
From: Oliver Neukum
Date: Thu, 7 Nov 2019 09:48:01 +0100
Subject: [PATCH 002/155] CDC-NCM: handle incomplete transfer of MTU

[ Upstream commit 332f989a3b0041b810836c5c3747e59aad7e9d0b ]

A malicious device may give half an answer when asked for its MTU.
The driver will proceed after this with a garbage MTU. Anything but
a complete answer must be treated as an error.

V2: used sizeof as requested by Alexander

Reported-and-tested-by: syzbot+0631d878823ce2411636@syzkaller.appspotmail.com
Signed-off-by: Oliver Neukum
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 drivers/net/usb/cdc_ncm.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
index ab28487e604840..d53b4a41c583a1 100644
--- a/drivers/net/usb/cdc_ncm.c
+++ b/drivers/net/usb/cdc_ncm.c
@@ -578,8 +578,8 @@ static void cdc_ncm_set_dgram_size(struct usbnet *dev, int new_size)
 	/* read current mtu value from device */
 	err = usbnet_read_cmd(dev, USB_CDC_GET_MAX_DATAGRAM_SIZE,
 			      USB_TYPE_CLASS | USB_DIR_IN | USB_RECIP_INTERFACE,
-			      0, iface_no, &max_datagram_size, 2);
-	if (err < 0) {
+			      0, iface_no, &max_datagram_size, sizeof(max_datagram_size));
+	if (err < sizeof(max_datagram_size)) {
 		dev_dbg(&dev->intf->dev, "GET_MAX_DATAGRAM_SIZE failed\n");
 		goto out;
 	}
@@ -590,7 +590,7 @@ static void cdc_ncm_set_dgram_size(struct usbnet *dev, int new_size)
 	max_datagram_size = cpu_to_le16(ctx->max_datagram_size);
 	err = usbnet_write_cmd(dev, USB_CDC_SET_MAX_DATAGRAM_SIZE,
 			       USB_TYPE_CLASS | USB_DIR_OUT | USB_RECIP_INTERFACE,
-			       0, iface_no, &max_datagram_size, 2);
+			       0, iface_no, &max_datagram_size, sizeof(max_datagram_size));
 	if (err < 0)
 		dev_dbg(&dev->intf->dev, "SET_MAX_DATAGRAM_SIZE failed\n");

From b0b4aca6d67d3ee3ed859d9df8bcd100328fb05c Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Thu, 7 Nov 2019 18:29:52 +0000
Subject: [PATCH 003/155] ipv4: Fix table id reference in fib_sync_down_addr

[ Upstream commit e0a312629fefa943534fc46f7bfbe6de3fdaf463 ]

Hendrik reported routes in the main table using source address are not
removed when the address is removed. The problem is that
fib_sync_down_addr does not account for devices in the default VRF
which are associated with the main table. Fix by updating the table id
reference.

Fixes: 5a56a0b3a45d ("net: Don't delete routes in different VRFs")
Reported-by: Hendrik Donner
Signed-off-by: David Ahern
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 net/ipv4/fib_semantics.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e76b8a7bb89114..eff703cb13b6f2 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1471,8 +1471,8 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local)
 	int ret = 0;
 	unsigned int hash = fib_laddr_hashfn(local);
 	struct hlist_head *head = &fib_info_laddrhash[hash];
+	int tb_id = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
 	struct net *net = dev_net(dev);
-	int tb_id = l3mdev_fib_table(dev);
 	struct fib_info *fi;

 	if (!fib_info_laddrhash || local == 0)

From cd6be628d78a5a7d97789883b2e69b3529d213b4 Mon Sep 17 00:00:00 2001
From: Alexander Sverdlin
Date: Fri, 8 Nov 2019 10:00:44 +0000
Subject: [PATCH 004/155] net: ethernet: octeon_mgmt: Account for second possible VLAN header

[ Upstream commit e4dd5608033efe7b6030cde359bfdbaeb73bc22d ]

Octeon's input ring-buffer entry has 14 bits-wide size field, so to
account for second possible VLAN header max_mtu must be further
reduced.

Fixes: 109cc16526c6d ("ethernet/cavium: use core min/max MTU checking")
Signed-off-by: Alexander Sverdlin
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 drivers/net/ethernet/cavium/octeon/octeon_mgmt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c
index 45c51277e0cf09..61701ba2ac72a3 100644
--- a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c
+++ b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c
@@ -1497,7 +1497,7 @@ static int octeon_mgmt_probe(struct platform_device *pdev)
 	netdev->ethtool_ops = &octeon_mgmt_ethtool_ops;

 	netdev->min_mtu = 64 - OCTEON_MGMT_RX_HEADROOM;
-	netdev->max_mtu = 16383 - OCTEON_MGMT_RX_HEADROOM;
+	netdev->max_mtu = 16383 - OCTEON_MGMT_RX_HEADROOM - VLAN_HLEN;

 	mac = of_get_mac_address(pdev->dev.of_node);
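[The CDC-NCM fix above illustrates a general rule for control-IN transfers: the positive return value is the number of bytes actually received, so a short answer must be rejected explicitly. A sketch of the pattern, with kernel-style types; read_ctrl() and GET_MAX_DATAGRAM_SIZE are hypothetical names, not a real API:]

	__le16 val;
	int err;

	/* returns byte count on success, negative errno on failure */
	err = read_ctrl(dev, GET_MAX_DATAGRAM_SIZE, &val, sizeof(val));
	if (err < 0)
		return err;		/* transfer failed outright */
	if (err < (int)sizeof(val))
		return -EIO;		/* short answer: also an error */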
From 5ec98a56092582835ecb8c552a8da48d1b2bf5d2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 7 Nov 2019 20:08:19 -0800
Subject: [PATCH 005/155] net: fix data-race in neigh_event_send()

[ Upstream commit 1b53d64435d56902fc234ff2507142d971a09687 ]

KCSAN reported the following data-race [1]

The fix will also prevent the compiler from optimizing out
the condition.

[1]

BUG: KCSAN: data-race in neigh_resolve_output / neigh_resolve_output

write to 0xffff8880a41dba78 of 8 bytes by interrupt on cpu 1:
 neigh_event_send include/net/neighbour.h:443 [inline]
 neigh_resolve_output+0x78/0x480 net/core/neighbour.c:1474
 neigh_output include/net/neighbour.h:511 [inline]
 ip_finish_output2+0x4af/0xe40 net/ipv4/ip_output.c:228
 __ip_finish_output net/ipv4/ip_output.c:308 [inline]
 __ip_finish_output+0x23a/0x490 net/ipv4/ip_output.c:290
 ip_finish_output+0x41/0x160 net/ipv4/ip_output.c:318
 NF_HOOK_COND include/linux/netfilter.h:294 [inline]
 ip_output+0xdf/0x210 net/ipv4/ip_output.c:432
 dst_output include/net/dst.h:436 [inline]
 ip_local_out+0x74/0x90 net/ipv4/ip_output.c:125
 __ip_queue_xmit+0x3a8/0xa40 net/ipv4/ip_output.c:532
 ip_queue_xmit+0x45/0x60 include/net/ip.h:237
 __tcp_transmit_skb+0xe81/0x1d60 net/ipv4/tcp_output.c:1169
 tcp_transmit_skb net/ipv4/tcp_output.c:1185 [inline]
 __tcp_retransmit_skb+0x4bd/0x15f0 net/ipv4/tcp_output.c:2976
 tcp_retransmit_skb+0x36/0x1a0 net/ipv4/tcp_output.c:2999
 tcp_retransmit_timer+0x719/0x16d0 net/ipv4/tcp_timer.c:515
 tcp_write_timer_handler+0x42d/0x510 net/ipv4/tcp_timer.c:598
 tcp_write_timer+0xd1/0xf0 net/ipv4/tcp_timer.c:618

read to 0xffff8880a41dba78 of 8 bytes by interrupt on cpu 0:
 neigh_event_send include/net/neighbour.h:442 [inline]
 neigh_resolve_output+0x57/0x480 net/core/neighbour.c:1474
 neigh_output include/net/neighbour.h:511 [inline]
 ip_finish_output2+0x4af/0xe40 net/ipv4/ip_output.c:228
 __ip_finish_output net/ipv4/ip_output.c:308 [inline]
 __ip_finish_output+0x23a/0x490 net/ipv4/ip_output.c:290
 ip_finish_output+0x41/0x160 net/ipv4/ip_output.c:318
 NF_HOOK_COND include/linux/netfilter.h:294 [inline]
 ip_output+0xdf/0x210 net/ipv4/ip_output.c:432
 dst_output include/net/dst.h:436 [inline]
 ip_local_out+0x74/0x90 net/ipv4/ip_output.c:125
 __ip_queue_xmit+0x3a8/0xa40 net/ipv4/ip_output.c:532
 ip_queue_xmit+0x45/0x60 include/net/ip.h:237
 __tcp_transmit_skb+0xe81/0x1d60 net/ipv4/tcp_output.c:1169
 tcp_transmit_skb net/ipv4/tcp_output.c:1185 [inline]
 __tcp_retransmit_skb+0x4bd/0x15f0 net/ipv4/tcp_output.c:2976
 tcp_retransmit_skb+0x36/0x1a0 net/ipv4/tcp_output.c:2999
 tcp_retransmit_timer+0x719/0x16d0 net/ipv4/tcp_timer.c:515
 tcp_write_timer_handler+0x42d/0x510 net/ipv4/tcp_timer.c:598

Reported by Kernel Concurrency Sanitizer on:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.4.0-rc3+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Signed-off-by: Eric Dumazet
Reported-by: syzbot
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 include/net/neighbour.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 393099b1901ade..1d6b98119a1d60 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -429,8 +429,8 @@ static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
 {
 	unsigned long now = jiffies;

-	if (neigh->used != now)
-		neigh->used = now;
+	if (READ_ONCE(neigh->used) != now)
+		WRITE_ONCE(neigh->used, now);
 	if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE)))
 		return __neigh_event_send(neigh, skb);
 	return 0;

From 79acc2c403c24de693709b7344df748b6441757c Mon Sep 17 00:00:00 2001
From: Sean Tranchetti
Date: Mon, 4 Nov 2019 17:54:22 -0700
Subject: [PATCH 006/155] net: qualcomm: rmnet: Fix potential UAF when unregistering

[ Upstream commit e7a86c687e64ab24f88330ad24ecc9442ce40c5a ]

During the exit/unregistration process of the RmNet driver, the function
rmnet_unregister_real_device() is called to handle freeing the driver's
internal state and removing the RX handler on the underlying physical
device. However, the order of operations this function performs is wrong
and can lead to a use after free of the rmnet_port structure.

Before calling netdev_rx_handler_unregister(), this port structure is
freed with kfree(). If packets are received on any RmNet devices before
synchronize_net() completes, they will attempt to use this already-freed
port structure when processing the packet. As such, before cleaning up any
other internal state, the RX handler must be unregistered in order to
guarantee that no further packets will arrive on the device.

Fixes: ceed73a2cf4a ("drivers: net: ethernet: qualcomm: rmnet: Initial implementation")
Signed-off-by: Sean Tranchetti
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index 1e33aea59f505d..7d8303e45f090b 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -84,10 +84,10 @@ static int rmnet_unregister_real_device(struct net_device *real_dev,
 	if (port->nr_rmnet_devs)
 		return -EINVAL;

-	kfree(port);
-
 	netdev_rx_handler_unregister(real_dev);

+	kfree(port);
+
 	/* release reference on real_dev */
 	dev_put(real_dev);
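[For readers unfamiliar with the annotation idiom used in the neigh_event_send() fix, here is the racy pattern next to its annotated form; a sketch, not the kernel source. Plain accesses let the compiler tear, fuse, or re-load the value; the *_ONCE() macros force exactly one untorn access and document the intentional lockless race for tools like KCSAN:]

	if (neigh->used != now)			/* racy: plain load */
		neigh->used = now;		/* racy: plain store */

	if (READ_ONCE(neigh->used) != now)	/* one untorn load */
		WRITE_ONCE(neigh->used, now);	/* one untorn store */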
From c7c24046c8e970fdda5c6f3998b728fb7f0b49df Mon Sep 17 00:00:00 2001
From: Aleksander Morgado
Date: Thu, 7 Nov 2019 11:57:01 +0100
Subject: [PATCH 007/155] net: usb: qmi_wwan: add support for DW5821e with eSIM support

[ Upstream commit e497df686e8fed8c1dd69179010656362858edb3 ]

Exactly same layout as the default DW5821e module, just a different
vid/pid. The QMI interface is exposed in USB configuration #1:

P:  Vendor=413c ProdID=81e0 Rev=03.18
S:  Manufacturer=Dell Inc.
S:  Product=DW5821e-eSIM Snapdragon X20 LTE
S:  SerialNumber=0123456789ABCDEF
C:  #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=500mA
I:  If#=0x0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan
I:  If#=0x1 Alt= 0 #EPs= 1 Cls=03(HID  ) Sub=00 Prot=00 Driver=usbhid
I:  If#=0x2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option
I:  If#=0x3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option
I:  If#=0x4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option
I:  If#=0x5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option

Signed-off-by: Aleksander Morgado
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 drivers/net/usb/qmi_wwan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index e406a05e79dcdb..8ed538295d0900 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -1286,6 +1286,7 @@ static const struct usb_device_id products[] = {
 	{QMI_FIXED_INTF(0x413c, 0x81b6, 8)},	/* Dell Wireless 5811e */
 	{QMI_FIXED_INTF(0x413c, 0x81b6, 10)},	/* Dell Wireless 5811e */
 	{QMI_FIXED_INTF(0x413c, 0x81d7, 0)},	/* Dell Wireless 5821e */
+	{QMI_FIXED_INTF(0x413c, 0x81e0, 0)},	/* Dell Wireless 5821e with eSIM support*/
 	{QMI_FIXED_INTF(0x03f0, 0x4e1d, 8)},	/* HP lt4111 LTE/EV-DO/HSPA+ Gobi 4G Module */
 	{QMI_FIXED_INTF(0x03f0, 0x9d1d, 1)},	/* HP lt4120 Snapdragon X5 LTE */
 	{QMI_FIXED_INTF(0x22de, 0x9061, 3)},	/* WeTelecom WPD-600N */

From 5f311e37e9592bbfebac148135ed39ede2ad0d10 Mon Sep 17 00:00:00 2001
From: Pan Bian
Date: Tue, 5 Nov 2019 16:34:07 +0800
Subject: [PATCH 008/155] NFC: fdp: fix incorrect free object

[ Upstream commit 517ce4e93368938b204451285e53014549804868 ]

The address of fw_vsc_cfg is on stack. Releasing it with devm_kfree() is
incorrect, which may result in a system crash or other security impacts.
The expected object to free is *fw_vsc_cfg.

Signed-off-by: Pan Bian
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 drivers/nfc/fdp/i2c.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nfc/fdp/i2c.c b/drivers/nfc/fdp/i2c.c
index c4da50e07bbcbe..4020c11a9415b8 100644
--- a/drivers/nfc/fdp/i2c.c
+++ b/drivers/nfc/fdp/i2c.c
@@ -267,7 +267,7 @@ static void fdp_nci_i2c_read_device_properties(struct device *dev,
 					  *fw_vsc_cfg, len);

 		if (r) {
-			devm_kfree(dev, fw_vsc_cfg);
+			devm_kfree(dev, *fw_vsc_cfg);
 			goto vsc_read_err;
 		}
 	} else {

From 6c234cb7fc9d948e2edb2c22804e14ea598640fb Mon Sep 17 00:00:00 2001
From: Pan Bian
Date: Thu, 7 Nov 2019 14:29:50 +0800
Subject: [PATCH 009/155] nfc: netlink: fix double device reference drop

[ Upstream commit 025ec40b81d785a98f76b8bdb509ac10773b4f12 ]

The function nfc_put_device(dev) is called twice to drop the reference
to dev when there is no associated local llcp. Remove one of them to fix
the bug.

Fixes: 52feb444a903 ("NFC: Extend netlink interface for LTO, RW, and MIUX parameters support")
Fixes: d9b8d8e19b07 ("NFC: llcp: Service Name Lookup netlink interface")
Signed-off-by: Pan Bian
Reviewed-by: Johan Hovold
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 net/nfc/netlink.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index 5f2acd029da5fc..7b8d4d235a3a14 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -1100,7 +1100,6 @@ static int nfc_genl_llc_set_params(struct sk_buff *skb, struct genl_info *info)

 	local = nfc_llcp_find_local(dev);
 	if (!local) {
-		nfc_put_device(dev);
 		rc = -ENODEV;
 		goto exit;
 	}
@@ -1160,7 +1159,6 @@ static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info)

 	local = nfc_llcp_find_local(dev);
 	if (!local) {
-		nfc_put_device(dev);
 		rc = -ENODEV;
 		goto exit;
 	}

From 12e3d9b35c0e9cd12e1c2027344a1ca99ca2044d Mon Sep 17 00:00:00 2001
From: Pan Bian
Date: Thu, 7 Nov 2019 09:33:20 +0800
Subject: [PATCH 010/155] NFC: st21nfca: fix double free

[ Upstream commit 99a8efbb6e30b72ac98cecf81103f847abffb1e5 ]

The variable nfcid_skb is not changed in the callee nfc_hci_get_param()
if an error occurs. Consequently, the already-freed nfcid_skb will be
freed again, resulting in a double free bug. Set nfcid_skb to NULL after
releasing it to fix the bug.

Signed-off-by: Pan Bian
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 drivers/nfc/st21nfca/core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/nfc/st21nfca/core.c b/drivers/nfc/st21nfca/core.c
index e803fdfa918977..f37069b53b2003 100644
--- a/drivers/nfc/st21nfca/core.c
+++ b/drivers/nfc/st21nfca/core.c
@@ -719,6 +719,7 @@ static int st21nfca_hci_complete_target_discovered(struct nfc_hci_dev *hdev,
 			NFC_PROTO_FELICA_MASK;
 	} else {
 		kfree_skb(nfcid_skb);
+		nfcid_skb = NULL;
 		/* P2P in type A */
 		r = nfc_hci_get_param(hdev, ST21NFCA_RF_READER_F_GATE,
 				      ST21NFCA_RF_READER_F_NFCID1,
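[The st21nfca fix above is an instance of a general double-free guard worth spelling out; a sketch, not driver code: when a buffer may be released again on a later error path, clear the pointer right after freeing it, so the second release degrades to a no-op on NULL:]

	kfree_skb(skb);
	skb = NULL;	/* a later kfree_skb(skb) is now harmless */

kfree_skb() and kfree() both accept NULL by design, which is what makes this pattern work.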
From a49ecc872d61a90a7ab78513113ffd124b29576a Mon Sep 17 00:00:00 2001
From: Manish Chopra
Date: Fri, 8 Nov 2019 02:42:30 -0800
Subject: [PATCH 011/155] qede: fix NULL pointer deref in __qede_remove()

[ Upstream commit deabc87111c690097c03765ea017cd500f7376fc ]

Rebooting the system with SR-IOV VFs enabled leads to the crash below,
due to __qede_remove() running twice on the VF devices (first from the
VF's own .shutdown() flow and again from the PF's .shutdown() flow
executing pci_disable_sriov()).

This patch adds a safeguard in __qede_remove() flow to fix this, so
that the driver doesn't attempt to remove "already removed" devices.

[  194.360134] BUG: unable to handle kernel NULL pointer dereference at 00000000000008dc
[  194.360227] IP: [] __qede_remove+0x24/0x130 [qede]
[  194.360304] PGD 0
[  194.360325] Oops: 0000 [#1] SMP
[  194.360360] Modules linked in: tcp_lp fuse tun bridge stp llc devlink bonding ip_set nfnetlink ib_isert iscsi_target_mod ib_srpt target_core_mod ib_srp scsi_transport_srp scsi_tgt ib_ipoib ib_umad rpcrdma sunrpc rdma_ucm ib_uverbs ib_iser rdma_cm iw_cm ib_cm libiscsi scsi_transport_iscsi dell_smbios iTCO_wdt iTCO_vendor_support dell_wmi_descriptor dcdbas vfat fat pcc_cpufreq skx_edac intel_powerclamp coretemp intel_rapl iosf_mbi kvm_intel kvm irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd qedr ib_core pcspkr ses enclosure joydev ipmi_ssif sg i2c_i801 lpc_ich mei_me mei wmi ipmi_si ipmi_devintf ipmi_msghandler tpm_crb acpi_pad acpi_power_meter xfs libcrc32c sd_mod crc_t10dif crct10dif_generic crct10dif_pclmul crct10dif_common crc32c_intel mgag200
[  194.361044] qede i2c_algo_bit drm_kms_helper qed syscopyarea sysfillrect nvme sysimgblt fb_sys_fops ttm nvme_core mpt3sas crc8 ptp drm pps_core ahci raid_class scsi_transport_sas libahci libata drm_panel_orientation_quirks nfit libnvdimm dm_mirror dm_region_hash dm_log dm_mod [last unloaded: ip_tables]
[  194.361297] CPU: 51 PID: 7996 Comm: reboot Kdump: loaded Not tainted 3.10.0-1062.el7.x86_64 #1
[  194.361359] Hardware name: Dell Inc. PowerEdge MX840c/0740HW, BIOS 2.4.6 10/15/2019
[  194.361412] task: ffff9cea9b360000 ti: ffff9ceabebdc000 task.ti: ffff9ceabebdc000
[  194.361463] RIP: 0010:[] [] __qede_remove+0x24/0x130 [qede]
[  194.361534] RSP: 0018:ffff9ceabebdfac0  EFLAGS: 00010282
[  194.361570] RAX: 0000000000000000 RBX: ffff9cd013846098 RCX: 0000000000000000
[  194.361621] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff9cd013846098
[  194.361668] RBP: ffff9ceabebdfae8 R08: 0000000000000000 R09: 0000000000000000
[  194.361715] R10: 00000000bfe14201 R11: ffff9ceabfe141e0 R12: 0000000000000000
[  194.361762] R13: ffff9cd013846098 R14: 0000000000000000 R15: ffff9ceab5e48000
[  194.361810] FS:  00007f799c02d880(0000) GS:ffff9ceacb0c0000(0000) knlGS:0000000000000000
[  194.361865] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  194.361903] CR2: 00000000000008dc CR3: 0000001bdac76000 CR4: 00000000007607e0
[  194.361953] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  194.362002] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  194.362051] PKRU: 55555554
[  194.362073] Call Trace:
[  194.362109] [] qede_remove+0x10/0x20 [qede]
[  194.362180] [] pci_device_remove+0x3e/0xc0
[  194.362240] [] __device_release_driver+0x82/0xf0
[  194.362285] [] device_release_driver+0x23/0x30
[  194.362343] [] pci_stop_bus_device+0x84/0xa0
[  194.362388] [] pci_stop_and_remove_bus_device+0x12/0x20
[  194.362450] [] pci_iov_remove_virtfn+0xaf/0x160
[  194.362496] [] sriov_disable+0x3c/0xf0
[  194.362534] [] pci_disable_sriov+0x23/0x30
[  194.362599] [] qed_sriov_disable+0x5e3/0x650 [qed]
[  194.362658] [] ? kfree+0x106/0x140
[  194.362709] [] ? qed_free_stream_mem+0x70/0x90 [qed]
[  194.362754] [] ? kfree+0x106/0x140
[  194.362803] [] qed_slowpath_stop+0x1a9/0x1d0 [qed]
[  194.362854] [] __qede_remove+0xae/0x130 [qede]
[  194.362904] [] qede_shutdown+0x10/0x20 [qede]
[  194.362956] [] pci_device_shutdown+0x3a/0x60
[  194.363010] [] device_shutdown+0xfb/0x1f0
[  194.363066] [] kernel_restart_prepare+0x36/0x40
[  194.363107] [] kernel_restart+0x12/0x60
[  194.363146] [] SYSC_reboot+0x229/0x260
[  194.363196] [] ? handle_mm_fault+0x39d/0x9b0
[  194.363253] [] ? __switch_to+0x151/0x580
[  194.363304] [] ? __schedule+0x448/0x9c0
[  194.363343] [] SyS_reboot+0xe/0x10
[  194.363387] [] system_call_fastpath+0x25/0x2a
[  194.363430] Code: f9 e9 37 ff ff ff 90 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 41 55 4c 8d af 98 00 00 00 41 54 4c 89 ef 41 89 f4 53 e8 4c e4 55 f9 <80> b8 dc 08 00 00 01 48 89 c3 4c 8d b8 c0 08 00 00 4c 8b b0 c0
[  194.363712] RIP  [] __qede_remove+0x24/0x130 [qede]
[  194.363764] RSP
[  194.363791] CR2: 00000000000008dc

Signed-off-by: Manish Chopra
Signed-off-by: Ariel Elior
Signed-off-by: Sudarsana Kalluru
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 drivers/net/ethernet/qlogic/qede/qede_main.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 6eab2c632c759c..dab202f343c622 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1052,8 +1052,16 @@ enum qede_remove_mode {
 static void __qede_remove(struct pci_dev *pdev, enum qede_remove_mode mode)
 {
 	struct net_device *ndev = pci_get_drvdata(pdev);
-	struct qede_dev *edev = netdev_priv(ndev);
-	struct qed_dev *cdev = edev->cdev;
+	struct qede_dev *edev;
+	struct qed_dev *cdev;
+
+	if (!ndev) {
+		dev_info(&pdev->dev, "Device has already been removed\n");
+		return;
+	}
+
+	edev = netdev_priv(ndev);
+	cdev = edev->cdev;

 	DP_INFO(edev, "Starting qede_remove\n");
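[The shape of the guard above generalizes: any remove/shutdown callback that can legitimately run twice for the same pci_dev must re-check its driver data before dereferencing it. A simplified sketch of the pattern, assuming the first teardown pass clears the drvdata:]

	static void example_remove(struct pci_dev *pdev)
	{
		struct net_device *ndev = pci_get_drvdata(pdev);

		if (!ndev)	/* already torn down by an earlier pass */
			return;

		/* ... normal teardown ... */
		pci_set_drvdata(pdev, NULL);
	}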
From f536bdb834c0c54743721e7e5000d0006671b848 Mon Sep 17 00:00:00 2001
From: Takashi Iwai
Date: Wed, 6 Nov 2019 17:55:47 +0100
Subject: [PATCH 012/155] ALSA: timer: Fix incorrectly assigned timer instance

commit e7af6307a8a54f0b873960b32b6a644f2d0fbd97 upstream.

The clean up commit 41672c0c24a6 ("ALSA: timer: Simplify error path in
snd_timer_open()") unified the error handling code paths with the
standard goto, but it introduced a subtle bug: the timer instance is
stored in snd_timer_open() incorrectly even if it returns an error.
This may eventually lead to UAF, as spotted by fuzzer.

The culprit is the snd_timer_open() code checks the
SNDRV_TIMER_IFLG_EXCLUSIVE flag with the common variable timeri.
This variable is supposed to be the newly created instance, but we
(ab-)used it for a temporary check before the actual creation of a
timer instance. After that point, there is another check for the max
number of instances, and it bails out if over the threshold. Before
the refactoring above, it worked fine because the code returned
directly from that point. After the refactoring, however, it jumps to
the unified error path that stores the timeri variable in return --
even if it returns an error. Unfortunately this stored value is kept
in the caller side (snd_timer_user_tselect()) in tu->timeri. This
causes inconsistency later, as if the timer was successfully
assigned.

In this patch, we fix it by not re-using timeri variable but a
temporary variable for testing the exclusive connection, so timeri
remains NULL at that point.

Fixes: 41672c0c24a6 ("ALSA: timer: Simplify error path in snd_timer_open()")
Reported-and-tested-by: Tristan Madani
Cc:
Link: https://lore.kernel.org/r/20191106165547.23518-1-tiwai@suse.de
Signed-off-by: Takashi Iwai
Signed-off-by: Greg Kroah-Hartman
---
 sound/core/timer.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sound/core/timer.c b/sound/core/timer.c
index 161ab19cb72204..c60dfd52e8a6dc 100644
--- a/sound/core/timer.c
+++ b/sound/core/timer.c
@@ -298,11 +298,11 @@ int snd_timer_open(struct snd_timer_instance **ti,
 		goto unlock;
 	}
 	if (!list_empty(&timer->open_list_head)) {
-		timeri = list_entry(timer->open_list_head.next,
+		struct snd_timer_instance *t =
+			list_entry(timer->open_list_head.next,
 				    struct snd_timer_instance, open_list);
-		if (timeri->flags & SNDRV_TIMER_IFLG_EXCLUSIVE) {
+		if (t->flags & SNDRV_TIMER_IFLG_EXCLUSIVE) {
 			err = -EBUSY;
-			timeri = NULL;
 			goto unlock;
 		}
 	}

From 2e3f0caf9f62735a5d997003bf1e17bedd0014d6 Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto
Date: Sun, 3 Nov 2019 00:09:20 +0900
Subject: [PATCH 013/155] ALSA: bebob: fix to detect configured source of sampling clock for Focusrite Saffire Pro i/o series

commit 706ad6746a66546daf96d4e4a95e46faf6cf689a upstream.

For Focusrite Saffire Pro i/o, the lowest 8 bits of register represents
configured source of sampling clock. The next lowest 8 bits represents
whether the configured source is actually detected or not just after
the register is changed for the source.

Current implementation evaluates whole the register to detect
configured source. This results in failure due to the next lowest 8
bits when the source is connected in advance.

This commit fixes the bug.

Fixes: 25784ec2d034 ("ALSA: bebob: Add support for Focusrite Saffire/SaffirePro series")
Cc: # v3.16+
Signed-off-by: Takashi Sakamoto
Link: https://lore.kernel.org/r/20191102150920.20367-1-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Iwai
Signed-off-by: Greg Kroah-Hartman
---
 sound/firewire/bebob/bebob_focusrite.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sound/firewire/bebob/bebob_focusrite.c b/sound/firewire/bebob/bebob_focusrite.c
index 52b8b61ecddd2d..62d989edd1293f 100644
--- a/sound/firewire/bebob/bebob_focusrite.c
+++ b/sound/firewire/bebob/bebob_focusrite.c
@@ -28,6 +28,8 @@
 #define SAFFIRE_CLOCK_SOURCE_SPDIF		1

 /* clock sources as returned from register of Saffire Pro 10 and 26 */
+#define SAFFIREPRO_CLOCK_SOURCE_SELECT_MASK	0x000000ff
+#define SAFFIREPRO_CLOCK_SOURCE_DETECT_MASK	0x0000ff00
 #define SAFFIREPRO_CLOCK_SOURCE_INTERNAL	0
 #define SAFFIREPRO_CLOCK_SOURCE_SKIP		1 /* never used on hardware */
 #define SAFFIREPRO_CLOCK_SOURCE_SPDIF		2
@@ -190,6 +192,7 @@ saffirepro_both_clk_src_get(struct snd_bebob *bebob, unsigned int *id)
 		map = saffirepro_clk_maps[1];

 	/* In a case that this driver cannot handle the value of register. */
+	value &= SAFFIREPRO_CLOCK_SOURCE_SELECT_MASK;
 	if (value >= SAFFIREPRO_CLOCK_SOURCE_COUNT || map[value] < 0) {
 		err = -EIO;
 		goto end;

From bd4bb87222365ce1161bf78409d5a66d0637926d Mon Sep 17 00:00:00 2001
From: Takashi Iwai
Date: Tue, 5 Nov 2019 14:43:16 +0100
Subject: [PATCH 014/155] ALSA: hda/ca0132 - Fix possible workqueue stall

commit 15c2b3cc09a31620914955cb2a89c277c18ee999 upstream.

The unsolicited event handler for the headphone jack on CA0132 codec
driver tries to reschedule the another delayed work with
cancel_delayed_work_sync(). It's not a good idea, unfortunately,
especially after we changed the work queue to the standard global
one; this may lead to a stall because both works are using the same
global queue.

Fix it by dropping the _sync but does call cancel_delayed_work()
instead.

Fixes: 993884f6a26c ("ALSA: hda/ca0132 - Delay HP amp turnon.")
BugLink: https://bugzilla.suse.com/show_bug.cgi?id=1155836
Cc:
Link: https://lore.kernel.org/r/20191105134316.19294-1-tiwai@suse.de
Signed-off-by: Takashi Iwai
Signed-off-by: Greg Kroah-Hartman
---
 sound/pci/hda/patch_ca0132.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/pci/hda/patch_ca0132.c b/sound/pci/hda/patch_ca0132.c
index 119f3b504765ef..9876d8dc2edea2 100644
--- a/sound/pci/hda/patch_ca0132.c
+++ b/sound/pci/hda/patch_ca0132.c
@@ -4440,7 +4440,7 @@ static void hp_callback(struct hda_codec *codec, struct hda_jack_callback *cb)
 	/* Delay enabling the HP amp, to let the mic-detection
 	 * state machine run.
 	 */
-	cancel_delayed_work_sync(&spec->unsol_hp_work);
+	cancel_delayed_work(&spec->unsol_hp_work);
 	schedule_delayed_work(&spec->unsol_hp_work, msecs_to_jiffies(500));
 	tbl = snd_hda_jack_tbl_get(codec, cb->nid);
 	if (tbl)
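[Why the _sync variant stalls here, in sketch form (illustrative, not the driver source): cancel_delayed_work_sync() blocks until any running instance of the work finishes, so calling it from code that itself runs on the same global workqueue can wait on work that cannot make progress. The non-blocking form only clears the pending timer:]

	/* safe from workqueue context: does not wait for a running work */
	cancel_delayed_work(&spec->unsol_hp_work);
	schedule_delayed_work(&spec->unsol_hp_work, msecs_to_jiffies(500));

The trade-off is that an already-running instance may still complete once, which is acceptable for a debounce-style handler like this.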
From d25eb9d6b723afa23e818892c026adc2021ac8cd Mon Sep 17 00:00:00 2001
From: Yang Shi
Date: Tue, 5 Nov 2019 21:16:30 -0800
Subject: [PATCH 015/155] mm: thp: handle page cache THP correctly in PageTransCompoundMap

commit 169226f7e0d275c1879551f37484ef6683579a5c upstream.

We have a usecase to use tmpfs as QEMU memory backend and we would like
to take the advantage of THP as well. But, our test shows the EPT is
not PMD mapped even though the underlying THP are PMD mapped on host.
The number showed by /sys/kernel/debug/kvm/largepage is much less than
the number of PMD mapped shmem pages as the below:

7f2778200000-7f2878200000 rw-s 00000000 00:14 262232 /dev/shm/qemu_back_mem.mem.Hz2hSf (deleted)
Size:            4194304 kB
[snip]
AnonHugePages:         0 kB
ShmemPmdMapped:   579584 kB
[snip]
Locked:                0 kB

cat /sys/kernel/debug/kvm/largepages
12

And some benchmarks do worse than with anonymous THPs.

By digging into the code we figured out that commit 127393fbe597 ("mm:
thp: kvm: fix memory corruption in KVM with THP enabled") checks if
there is a single PTE mapping on the page for anonymous THP when
setting up EPT map. But the _mapcount < 0 check doesn't work for page
cache THP since every subpage of page cache THP would get _mapcount
inc'ed once it is PMD mapped, so PageTransCompoundMap() always returns
false for page cache THP. This would prevent KVM from setting up PMD
mapped EPT entry.

So we need to handle page cache THP correctly. However, when page cache
THP's PMD gets split, the kernel just removes the map instead of
setting up a PTE map like what anonymous THP does. Before KVM calls
get_user_pages() the subpages may get PTE mapped even though it is
still a THP since the page cache THP may be mapped by other processes
at the mean time. So check both its _mapcount and whether the THP has a
PTE mapping. Although this may report some false negative cases (PTE
mapped by other processes), it looks not trivial to make this accurate.

With this fix /sys/kernel/debug/kvm/largepage would show reasonable
pages are PMD mapped by EPT as the below:

7fbeaee00000-7fbfaee00000 rw-s 00000000 00:14 275464 /dev/shm/qemu_back_mem.mem.SKUvat (deleted)
Size:            4194304 kB
[snip]
AnonHugePages:         0 kB
ShmemPmdMapped:   557056 kB
[snip]
Locked:                0 kB

cat /sys/kernel/debug/kvm/largepages
271

And the benchmarks are as same as anonymous THPs.

[yang.shi@linux.alibaba.com: v4]
Link: http://lkml.kernel.org/r/1571865575-42913-1-git-send-email-yang.shi@linux.alibaba.com
Link: http://lkml.kernel.org/r/1571769577-89735-1-git-send-email-yang.shi@linux.alibaba.com
Fixes: dd78fedde4b9 ("rmap: support file thp")
Signed-off-by: Yang Shi
Reported-by: Gang Deng
Tested-by: Gang Deng
Suggested-by: Hugh Dickins
Acked-by: Kirill A. Shutemov
Cc: Andrea Arcangeli
Cc: Matthew Wilcox
Cc: [4.8+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
Signed-off-by: Greg Kroah-Hartman
---
 include/linux/mm.h         |  5 -----
 include/linux/mm_types.h   |  5 +++++
 include/linux/page-flags.h | 20 ++++++++++++++++++--
 3 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ee0eae21521084..858ce84ac7c514 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -549,11 +549,6 @@ static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags)

 extern void kvfree(const void *addr);

-static inline atomic_t *compound_mapcount_ptr(struct page *page)
-{
-	return &page[1].compound_mapcount;
-}
-
 static inline int compound_mapcount(struct page *page)
 {
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e41ef532c4ce03..be5d445bac98d3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -240,6 +240,11 @@ struct page_frag_cache {

 typedef unsigned long vm_flags_t;

+static inline atomic_t *compound_mapcount_ptr(struct page *page)
+{
+	return &page[1].compound_mapcount;
+}
+
 /*
  * A region containing a mapping of a non-memory backed file under NOMMU
  * conditions.  These are held in a global tree and are pinned by the VMAs that
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 584b14c774c181..5f966c94732b5f 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -565,12 +565,28 @@ static inline int PageTransCompound(struct page *page)
  *
  * Unlike PageTransCompound, this is safe to be called only while
  * split_huge_pmd() cannot run from under us, like if protected by the
- * MMU notifier, otherwise it may result in page->_mapcount < 0 false
+ * MMU notifier, otherwise it may result in page->_mapcount check false
  * positives.
+ *
+ * We have to treat page cache THP differently since every subpage of it
+ * would get _mapcount inc'ed once it is PMD mapped. But, it may be PTE
+ * mapped in the current process so comparing subpage's _mapcount to
+ * compound_mapcount to filter out PTE mapped case.
  */
 static inline int PageTransCompoundMap(struct page *page)
 {
-	return PageTransCompound(page) && atomic_read(&page->_mapcount) < 0;
+	struct page *head;
+
+	if (!PageTransCompound(page))
+		return 0;
+
+	if (PageAnon(page))
+		return atomic_read(&page->_mapcount) < 0;
+
+	head = compound_head(page);
+	/* File THP is PMD mapped and not PTE mapped */
+	return atomic_read(&page->_mapcount) ==
+	       atomic_read(compound_mapcount_ptr(head));
 }

 /*
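[A rough walk-through of the mapcount arithmetic, derived from the commit message above (stored _mapcount values start at -1, so "mapped once" reads back as 0):

  - anon THP, PMD-mapped only: each subpage _mapcount stays -1, so the
    PageAnon branch's "< 0" test returns true;
  - file THP, PMD-mapped once: compound_mapcount == 0 and, per
    dd78fedde4b9, every subpage _mapcount is also bumped to 0, so the
    equality test returns true;
  - file THP additionally PTE-mapped somewhere: that subpage's
    _mapcount rises above compound_mapcount, the values differ, and the
    function returns false, which is the "PTE mapped" case the new
    comparison filters out.]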
From fce52ed9e964d91ce5a02e90a22ab72e26b59046 Mon Sep 17 00:00:00 2001
From: Michal Hocko
Date: Tue, 5 Nov 2019 21:16:40 -0800
Subject: [PATCH 016/155] mm, vmstat: hide /proc/pagetypeinfo from normal users

commit abaed0112c1db08be15a784a2c5c8a8b3063cdd3 upstream.

/proc/pagetypeinfo is a debugging tool to examine internal page
allocator state with respect to fragmentation. It is not very useful for
any other use so normal users really do not need to read this file.

Waiman Long has noticed that reading this file can have negative side
effects because zone->lock is necessary for gathering data and that a)
interferes with the page allocator and its users and b) can lead to
hard lockups on large machines which have very long free_list.

Reduce both issues by simply not exporting the file to regular users.

Link: http://lkml.kernel.org/r/20191025072610.18526-2-mhocko@kernel.org
Fixes: 467c996c1e19 ("Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo")
Signed-off-by: Michal Hocko
Reported-by: Waiman Long
Acked-by: Mel Gorman
Acked-by: Vlastimil Babka
Acked-by: Waiman Long
Acked-by: Rafael Aquini
Acked-by: David Rientjes
Reviewed-by: Andrew Morton
Cc: David Hildenbrand
Cc: Johannes Weiner
Cc: Roman Gushchin
Cc: Konstantin Khlebnikov
Cc: Jann Horn
Cc: Song Liu
Cc: Greg Kroah-Hartman
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
Signed-off-by: Greg Kroah-Hartman
---
 mm/vmstat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 28c45c26f901b7..ba916832641346 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1951,7 +1951,7 @@ void __init init_mm_internals(void)
 #endif
 #ifdef CONFIG_PROC_FS
 	proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations);
-	proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations);
+	proc_create("pagetypeinfo", 0400, NULL, &pagetypeinfo_file_operations);
 	proc_create("vmstat", 0444, NULL, &vmstat_file_operations);
 	proc_create("zoneinfo", 0444, NULL, &zoneinfo_file_operations);
 #endif

From e8b4d457b13ea736ec5946e7f548a67c6472b61c Mon Sep 17 00:00:00 2001
From: Kevin Hao
Date: Tue, 5 Nov 2019 21:16:57 -0800
Subject: [PATCH 017/155] dump_stack: avoid the livelock of the dump_lock

commit 5cbf2fff3bba8d3c6a4d47c1754de1cf57e2b01f upstream.

In the current code, we use the atomic_cmpxchg() to serialize the
output of the dump_stack(), but this implementation suffers the
thundering herd problem. We have observed such kind of livelock on a
Marvell cn96xx board(24 cpus) when heavily using the dump_stack() in
a kprobe handler. Actually we can let the competitors to wait for the
releasing of the lock before jumping to atomic_cmpxchg(). This will
definitely mitigate the thundering herd problem. Thanks Linus for the
suggestion.

[akpm@linux-foundation.org: fix comment]
Link: http://lkml.kernel.org/r/20191030031637.6025-1-haokexin@gmail.com
Fixes: b58d977432c8 ("dump_stack: serialize the output from dump_stack()")
Signed-off-by: Kevin Hao
Suggested-by: Linus Torvalds
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
Signed-off-by: Greg Kroah-Hartman
---
 lib/dump_stack.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/lib/dump_stack.c b/lib/dump_stack.c
index c5edbedd364dce..287ea178f0fa95 100644
--- a/lib/dump_stack.c
+++ b/lib/dump_stack.c
@@ -46,7 +46,12 @@ asmlinkage __visible void dump_stack(void)
 		was_locked = 1;
 	} else {
 		local_irq_restore(flags);
-		cpu_relax();
+		/*
+		 * Wait for the lock to release before jumping to
+		 * atomic_cmpxchg() in order to mitigate the thundering herd
+		 * problem.
+		 */
+		do { cpu_relax(); } while (atomic_read(&dump_lock) != -1);
 		goto retry;
 	}

From 16dff535dd061de9d8dd6d9e83f00b04cccc94ba Mon Sep 17 00:00:00 2001
From: Shuah Khan
Date: Thu, 26 Sep 2019 19:16:41 -0600
Subject: [PATCH 018/155] tools: gpio: Use !building_out_of_srctree to determine srctree

commit 4a6a6f5c4aeedb72db871d60bfcca89835f317aa upstream.

make TARGETS=gpio kselftest fails with:

Makefile:23: tools/build/Makefile.include: No such file or directory

When the gpio tool make is invoked from the tools Makefile, srctree is
cleared and the current logic checks whether srctree equals the empty
string to determine the srctree location from CURDIR.

When the build is invoked from the selftests/gpio Makefile, srctree is
set to "." and the same empty-srctree logic is needed to determine
srctree.

Check that building_out_of_srctree is undefined as the condition for
both cases to fix the "make TARGETS=gpio kselftest" build failure.

Cc: stable@vger.kernel.org
Signed-off-by: Shuah Khan
Signed-off-by: Bartosz Golaszewski
Signed-off-by: Greg Kroah-Hartman
---
 tools/gpio/Makefile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tools/gpio/Makefile b/tools/gpio/Makefile
index 240eda014b371b..f8bc8656a54417 100644
--- a/tools/gpio/Makefile
+++ b/tools/gpio/Makefile
@@ -3,7 +3,11 @@ include ../scripts/Makefile.include

 bindir ?= /usr/bin

-ifeq ($(srctree),)
+# This will work when gpio is built in tools env. where srctree
+# isn't set and when invoked from selftests build, where srctree
+# is set to ".". building_out_of_srctree is undefined for in srctree
+# builds
+ifndef building_out_of_srctree
 srctree := $(patsubst %/,%,$(dir $(CURDIR)))
 srctree := $(patsubst %/,%,$(dir $(srctree)))
 endif

From ce117617cbefd5803d63f4eed50b0ac277995b57 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Tue, 5 Nov 2019 00:27:11 +0100
Subject: [PATCH 019/155] perf tools: Fix time sorting

commit 722ddfde366fd46205456a9c5ff9b3359dc9a75e upstream.

The final sort might get confused when the comparison is done over
bigger numbers than int like for -s time.

Check the following report for longer workloads:

  $ perf report -s time -F time,overhead --stdio

Fix hist_entry__sort() to properly return int64_t and not possible
cut int.

Fixes: 043ca389a318 ("perf tools: Use hpp formats to sort final output")
Signed-off-by: Jiri Olsa
Reviewed-by: Andi Kleen
Cc: Alexander Shishkin
Cc: Michael Petlan
Cc: Namhyung Kim
Cc: Peter Zijlstra
Cc: stable@vger.kernel.org # v3.16+
Link: http://lore.kernel.org/lkml/20191104232711.16055-1-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo
Signed-off-by: Greg Kroah-Hartman
---
 tools/perf/util/hist.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 5b8bc1fd943d67..c1f9615b02f756 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -1504,7 +1504,7 @@ int hists__collapse_resort(struct hists *hists, struct ui_progress *prog)
 	return 0;
 }

-static int hist_entry__sort(struct hist_entry *a, struct hist_entry *b)
+static int64_t hist_entry__sort(struct hist_entry *a, struct hist_entry *b)
 {
 	struct hists *hists = a->hists;
 	struct perf_hpp_fmt *fmt;
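[The dump_stack fix above is a general lock-acquisition pattern worth isolating; a simplified sketch (the real code also handles re-entry on the same CPU): waiters spin read-only until the lock looks free, so only one CPU at a time dirties the cache line with a cmpxchg instead of the whole herd hammering it:]

	static atomic_t dump_lock = ATOMIC_INIT(-1);

	static void lock_for_dump(int cpu)
	{
	retry:
		if (atomic_cmpxchg(&dump_lock, -1, cpu) != -1) {
			/* read-only spin: no cache-line ping-pong */
			do
				cpu_relax();
			while (atomic_read(&dump_lock) != -1);
			goto retry;
		}
	}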
From 53c12cc84eee81d01c6c9ae6009c2cf019c55049 Mon Sep 17 00:00:00 2001
From: Alex Deucher
Date: Wed, 30 Oct 2019 10:21:28 -0400
Subject: [PATCH 020/155] drm/radeon: fix si_enable_smc_cac() failed issue

commit 2c409ba81be25516afe05ae27a4a15da01740b01 upstream.

Need to set the dte flag on this asic.

Port the fix from amdgpu:
5cb818b861be114 ("drm/amd/amdgpu: fix si_enable_smc_cac() failed issue")

Reviewed-by: Yong Zhao
Signed-off-by: Alex Deucher
Cc: stable@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman
---
 drivers/gpu/drm/radeon/si_dpm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/radeon/si_dpm.c b/drivers/gpu/drm/radeon/si_dpm.c
index 90d5b41007bfd2..9e5645e4cb55b0 100644
--- a/drivers/gpu/drm/radeon/si_dpm.c
+++ b/drivers/gpu/drm/radeon/si_dpm.c
@@ -1956,6 +1956,7 @@ static void si_initialize_powertune_defaults(struct radeon_device *rdev)
 		case 0x682C:
 			si_pi->cac_weights = cac_weights_cape_verde_pro;
 			si_pi->dte_data = dte_data_sun_xt;
+			update_dte_from_pl2 = true;
 			break;
 		case 0x6825:
 		case 0x6827:

From ab19c1ddaa02c8a99dffcec578ec30979e853e22 Mon Sep 17 00:00:00 2001
From: Jason Gerecke
Date: Wed, 6 Nov 2019 11:59:46 -0800
Subject: [PATCH 021/155] HID: wacom: generic: Treat serial number and related fields as unsigned

commit ff479731c3859609530416a18ddb3db5db019b66 upstream.

The HID descriptors for most Wacom devices oddly declare the serial
number and other related fields as signed integers. When these numbers
are ingested by the HID subsystem, they are automatically sign-extended
into 32-bit integers. We treat the fields as unsigned elsewhere in the
kernel and userspace, however, so this sign-extension causes problems.
In particular, the sign-extended tool ID sent to userspace as ABS_MISC
does not properly match unsigned IDs used by xf86-input-wacom and
libwacom.

We introduce a function 'wacom_s32tou' that can undo the automatic sign
extension performed by 'hid_snto32'. We call this function when
processing the serial number and related fields to ensure that we are
dealing with and reporting the unsigned form.

We opt to use this method rather than adding a descriptor fixup in
'wacom_hid_usage_quirk' since it should be more robust in the face of
future devices.

Ref: https://github.com/linuxwacom/input-wacom/issues/134
Fixes: f85c9dc678 ("HID: wacom: generic: Support tool ID and additional tool types")
CC: # v4.10+
Signed-off-by: Jason Gerecke
Reviewed-by: Aaron Armstrong Skomra
Signed-off-by: Jiri Kosina
Signed-off-by: Greg Kroah-Hartman
---
 drivers/hid/wacom.h     | 15 +++++++++++++++
 drivers/hid/wacom_wac.c | 10 ++++++----
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/drivers/hid/wacom.h b/drivers/hid/wacom.h
index 3c37c3cbf6f100..9c0900c35b236b 100644
--- a/drivers/hid/wacom.h
+++ b/drivers/hid/wacom.h
@@ -205,6 +205,21 @@ static inline void wacom_schedule_work(struct wacom_wac *wacom_wac,
 	}
 }

+/*
+ * Convert a signed 32-bit integer to an unsigned n-bit integer. Undoes
+ * the normally-helpful work of 'hid_snto32' for fields that use signed
+ * ranges for questionable reasons.
+ */
+static inline __u32 wacom_s32tou(s32 value, __u8 n)
+{
+	switch (n) {
+	case 8:  return ((__u8)value);
+	case 16: return ((__u16)value);
+	case 32: return ((__u32)value);
+	}
+	return value & (1 << (n - 1)) ? value & (~(~0U << n)) : value;
+}
+
 extern const struct hid_device_id wacom_ids[];

 void wacom_wac_irq(struct wacom_wac *wacom_wac, size_t len);
diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c
index 2e0c4df6ad08e2..1eb8684036644d 100644
--- a/drivers/hid/wacom_wac.c
+++ b/drivers/hid/wacom_wac.c
@@ -2182,7 +2182,7 @@ static void wacom_wac_pen_event(struct hid_device *hdev, struct hid_field *field
 	case HID_DG_TOOLSERIALNUMBER:
 		if (value) {
 			wacom_wac->serial[0] = (wacom_wac->serial[0] & ~0xFFFFFFFFULL);
-			wacom_wac->serial[0] |= (__u32)value;
+			wacom_wac->serial[0] |= wacom_s32tou(value, field->report_size);
 		}
 		return;
 	case HID_DG_TWIST:
@@ -2198,15 +2198,17 @@ static void wacom_wac_pen_event(struct hid_device *hdev, struct hid_field *field
 		return;
 	case WACOM_HID_WD_SERIALHI:
 		if (value) {
+			__u32 raw_value = wacom_s32tou(value, field->report_size);
+
 			wacom_wac->serial[0] = (wacom_wac->serial[0] & 0xFFFFFFFF);
-			wacom_wac->serial[0] |= ((__u64)value) << 32;
+			wacom_wac->serial[0] |= ((__u64)raw_value) << 32;
 			/*
 			 * Non-USI EMR devices may contain additional tool type
 			 * information here. See WACOM_HID_WD_TOOLTYPE case for
 			 * more details.
 			 */
 			if (value >> 20 == 1) {
-				wacom_wac->id[0] |= value & 0xFFFFF;
+				wacom_wac->id[0] |= raw_value & 0xFFFFF;
 			}
 		}
 		return;
@@ -2218,7 +2220,7 @@ static void wacom_wac_pen_event(struct hid_device *hdev, struct hid_field *field
 			 * bitwise OR so the complete value can be built
 			 * up over time :(
 			 */
-			wacom_wac->id[0] |= value;
+			wacom_wac->id[0] |= wacom_s32tou(value, field->report_size);
 			return;
 	case WACOM_HID_WD_OFFSETLEFT:
 		if (features->offset_left && value != features->offset_left)
From 67f5c06adf616c22d78e4b580b3899c9a8ed8366 Mon Sep 17 00:00:00 2001
From: Catalin Marinas
Date: Wed, 6 Nov 2019 15:41:05 +0000
Subject: [PATCH 022/155] arm64: Do not mask out PTE_RDONLY in pte_same()

commit 6767df245f4736d0cf0c6fb7cf9cf94b27414245 upstream.

Following commit 73e86cb03cf2 ("arm64: Move PTE_RDONLY bit handling out
of set_pte_at()"), the PTE_RDONLY bit is no longer managed by
set_pte_at() but built into the PAGE_* attribute definitions.
Consequently, pte_same() must include this bit when checking two PTEs
for equality.

Remove the arm64-specific pte_same() function, practically reverting
commit 747a70e60b72 ("arm64: Fix copy-on-write referencing in HugeTLB")

Fixes: 73e86cb03cf2 ("arm64: Move PTE_RDONLY bit handling out of set_pte_at()")
Cc: # 4.14.x-
Cc: Will Deacon
Cc: Steve Capper
Reported-by: John Stultz
Signed-off-by: Catalin Marinas
Signed-off-by: Will Deacon
Signed-off-by: Greg Kroah-Hartman
---
 arch/arm64/include/asm/pgtable.h | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 4cf248185e6f97..aa3b8dd8fc35f4 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -258,23 +258,6 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
 	set_pte(ptep, pte);
 }

-#define __HAVE_ARCH_PTE_SAME
-static inline int pte_same(pte_t pte_a, pte_t pte_b)
-{
-	pteval_t lhs, rhs;
-
-	lhs = pte_val(pte_a);
-	rhs = pte_val(pte_b);
-
-	if (pte_present(pte_a))
-		lhs &= ~PTE_RDONLY;
-
-	if (pte_present(pte_b))
-		rhs &= ~PTE_RDONLY;
-
-	return (lhs == rhs);
-}
-
 /*
  * Huge pte definitions.
  */

From 58af68f4f16de69243dea8ee501da43dd22d6d65 Mon Sep 17 00:00:00 2001
From: Luis Henriques
Date: Fri, 25 Oct 2019 14:05:24 +0100
Subject: [PATCH 023/155] ceph: fix use-after-free in __ceph_remove_cap()

commit ea60ed6fcf29eebc78f2ce91491e6309ee005a01 upstream.

KASAN reports a use-after-free when running xfstest generic/531, with
the following trace:

[ 293.903362] kasan_report+0xe/0x20
[ 293.903365] rb_erase+0x1f/0x790
[ 293.903370] __ceph_remove_cap+0x201/0x370
[ 293.903375] __ceph_remove_caps+0x4b/0x70
[ 293.903380] ceph_evict_inode+0x4e/0x360
[ 293.903386] evict+0x169/0x290
[ 293.903390] __dentry_kill+0x16f/0x250
[ 293.903394] dput+0x1c6/0x440
[ 293.903398] __fput+0x184/0x330
[ 293.903404] task_work_run+0xb9/0xe0
[ 293.903410] exit_to_usermode_loop+0xd3/0xe0
[ 293.903413] do_syscall_64+0x1a0/0x1c0
[ 293.903417] entry_SYSCALL_64_after_hwframe+0x44/0xa9

This happens because __ceph_remove_cap() may queue a cap release
(__ceph_queue_cap_release) which can be scheduled before that cap is
removed from the inode list with

	rb_erase(&cap->ci_node, &ci->i_caps);

And, when this finally happens, the use-after-free will occur.

This can be fixed by removing the cap from the inode list before being
removed from the session list, and thus eliminating the risk of an UAF.

Cc: stable@vger.kernel.org
Signed-off-by: Luis Henriques
Reviewed-by: Jeff Layton
Signed-off-by: Ilya Dryomov
Signed-off-by: Greg Kroah-Hartman
---
 fs/ceph/caps.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index df95e39ccd45b6..c3a3ee74e2d840 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -935,6 +935,11 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)

 	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);

+	/* remove from inode's cap rbtree, and clear auth cap */
+	rb_erase(&cap->ci_node, &ci->i_caps);
+	if (ci->i_auth_cap == cap)
+		ci->i_auth_cap = NULL;
+
 	/* remove from session list */
 	spin_lock(&session->s_cap_lock);
 	if (session->s_cap_iterator == cap) {
@@ -970,11 +975,6 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)

 	spin_unlock(&session->s_cap_lock);

-	/* remove from inode list */
-	rb_erase(&cap->ci_node, &ci->i_caps);
-	if (ci->i_auth_cap == cap)
-		ci->i_auth_cap = NULL;
-
 	if (removed)
 		ceph_put_cap(mdsc, cap);

From 0e65dac6c9ec068f9cc1902535207f9c8bc57948 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Tue, 29 Oct 2019 13:53:29 +0000
Subject: [PATCH 024/155] ceph: add missing check in d_revalidate snapdir handling

commit 1f08529c84cfecaf1261ed9b7e17fab18541c58f upstream.

We should not play with dcache without parent locked...

Cc: stable@vger.kernel.org
Signed-off-by: Al Viro
Signed-off-by: Jeff Layton
Signed-off-by: Ilya Dryomov
Signed-off-by: Greg Kroah-Hartman
---
 fs/ceph/inode.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 879bc082509318..3818027c12f5a1 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1347,6 +1347,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
 		dout(" final dn %p\n", dn);
 	} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
 		    req->r_op == CEPH_MDS_OP_MKSNAP) &&
+		   test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
 		   !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
 		struct dentry *dn = req->r_dentry;
 		struct inode *dir = req->r_parent;

From ba18d43fd9d0e383bdea7f108e08f2eef83a104b Mon Sep 17 00:00:00 2001
From: Fabrice Gasnier
Date: Fri, 25 Oct 2019 17:04:20 +0200
Subject: [PATCH 025/155] iio: adc: stm32-adc: fix stopping dma

commit e6afcf6c598d6f3a0c9c408bfeddb3f5730608b0 upstream.

There maybe a race when using dmaengine_terminate_all(). The predisable
routine may call iio_triggered_buffer_predisable() prior to a pending DMA
callback.

Adopt dmaengine_terminate_sync() to ensure there's no pending DMA request
before calling iio_triggered_buffer_predisable().

Fixes: 2763ea0585c9 ("iio: adc: stm32: add optional dma support")
Signed-off-by: Fabrice Gasnier
Cc:
Signed-off-by: Jonathan Cameron
Signed-off-by: Greg Kroah-Hartman
---
 drivers/iio/adc/stm32-adc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/iio/adc/stm32-adc.c b/drivers/iio/adc/stm32-adc.c
index e59cbc9ad4f6fa..258a4712167a26 100644
--- a/drivers/iio/adc/stm32-adc.c
+++ b/drivers/iio/adc/stm32-adc.c
@@ -1343,7 +1343,7 @@ static int stm32_adc_dma_start(struct iio_dev *indio_dev)
 	cookie = dmaengine_submit(desc);
 	ret = dma_submit_error(cookie);
 	if (ret) {
-		dmaengine_terminate_all(adc->dma_chan);
+		dmaengine_terminate_sync(adc->dma_chan);
 		return ret;
 	}

@@ -1416,7 +1416,7 @@ static int stm32_adc_buffer_predisable(struct iio_dev *indio_dev)
 		dev_err(&indio_dev->dev, "predisable failed\n");

 	if (adc->dma_chan)
-		dmaengine_terminate_all(adc->dma_chan);
+		dmaengine_terminate_sync(adc->dma_chan);

 	if (stm32_adc_set_trig(indio_dev, NULL))
 		dev_err(&indio_dev->dev, "Can't clear trigger\n");
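[The stm32-adc change hinges on an ordering guarantee worth making explicit; a sketch, not driver code: dmaengine_terminate_all() only issues the terminate, so a completion callback may still be running when it returns, while the _sync variant also waits for any in-flight callback to finish. Only after that is it safe to tear down state the callback touches:]

	dmaengine_terminate_sync(chan);	/* no DMA callback can run past here */
	iio_triggered_buffer_predisable(indio_dev);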
Fixes: e4f959390178 ("iio: imu: adis16480 switch sampling frequency attr to core support") Signed-off-by: Alexandru Ardelean Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/imu/adis16480.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/iio/imu/adis16480.c b/drivers/iio/imu/adis16480.c index 12898424d8382f..6f975538996cdc 100644 --- a/drivers/iio/imu/adis16480.c +++ b/drivers/iio/imu/adis16480.c @@ -266,8 +266,11 @@ static int adis16480_set_freq(struct iio_dev *indio_dev, int val, int val2) struct adis16480 *st = iio_priv(indio_dev); unsigned int t; + if (val < 0 || val2 < 0) + return -EINVAL; + t = val * 1000 + val2 / 1000; - if (t <= 0) + if (t == 0) return -EINVAL; t = 2460000 / t; From d380065d7943c5b31b67cd2d5c2ee6745567c7fe Mon Sep 17 00:00:00 2001 From: Andreas Klinger Date: Sun, 6 Oct 2019 16:29:56 +0200 Subject: [PATCH 027/155] iio: srf04: fix wrong limitation in distance measuring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 431f7667bd6889a274913162dfd19cce9d84848e upstream. The measured time value in the driver is limited to the maximum distance which can be read by the sensor. This limitation was wrong and is fixed by this patch. It also takes into account that we are supporting a variety of sensors today and that the recently added sensors have a higher maximum distance range. Suggested-by: Zbyněk Kocur Tested-by: Zbyněk Kocur Signed-off-by: Andreas Klinger Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/proximity/srf04.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/iio/proximity/srf04.c b/drivers/iio/proximity/srf04.c index e37667f933b36a..8a6ab9691832db 100644 --- a/drivers/iio/proximity/srf04.c +++ b/drivers/iio/proximity/srf04.c @@ -105,7 +105,7 @@ static int srf04_read(struct srf04_data *data) udelay(10); gpiod_set_value(data->gpiod_trig, 0); - /* it cannot take more than 20 ms */ + /* it should not take more than 20 ms until echo is rising */ ret = wait_for_completion_killable_timeout(&data->rising, HZ/50); if (ret < 0) { mutex_unlock(&data->lock); @@ -115,7 +115,8 @@ static int srf04_read(struct srf04_data *data) return -ETIMEDOUT; } - ret = wait_for_completion_killable_timeout(&data->falling, HZ/50); + /* it cannot take more than 50 ms until echo is falling */ + ret = wait_for_completion_killable_timeout(&data->falling, HZ/20); if (ret < 0) { mutex_unlock(&data->lock); return ret; @@ -130,19 +131,19 @@ static int srf04_read(struct srf04_data *data) dt_ns = ktime_to_ns(ktime_dt); /* - * measuring more than 3 meters is beyond the capabilities of - * the sensor + * measuring more than 6,45 meters is beyond the capabilities of + * the supported sensors * ==> filter out invalid results for not measuring echos of * another us sensor * * formula: - * distance 3 m - * time = ---------- = --------- = 9404389 ns - * speed 319 m/s + * distance 6,45 * 2 m + * time = ---------- = ------------ = 40438871 ns + * speed 319 m/s * * using a minimum speed at -20 °C of 319 m/s */ - if (dt_ns > 9404389) + if (dt_ns > 40438871) return -EIO; time_ns = dt_ns; @@ -154,20 +155,20 @@ static int srf04_read(struct srf04_data *data) * with Temp in °C * and speed in m/s * - * use 343 m/s as ultrasonic speed at 20 °C here in absence of the + * use 343,5 m/s as ultrasonic speed at 20 °C here in absence of the * temperature * * therefore: * time 343
- * distance = ------ * ----- - * 10^6 2 + * time 343,5 time * 106 + * distance = ------ * ------- = ------------ + * 10^6 2 617176 * with time in ns * and distance in mm (one way) * - * because we limit to 3 meters the multiplication with 343 just + * because we limit to 6,45 meters the multiplication with 106 just * fits into 32 bit */ - distance_mm = time_ns * 343 / 2000000; + distance_mm = time_ns * 106 / 617176; return distance_mm; } From f8385a4e9ed83dc727aa51cdc51c23a37159c181 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Thu, 31 Oct 2019 11:06:24 +0100 Subject: [PATCH 028/155] netfilter: nf_tables: Align nft_expr private data to 64-bit commit 250367c59e6ba0d79d702a059712d66edacd4a1a upstream. Invoking the following commands on a 32-bit architecture with strict alignment requirements (such as an ARMv7-based Raspberry Pi) results in an alignment exception: # nft add table ip test-ip4 # nft add chain ip test-ip4 output { type filter hook output priority 0; } # nft add rule ip test-ip4 output quota 1025 bytes Alignment trap: not handling instruction e1b26f9f at [<7f4473f8>] Unhandled fault: alignment exception (0x001) at 0xb832e824 Internal error: : 1 [#1] PREEMPT SMP ARM Hardware name: BCM2835 [<7f4473fc>] (nft_quota_do_init [nft_quota]) [<7f447448>] (nft_quota_init [nft_quota]) [<7f4260d0>] (nf_tables_newrule [nf_tables]) [<7f4168dc>] (nfnetlink_rcv_batch [nfnetlink]) [<7f416bd0>] (nfnetlink_rcv [nfnetlink]) [<8078b334>] (netlink_unicast) [<8078b664>] (netlink_sendmsg) [<8071b47c>] (sock_sendmsg) [<8071bd18>] (___sys_sendmsg) [<8071ce3c>] (__sys_sendmsg) [<8071ce94>] (sys_sendmsg) The reason is that nft_quota_do_init() calls atomic64_set() on an atomic64_t which is only aligned to 32-bit, not 64-bit, because it succeeds struct nft_expr in memory which only contains a 32-bit pointer. Fix by aligning the nft_expr private data to 64-bit. Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Lukas Wunner Cc: stable@vger.kernel.org # v3.13+ Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman --- include/net/netfilter/nf_tables.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 59a4f50ffe8dcf..a9704c57430dba 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -759,7 +759,8 @@ struct nft_expr_ops { */ struct nft_expr { const struct nft_expr_ops *ops; - unsigned char data[]; + unsigned char data[] + __attribute__((aligned(__alignof__(u64)))); }; static inline void *nft_expr_priv(const struct nft_expr *expr) From ef1bf406881d61512911b56c77e96a1489f136b6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 24 Aug 2019 17:49:55 +0300 Subject: [PATCH 029/155] netfilter: ipset: Fix an error code in ip_set_sockfn_get() commit 30b7244d79651460ff114ba8f7987ed94c86b99a upstream. The copy_to_user() function returns the number of bytes remaining to be copied. In this code, that positive return is checked at the end of the function and we return zero/success. What we should do instead is return -EFAULT. 
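The nft_expr alignment problem above can be reproduced in miniature: without the attribute, the flexible array begins right after a 4-byte pointer on 32-bit targets, so a u64 or atomic64_t placed in it is misaligned. A standalone sketch with illustrative struct names (the attribute syntax is the GCC/clang one the kernel uses):

#include <stdalign.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Without the attribute, data[] is only pointer-aligned; with it, the
 * struct is padded so data[] starts on an 8-byte boundary. */
struct expr_unaligned {
	const void *ops;
	unsigned char data[];
};

struct expr_aligned {
	const void *ops;
	unsigned char data[] __attribute__((aligned(alignof(uint64_t))));
};

int main(void)
{
	/* prints "4 8" on ARMv7; on 64-bit targets both are already 8 */
	printf("%zu %zu\n",
	       offsetof(struct expr_unaligned, data),
	       offsetof(struct expr_aligned, data));
	return 0;
}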
Fixes: a7b4f989a629 ("netfilter: ipset: IP set core support") Signed-off-by: Dan Carpenter Signed-off-by: Jozsef Kadlecsik Signed-off-by: Greg Kroah-Hartman --- net/netfilter/ipset/ip_set_core.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index dbf17d3596a691..94d74ec61f42cc 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1950,8 +1950,9 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) } req_version->version = IPSET_PROTOCOL; - ret = copy_to_user(user, req_version, - sizeof(struct ip_set_req_version)); + if (copy_to_user(user, req_version, + sizeof(struct ip_set_req_version))) + ret = -EFAULT; goto done; } case IP_SET_OP_GET_BYNAME: { @@ -2008,7 +2009,8 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) } /* end of switch(op) */ copy: - ret = copy_to_user(user, data, copylen); + if (copy_to_user(user, data, copylen)) + ret = -EFAULT; done: vfree(data); From 5b300c0df1c1c603c0bb136eb7d827202215631f Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Mon, 28 Oct 2019 09:06:50 +0200 Subject: [PATCH 030/155] intel_th: pci: Add Comet Lake PCH support commit 3adbb5718dd5264666ddbc2b9b43799d292e9cb6 upstream. This adds support for Intel TH on Comet Lake PCH. Signed-off-by: Alexander Shishkin Reviewed-by: Andy Shevchenko Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20191028070651.9770-7-alexander.shishkin@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/intel_th/pci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c index 140b18d858e8a3..05b57425b9a9fb 100644 --- a/drivers/hwtracing/intel_th/pci.c +++ b/drivers/hwtracing/intel_th/pci.c @@ -183,6 +183,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x02a6), .driver_data = (kernel_ulong_t)&intel_th_2x, }, + { + /* Comet Lake PCH */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x06a6), + .driver_data = (kernel_ulong_t)&intel_th_2x, + }, { /* Ice Lake NNPI */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x45c5), From a061cd0bc0b1b7365a57d15622f1ca2b90dd9b07 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Mon, 28 Oct 2019 09:06:51 +0200 Subject: [PATCH 031/155] intel_th: pci: Add Jasper Lake PCH support commit 9d55499d8da49e9261e95a490f3fda41d955f505 upstream. This adds support for Intel TH on Jasper Lake PCH. 
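The two Intel TH patches (Comet Lake and Jasper Lake) are pure device-ID table additions. A toy matcher shows the pattern being extended; the vendor/device values are the ones from these two patches, everything else is illustrative:

#include <stdio.h>

struct pci_id { unsigned short vendor, device; };

/* 0x8086 is PCI_VENDOR_ID_INTEL; a zeroed entry terminates the table,
 * like the { 0 } sentinel in intel_th_pci_id_table. */
static const struct pci_id th_ids[] = {
	{ 0x8086, 0x06a6 },	/* Comet Lake PCH */
	{ 0x8086, 0x4da6 },	/* Jasper Lake PCH */
	{ 0, 0 }
};

static int th_match(unsigned short vendor, unsigned short device)
{
	const struct pci_id *p;

	for (p = th_ids; p->vendor; p++)
		if (p->vendor == vendor && p->device == device)
			return 1;
	return 0;
}

int main(void)
{
	printf("%d\n", th_match(0x8086, 0x4da6));	/* 1: probe would run */
	return 0;
}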
Signed-off-by: Alexander Shishkin Reviewed-by: Andy Shevchenko Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20191028070651.9770-8-alexander.shishkin@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/intel_th/pci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c index 05b57425b9a9fb..7486d5d671860a 100644 --- a/drivers/hwtracing/intel_th/pci.c +++ b/drivers/hwtracing/intel_th/pci.c @@ -198,6 +198,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xa0a6), .driver_data = (kernel_ulong_t)&intel_th_2x, }, + { + /* Jasper Lake PCH */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x4da6), + .driver_data = (kernel_ulong_t)&intel_th_2x, + }, { 0 }, }; From 0d9e2feded19e7fc8212acc0202befcfef96bb97 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 1 Oct 2019 12:29:14 +0200 Subject: [PATCH 032/155] can: usb_8dev: fix use-after-free on disconnect commit 3759739426186a924675651b388d1c3963c5710e upstream. The driver was accessing its driver data after having freed it. Fixes: 0024d8ad1639 ("can: usb_8dev: Add support for USB2CAN interface from 8 devices") Cc: stable # 3.9 Cc: Bernd Krumboeck Cc: Wolfgang Grandegger Signed-off-by: Johan Hovold Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/usb/usb_8dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/can/usb/usb_8dev.c b/drivers/net/can/usb/usb_8dev.c index 27861c417c9404..3e44164736079d 100644 --- a/drivers/net/can/usb/usb_8dev.c +++ b/drivers/net/can/usb/usb_8dev.c @@ -1007,9 +1007,8 @@ static void usb_8dev_disconnect(struct usb_interface *intf) netdev_info(priv->netdev, "device disconnected\n"); unregister_netdev(priv->netdev); - free_candev(priv->netdev); - unlink_all_urbs(priv); + free_candev(priv->netdev); } } From 774e85d16ac2fe18fba76c143aedb8dfbb212f80 Mon Sep 17 00:00:00 2001 From: Kurt Van Dijck Date: Tue, 1 Oct 2019 09:40:36 +0200 Subject: [PATCH 033/155] can: c_can: c_can_poll(): only read status register after status IRQ commit 3cb3eaac52c0f145d895f4b6c22834d5f02b8569 upstream. When the status register is read without the status IRQ pending, the chip may not raise the interrupt line for an upcoming status interrupt and the driver may miss a status interrupt. It is critical that the BUSOFF status interrupt is forwarded to the higher layers, since no more interrupts will follow without intervention. Thanks to Wolfgang and Joe for bringing up the first idea. 
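The c_can change is a small producer/consumer handshake between hard-IRQ and NAPI context. A userspace sketch of it (the field name mirrors the patch; everything else is simplified):

#include <stdatomic.h>
#include <stdio.h>

#define INT_STS_PENDING 0x8000

static atomic_int sie_pending;
static int last_status;

/* ISR side: latch that a status interrupt was seen; the real driver
 * then disables interrupts and schedules NAPI. */
static void isr(int int_reg)
{
	if (int_reg & INT_STS_PENDING)
		atomic_store(&sie_pending, 1);
}

/* poll side: read and ack the status register only when the latch was
 * set, so a BUSOFF status event cannot be consumed unnoticed. */
static int poll_status(int hw_status)
{
	if (atomic_exchange(&sie_pending, 0))
		last_status = hw_status;
	return last_status;	/* unchanged when no status IRQ was pending */
}

int main(void)
{
	isr(INT_STS_PENDING);
	printf("%#x\n", poll_status(0x60));	/* 0x60 is an arbitrary value */
	return 0;
}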
Signed-off-by: Kurt Van Dijck Cc: Wolfgang Grandegger Cc: Joe Burmeister Fixes: fa39b54ccf28 ("can: c_can: Get rid of pointless interrupts") Cc: linux-stable Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/c_can/c_can.c | 25 ++++++++++++++++++++----- drivers/net/can/c_can/c_can.h | 1 + 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/drivers/net/can/c_can/c_can.c b/drivers/net/can/c_can/c_can.c index 606b7d8ffe138f..9b61bfbea6cd1a 100644 --- a/drivers/net/can/c_can/c_can.c +++ b/drivers/net/can/c_can/c_can.c @@ -97,6 +97,9 @@ #define BTR_TSEG2_SHIFT 12 #define BTR_TSEG2_MASK (0x7 << BTR_TSEG2_SHIFT) +/* interrupt register */ +#define INT_STS_PENDING 0x8000 + /* brp extension register */ #define BRP_EXT_BRPE_MASK 0x0f #define BRP_EXT_BRPE_SHIFT 0 @@ -1029,10 +1032,16 @@ static int c_can_poll(struct napi_struct *napi, int quota) u16 curr, last = priv->last_status; int work_done = 0; - priv->last_status = curr = priv->read_reg(priv, C_CAN_STS_REG); - /* Ack status on C_CAN. D_CAN is self clearing */ - if (priv->type != BOSCH_D_CAN) - priv->write_reg(priv, C_CAN_STS_REG, LEC_UNUSED); + /* Only read the status register if a status interrupt was pending */ + if (atomic_xchg(&priv->sie_pending, 0)) { + priv->last_status = curr = priv->read_reg(priv, C_CAN_STS_REG); + /* Ack status on C_CAN. D_CAN is self clearing */ + if (priv->type != BOSCH_D_CAN) + priv->write_reg(priv, C_CAN_STS_REG, LEC_UNUSED); + } else { + /* no change detected ... */ + curr = last; + } /* handle state changes */ if ((curr & STATUS_EWARN) && (!(last & STATUS_EWARN))) { @@ -1083,10 +1092,16 @@ static irqreturn_t c_can_isr(int irq, void *dev_id) { struct net_device *dev = (struct net_device *)dev_id; struct c_can_priv *priv = netdev_priv(dev); + int reg_int; - if (!priv->read_reg(priv, C_CAN_INT_REG)) + reg_int = priv->read_reg(priv, C_CAN_INT_REG); + if (!reg_int) return IRQ_NONE; + /* save for later use */ + if (reg_int & INT_STS_PENDING) + atomic_set(&priv->sie_pending, 1); + /* disable all interrupts and schedule the NAPI */ c_can_irq_control(priv, false); napi_schedule(&priv->napi); diff --git a/drivers/net/can/c_can/c_can.h b/drivers/net/can/c_can/c_can.h index 8acdc7fa4792f2..d5567a7c1c6d46 100644 --- a/drivers/net/can/c_can/c_can.h +++ b/drivers/net/can/c_can/c_can.h @@ -198,6 +198,7 @@ struct c_can_priv { struct net_device *dev; struct device *device; atomic_t tx_active; + atomic_t sie_pending; unsigned long tx_dir; int last_status; u16 (*read_reg) (const struct c_can_priv *priv, enum reg index); From 3ee2b01c435ba5393764e3b187d3b728745ecf94 Mon Sep 17 00:00:00 2001 From: Stephane Grosjean Date: Tue, 8 Oct 2019 10:35:44 +0200 Subject: [PATCH 034/155] can: peak_usb: fix a potential out-of-sync while decoding packets commit de280f403f2996679e2607384980703710576fed upstream. When decoding a buffer received from PCAN-USB, the first timestamp read in a packet is a 16-bit coded time base, and the next ones are an 8-bit offset to this base, regardless of the type of packet read. This patch corrects a potential loss of synchronization by using a timestamp index maintained while reading the buffer, rather than an index of received data packets, to determine the size of the timestamp to be read from the packet being decoded.
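The corrected decoding rule is compact enough to state as code. A userspace sketch, where rec_ts_idx is the field the patch introduces and the rest is illustrative:

#include <stddef.h>
#include <stdio.h>

struct msg_ctx { unsigned int rec_ts_idx; };

/* The width of the next timestamp depends on how many timestamps this
 * buffer has already yielded, not on how many data records were seen,
 * since status records can carry timestamps too. */
static size_t ts_width(struct msg_ctx *mc)
{
	size_t w = mc->rec_ts_idx ? 1 : 2;	/* 16-bit base, then 8-bit offsets */

	mc->rec_ts_idx++;			/* counted for every record type */
	return w;
}

int main(void)
{
	struct msg_ctx mc = { 0 };
	size_t a, b, c;

	a = ts_width(&mc);
	b = ts_width(&mc);
	c = ts_width(&mc);
	printf("%zu %zu %zu\n", a, b, c);	/* "2 1 1": only the first is wide */
	return 0;
}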
Signed-off-by: Stephane Grosjean Fixes: 46be265d3388 ("can: usb: PEAK-System Technik PCAN-USB specific part") Cc: linux-stable Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/usb/peak_usb/pcan_usb.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c index 838545ce468d1b..e626c2afbbb11e 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb.c @@ -108,7 +108,7 @@ struct pcan_usb_msg_context { u8 *end; u8 rec_cnt; u8 rec_idx; - u8 rec_data_idx; + u8 rec_ts_idx; struct net_device *netdev; struct pcan_usb *pdev; }; @@ -552,10 +552,15 @@ static int pcan_usb_decode_status(struct pcan_usb_msg_context *mc, mc->ptr += PCAN_USB_CMD_ARGS; if (status_len & PCAN_USB_STATUSLEN_TIMESTAMP) { - int err = pcan_usb_decode_ts(mc, !mc->rec_idx); + int err = pcan_usb_decode_ts(mc, !mc->rec_ts_idx); if (err) return err; + + /* Next packet in the buffer will have a timestamp on a single + * byte + */ + mc->rec_ts_idx++; } switch (f) { @@ -638,10 +643,13 @@ static int pcan_usb_decode_data(struct pcan_usb_msg_context *mc, u8 status_len) cf->can_dlc = get_can_dlc(rec_len); - /* first data packet timestamp is a word */ - if (pcan_usb_decode_ts(mc, !mc->rec_data_idx)) + /* Only first packet timestamp is a word */ + if (pcan_usb_decode_ts(mc, !mc->rec_ts_idx)) goto decode_failed; + /* Next packet in the buffer will have a timestamp on a single byte */ + mc->rec_ts_idx++; + /* read data */ memset(cf->data, 0x0, sizeof(cf->data)); if (status_len & PCAN_USB_STATUSLEN_RTR) { @@ -695,7 +703,6 @@ static int pcan_usb_decode_msg(struct peak_usb_device *dev, u8 *ibuf, u32 lbuf) /* handle normal can frames here */ } else { err = pcan_usb_decode_data(&mc, sl); - mc.rec_data_idx++; } } From f0468c0630a52ed3909321de8c057c9f8e5e8ada Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Wed, 9 Oct 2019 15:48:48 +0200 Subject: [PATCH 035/155] can: rx-offload: can_rx_offload_queue_sorted(): fix error handling, avoid skb mem leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit ca913f1ac024559ebc17f0b599af262f0ad997c9 upstream. If the rx-offload skb_queue is full, can_rx_offload_queue_sorted() does not queue the skb and returns with an error. None of the callers of this function issue a kfree_skb() to free the skb that was not queued. This results in a memory leak. This patch fixes the problem by freeing the skb in case of a full queue. The return value is adjusted to -ENOBUFS to better reflect the actual problem. The device stats handling is left to the callers, as this function might be used in both the rx and tx path.
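The rx-offload change really encodes an ownership contract: a queueing function that fails must consume the buffer itself, so no caller can leak it. A userspace sketch of that contract (illustrative names; -ENOBUFS is 105 on Linux):

#include <stdio.h>
#include <stdlib.h>

struct queue { unsigned int len, max; };

/* Consumes 'buf' on every path, as can_rx_offload_queue_sorted() now
 * does with the skb; the error code distinguishes "queue full"
 * (-ENOBUFS) from an allocation failure (-ENOMEM). */
static int enqueue(struct queue *q, void *buf)
{
	if (q->len > q->max) {
		free(buf);	/* reject and consume instead of leaking */
		return -105;	/* -ENOBUFS */
	}
	q->len++;		/* ...link buf into the queue here... */
	return 0;
}

int main(void)
{
	struct queue q = { .len = 1, .max = 0 };	/* force the full case */

	printf("%d\n", enqueue(&q, malloc(16)));	/* -105, and no leak */
	return 0;
}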
Fixes: 55059f2b7f86 ("can: rx-offload: introduce can_rx_offload_get_echo_skb() and can_rx_offload_queue_sorted() functions") Cc: linux-stable Cc: Martin Hundebøll Reported-by: Martin Hundebøll Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/rx-offload.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/can/rx-offload.c b/drivers/net/can/rx-offload.c index d227db45fec977..1a7c183e66783c 100644 --- a/drivers/net/can/rx-offload.c +++ b/drivers/net/can/rx-offload.c @@ -216,8 +216,10 @@ int can_rx_offload_queue_sorted(struct can_rx_offload *offload, unsigned long flags; if (skb_queue_len(&offload->skb_queue) > - offload->skb_queue_len_max) - return -ENOMEM; + offload->skb_queue_len_max) { + kfree_skb(skb); + return -ENOBUFS; + } cb = can_rx_offload_get_cb(skb); cb->timestamp = timestamp; From 3f5e99731cd55f9be6468e9af7985a838ce7255b Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Thu, 19 Sep 2019 21:44:38 -0500 Subject: [PATCH 036/155] can: gs_usb: gs_can_open(): prevent memory leak commit fb5be6a7b4863ecc44963bb80ca614584b6c7817 upstream. In gs_can_open() if usb_submit_urb() fails the allocated urb should be released. Fixes: d08e973a77d1 ("can: gs_usb: Added support for the GS_USB CAN devices") Cc: linux-stable Signed-off-by: Navid Emamdoost Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/usb/gs_usb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index 8bf80ad9dc44cf..bfbf809496009b 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -631,6 +631,7 @@ static int gs_can_open(struct net_device *netdev) rc); usb_unanchor_urb(urb); + usb_free_urb(urb); break; } From 72e535208bc328e0617a8adb242bf8d89cdfd7d4 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 1 Oct 2019 12:29:13 +0200 Subject: [PATCH 037/155] can: mcba_usb: fix use-after-free on disconnect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4d6636498c41891d0482a914dd570343a838ad79 upstream. The driver was accessing its driver data after having freed it. Fixes: 51f3baad7de9 ("can: mcba_usb: Add support for Microchip CAN BUS Analyzer") Cc: stable # 4.12 Cc: Remigiusz Kołłątaj Reported-by: syzbot+e29b17e5042bbc56fae9@syzkaller.appspotmail.com Signed-off-by: Johan Hovold Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/usb/mcba_usb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c index e0c24abce16ce9..070e1ba7973690 100644 --- a/drivers/net/can/usb/mcba_usb.c +++ b/drivers/net/can/usb/mcba_usb.c @@ -887,9 +887,8 @@ static void mcba_usb_disconnect(struct usb_interface *intf) netdev_info(priv->netdev, "device disconnected\n"); unregister_candev(priv->netdev); - free_candev(priv->netdev); - mcba_urb_unlink(priv); + free_candev(priv->netdev); } static struct usb_driver mcba_usb_driver = { From 89a23a5d03956b44eac1a778806449577d690c6c Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 23 Oct 2019 10:27:05 +0200 Subject: [PATCH 038/155] can: peak_usb: fix slab info leak commit f7a1337f0d29b98733c8824e165fca3371d7d4fd upstream. Fix a small slab info leak due to a failure to clear the command buffer at allocation.
The first 16 bytes of the command buffer are always sent to the device in pcan_usb_send_cmd() even though only the first two may have been initialised in case no argument payload is provided (e.g. when waiting for a response). Fixes: bb4785551f64 ("can: usb: PEAK-System Technik USB adapters driver core") Cc: stable # 3.4 Reported-by: syzbot+863724e7128e14b26732@syzkaller.appspotmail.com Signed-off-by: Johan Hovold Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/usb/peak_usb/pcan_usb_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c index 059282a6065c6a..85d92f129af2d8 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c @@ -776,7 +776,7 @@ static int peak_usb_create_dev(const struct peak_usb_adapter *peak_usb_adapter, dev = netdev_priv(netdev); /* allocate a buffer large enough to send commands */ - dev->cmd_buf = kmalloc(PCAN_USB_MAX_CMD_LEN, GFP_KERNEL); + dev->cmd_buf = kzalloc(PCAN_USB_MAX_CMD_LEN, GFP_KERNEL); if (!dev->cmd_buf) { err = -ENOMEM; goto lbl_free_candev; From 2661af799789d9c810e54124585ffc76705b0857 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Sat, 7 Oct 2017 16:02:21 +0200 Subject: [PATCH 039/155] configfs: Fix bool initialization/comparison commit 3f6928c347707a65cee10a9f54b85ad5fb078b3f upstream. Bool initializations should use true and false. Bool tests don't need comparisons. Signed-off-by: Thomas Meyer Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- fs/configfs/file.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 39da1103d34153..62580dba355212 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -166,7 +166,7 @@ configfs_read_bin_file(struct file *file, char __user *buf, retval = -ETXTBSY; goto out; } - buffer->read_in_progress = 1; + buffer->read_in_progress = true; if (buffer->needs_read_fill) { /* perform first read with buf == NULL to get extent */ @@ -325,7 +325,7 @@ configfs_write_bin_file(struct file *file, const char __user *buf, len = -ETXTBSY; goto out; } - buffer->write_in_progress = 1; + buffer->write_in_progress = true; /* buffer grows? */ if (*ppos + count > buffer->bin_buffer_size) { @@ -429,8 +429,8 @@ static int check_perm(struct inode * inode, struct file * file, int type) } mutex_init(&buffer->mutex); buffer->needs_read_fill = 1; - buffer->read_in_progress = 0; - buffer->write_in_progress = 0; + buffer->read_in_progress = false; + buffer->write_in_progress = false; buffer->ops = ops; file->private_data = buffer; goto Done; @@ -488,10 +488,10 @@ static int configfs_release_bin_file(struct inode *inode, struct file *filp) ssize_t len = 0; int ret; - buffer->read_in_progress = 0; + buffer->read_in_progress = false; if (buffer->write_in_progress) { - buffer->write_in_progress = 0; + buffer->write_in_progress = false; len = bin_attr->write(item, buffer->bin_buffer, buffer->bin_buffer_size); From 3e651dd60272220ebfab745c929950feb2a90e49 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 30 Aug 2019 11:30:03 -0400 Subject: [PATCH 040/155] configfs: stash the data we need into configfs_buffer at open time commit ff4dd081977da56566a848f071aed8fa92d604a1 upstream. 
simplifies the ->read()/->write()/->release() instances nicely Signed-off-by: Al Viro Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- fs/configfs/file.c | 229 +++++++++++++++++++-------------------------- 1 file changed, 95 insertions(+), 134 deletions(-) diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 62580dba355212..c05ffda74a9137 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -53,24 +53,18 @@ struct configfs_buffer { bool write_in_progress; char *bin_buffer; int bin_buffer_size; + int cb_max_size; + struct config_item *item; + struct module *owner; + union { + struct configfs_attribute *attr; + struct configfs_bin_attribute *bin_attr; + }; }; -/** - * fill_read_buffer - allocate and fill buffer from item. - * @dentry: dentry pointer. - * @buffer: data buffer for file. - * - * Allocate @buffer->page, if it hasn't been already, then call the - * config_item's show() method to fill the buffer with this attribute's - * data. - * This is called only once, on the file's first read. - */ -static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer) +static int fill_read_buffer(struct configfs_buffer * buffer) { - struct configfs_attribute * attr = to_attr(dentry); - struct config_item * item = to_item(dentry->d_parent); - int ret = 0; ssize_t count; if (!buffer->page) @@ -78,15 +72,15 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf if (!buffer->page) return -ENOMEM; - count = attr->show(item, buffer->page); + count = buffer->attr->show(buffer->item, buffer->page); + if (count < 0) + return count; + if (WARN_ON_ONCE(count > (ssize_t)SIMPLE_ATTR_SIZE)) + return -EIO; - BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE); - if (count >= 0) { - buffer->needs_read_fill = 0; - buffer->count = count; - } else - ret = count; - return ret; + buffer->needs_read_fill = 0; + buffer->count = count; + return 0; } /** @@ -111,12 +105,13 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf static ssize_t configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct configfs_buffer * buffer = file->private_data; + struct configfs_buffer *buffer = file->private_data; ssize_t retval = 0; mutex_lock(&buffer->mutex); if (buffer->needs_read_fill) { - if ((retval = fill_read_buffer(file->f_path.dentry,buffer))) + retval = fill_read_buffer(buffer); + if (retval) goto out; } pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", @@ -153,9 +148,6 @@ configfs_read_bin_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct configfs_buffer *buffer = file->private_data; - struct dentry *dentry = file->f_path.dentry; - struct config_item *item = to_item(dentry->d_parent); - struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry); ssize_t retval = 0; ssize_t len = min_t(size_t, count, PAGE_SIZE); @@ -170,14 +162,14 @@ configfs_read_bin_file(struct file *file, char __user *buf, if (buffer->needs_read_fill) { /* perform first read with buf == NULL to get extent */ - len = bin_attr->read(item, NULL, 0); + len = buffer->bin_attr->read(buffer->item, NULL, 0); if (len <= 0) { retval = len; goto out; } /* do not exceed the maximum value */ - if (bin_attr->cb_max_size && len > bin_attr->cb_max_size) { + if (buffer->cb_max_size && len > buffer->cb_max_size) { retval = -EFBIG; goto out; } @@ -190,7 +182,8 @@ configfs_read_bin_file(struct file *file, char __user *buf, buffer->bin_buffer_size = len; /* perform second read to fill buffer */ - len = 
bin_attr->read(item, buffer->bin_buffer, len); + len = buffer->bin_attr->read(buffer->item, + buffer->bin_buffer, len); if (len < 0) { retval = len; vfree(buffer->bin_buffer); @@ -240,25 +233,10 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size return error ? -EFAULT : count; } - -/** - * flush_write_buffer - push buffer to config_item. - * @dentry: dentry to the attribute - * @buffer: data buffer for file. - * @count: number of bytes - * - * Get the correct pointers for the config_item and the attribute we're - * dealing with, then call the store() method for the attribute, - * passing the buffer that we acquired in fill_write_buffer(). - */ - static int -flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count) +flush_write_buffer(struct configfs_buffer *buffer, size_t count) { - struct configfs_attribute * attr = to_attr(dentry); - struct config_item * item = to_item(dentry->d_parent); - - return attr->store(item, buffer->page, count); + return buffer->attr->store(buffer->item, buffer->page, count); } @@ -282,13 +260,13 @@ flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size static ssize_t configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct configfs_buffer * buffer = file->private_data; + struct configfs_buffer *buffer = file->private_data; ssize_t len; mutex_lock(&buffer->mutex); len = fill_write_buffer(buffer, buf, count); if (len > 0) - len = flush_write_buffer(file->f_path.dentry, buffer, len); + len = flush_write_buffer(buffer, len); if (len > 0) *ppos += len; mutex_unlock(&buffer->mutex); @@ -313,8 +291,6 @@ configfs_write_bin_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct configfs_buffer *buffer = file->private_data; - struct dentry *dentry = file->f_path.dentry; - struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry); void *tbuf = NULL; ssize_t len; @@ -330,8 +306,8 @@ configfs_write_bin_file(struct file *file, const char __user *buf, /* buffer grows? 
*/ if (*ppos + count > buffer->bin_buffer_size) { - if (bin_attr->cb_max_size && - *ppos + count > bin_attr->cb_max_size) { + if (buffer->cb_max_size && + *ppos + count > buffer->cb_max_size) { len = -EFBIG; goto out; } @@ -363,31 +339,45 @@ configfs_write_bin_file(struct file *file, const char __user *buf, return len; } -static int check_perm(struct inode * inode, struct file * file, int type) +static int __configfs_open_file(struct inode *inode, struct file *file, int type) { - struct config_item *item = configfs_get_config_item(file->f_path.dentry->d_parent); - struct configfs_attribute * attr = to_attr(file->f_path.dentry); - struct configfs_bin_attribute *bin_attr = NULL; - struct configfs_buffer * buffer; - struct configfs_item_operations * ops = NULL; - int error = 0; + struct dentry *dentry = file->f_path.dentry; + struct configfs_attribute *attr; + struct configfs_buffer *buffer; + int error; - if (!item || !attr) - goto Einval; + error = -ENOMEM; + buffer = kzalloc(sizeof(struct configfs_buffer), GFP_KERNEL); + if (!buffer) + goto out; - if (type & CONFIGFS_ITEM_BIN_ATTR) - bin_attr = to_bin_attr(file->f_path.dentry); + error = -EINVAL; + buffer->item = configfs_get_config_item(dentry->d_parent); + if (!buffer->item) + goto out_free_buffer; + + attr = to_attr(dentry); + if (!attr) + goto out_put_item; + + if (type & CONFIGFS_ITEM_BIN_ATTR) { + buffer->bin_attr = to_bin_attr(dentry); + buffer->cb_max_size = buffer->bin_attr->cb_max_size; + } else { + buffer->attr = attr; + } + buffer->owner = attr->ca_owner; /* Grab the module reference for this attribute if we have one */ - if (!try_module_get(attr->ca_owner)) { - error = -ENODEV; - goto Done; - } + error = -ENODEV; + if (!try_module_get(buffer->owner)) + goto out_put_item; - if (item->ci_type) - ops = item->ci_type->ct_item_ops; - else - goto Eaccess; + error = -EACCES; + if (!buffer->item->ci_type) + goto out_put_module; + + buffer->ops = buffer->item->ci_type->ct_item_ops; /* File needs write support. * The inode's perms must say it's ok, @@ -395,13 +385,11 @@ static int check_perm(struct inode * inode, struct file * file, int type) */ if (file->f_mode & FMODE_WRITE) { if (!(inode->i_mode & S_IWUGO)) - goto Eaccess; - + goto out_put_module; if ((type & CONFIGFS_ITEM_ATTR) && !attr->store) - goto Eaccess; - - if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->write) - goto Eaccess; + goto out_put_module; + if ((type & CONFIGFS_ITEM_BIN_ATTR) && !buffer->bin_attr->write) + goto out_put_module; } /* File needs read support. @@ -410,90 +398,65 @@ static int check_perm(struct inode * inode, struct file * file, int type) */ if (file->f_mode & FMODE_READ) { if (!(inode->i_mode & S_IRUGO)) - goto Eaccess; - + goto out_put_module; if ((type & CONFIGFS_ITEM_ATTR) && !attr->show) - goto Eaccess; - - if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->read) - goto Eaccess; + goto out_put_module; + if ((type & CONFIGFS_ITEM_BIN_ATTR) && !buffer->bin_attr->read) + goto out_put_module; } - /* No error? Great, allocate a buffer for the file, and store it - * it in file->private_data for easy access. 
- */ - buffer = kzalloc(sizeof(struct configfs_buffer),GFP_KERNEL); - if (!buffer) { - error = -ENOMEM; - goto Enomem; - } mutex_init(&buffer->mutex); buffer->needs_read_fill = 1; buffer->read_in_progress = false; buffer->write_in_progress = false; - buffer->ops = ops; file->private_data = buffer; - goto Done; + return 0; - Einval: - error = -EINVAL; - goto Done; - Eaccess: - error = -EACCES; - Enomem: - module_put(attr->ca_owner); - Done: - if (error && item) - config_item_put(item); +out_put_module: + module_put(buffer->owner); +out_put_item: + config_item_put(buffer->item); +out_free_buffer: + kfree(buffer); +out: return error; } static int configfs_release(struct inode *inode, struct file *filp) { - struct config_item * item = to_item(filp->f_path.dentry->d_parent); - struct configfs_attribute * attr = to_attr(filp->f_path.dentry); - struct module * owner = attr->ca_owner; - struct configfs_buffer * buffer = filp->private_data; - - if (item) - config_item_put(item); - /* After this point, attr should not be accessed. */ - module_put(owner); - - if (buffer) { - if (buffer->page) - free_page((unsigned long)buffer->page); - mutex_destroy(&buffer->mutex); - kfree(buffer); - } + struct configfs_buffer *buffer = filp->private_data; + + if (buffer->item) + config_item_put(buffer->item); + module_put(buffer->owner); + if (buffer->page) + free_page((unsigned long)buffer->page); + mutex_destroy(&buffer->mutex); + kfree(buffer); return 0; } static int configfs_open_file(struct inode *inode, struct file *filp) { - return check_perm(inode, filp, CONFIGFS_ITEM_ATTR); + return __configfs_open_file(inode, filp, CONFIGFS_ITEM_ATTR); } static int configfs_open_bin_file(struct inode *inode, struct file *filp) { - return check_perm(inode, filp, CONFIGFS_ITEM_BIN_ATTR); + return __configfs_open_file(inode, filp, CONFIGFS_ITEM_BIN_ATTR); } -static int configfs_release_bin_file(struct inode *inode, struct file *filp) +static int configfs_release_bin_file(struct inode *inode, struct file *file) { - struct configfs_buffer *buffer = filp->private_data; - struct dentry *dentry = filp->f_path.dentry; - struct config_item *item = to_item(dentry->d_parent); - struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry); - ssize_t len = 0; - int ret; + struct configfs_buffer *buffer = file->private_data; buffer->read_in_progress = false; if (buffer->write_in_progress) { buffer->write_in_progress = false; - len = bin_attr->write(item, buffer->bin_buffer, + /* result of ->release() is ignored */ + buffer->bin_attr->write(buffer->item, buffer->bin_buffer, buffer->bin_buffer_size); /* vfree on NULL is safe */ @@ -503,10 +466,8 @@ static int configfs_release_bin_file(struct inode *inode, struct file *filp) buffer->needs_read_fill = 1; } - ret = configfs_release(inode, filp); - if (len < 0) - return len; - return ret; + configfs_release(inode, file); + return 0; } From 5f7f9c7cde01202de82814e77b68c819f61a46c6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 29 Aug 2019 23:13:30 -0400 Subject: [PATCH 041/155] configfs_register_group() shouldn't be (and isn't) called in rmdirable parts commit f19e4ed1e1edbfa3c9ccb9fed17759b7d6db24c6 upstream. revert cc57c07343bd "configfs: fix registered group removal" It was an attempt to handle something that fundamentally doesn't work - configfs_register_group() should never be done in a part of tree that can be rmdir'ed. 
And in mainline it never had been, so let's not borrow trouble; the fix was racy anyway; it would take a lot more to make that work, and the desired semantics are not clear. Signed-off-by: Al Viro Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- fs/configfs/dir.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index a1985a9ad2d640..64fdb12e6ad61c 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1782,16 +1782,6 @@ void configfs_unregister_group(struct config_group *group) struct dentry *dentry = group->cg_item.ci_dentry; struct dentry *parent = group->cg_item.ci_parent->ci_dentry; - mutex_lock(&subsys->su_mutex); - if (!group->cg_item.ci_parent->ci_group) { - /* - * The parent has already been unlinked and detached - * due to a rmdir. - */ - goto unlink_group; - } - mutex_unlock(&subsys->su_mutex); - inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); spin_lock(&configfs_dirent_lock); configfs_detach_prep(dentry, NULL); @@ -1806,7 +1796,6 @@ void configfs_unregister_group(struct config_group *group) dput(dentry); mutex_lock(&subsys->su_mutex); -unlink_group: unlink_group(group); mutex_unlock(&subsys->su_mutex); } From 2f41c26ed4f617f768e6d9b82306dbe5bb667d95 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 25 Aug 2019 19:56:13 -0400 Subject: [PATCH 042/155] configfs: new object representing tree fragments commit 47320fbe11a6059ae502c9c16b668022fdb4cf76 upstream. Refcounted, hangs off configfs_dirent, created by operations that add fragments to configfs tree (mkdir and configfs_register_{subsystem,group}). Will be used in the next commit to provide exclusion between fragment removal and ->show/->store calls. Signed-off-by: Al Viro Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- fs/configfs/configfs_internal.h | 15 ++++- fs/configfs/dir.c | 105 +++++++++++++++++++++++++------- fs/configfs/file.c | 4 +- 3 files changed, 97 insertions(+), 27 deletions(-) diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index ccc31fa6f1a7e1..16eb59adf5aab9 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -34,6 +34,15 @@ #include #include +struct configfs_fragment { + atomic_t frag_count; + struct rw_semaphore frag_sem; + bool frag_dead; +}; + +void put_fragment(struct configfs_fragment *); +struct configfs_fragment *get_fragment(struct configfs_fragment *); + struct configfs_dirent { atomic_t s_count; int s_dependent_count; @@ -48,6 +57,7 @@ struct configfs_dirent { #ifdef CONFIG_LOCKDEP int s_depth; #endif + struct configfs_fragment *s_frag; }; #define CONFIGFS_ROOT 0x0001 @@ -75,8 +85,8 @@ extern int configfs_create(struct dentry *, umode_t mode, void (*init)(struct in extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); extern int configfs_create_bin_file(struct config_item *, const struct configfs_bin_attribute *); -extern int configfs_make_dirent(struct configfs_dirent *, - struct dentry *, void *, umode_t, int); +extern int configfs_make_dirent(struct configfs_dirent *, struct dentry *, + void *, umode_t, int, struct configfs_fragment *); extern int configfs_dirent_is_ready(struct configfs_dirent *); extern void configfs_hash_and_remove(struct dentry * dir, const char * name); @@ -151,6 +161,7 @@ static inline void release_configfs_dirent(struct configfs_dirent * sd) { if (!(sd->s_type & CONFIGFS_ROOT)) { kfree(sd->s_iattr); + put_fragment(sd->s_frag); kmem_cache_free(configfs_dir_cachep, sd); } } diff
--git a/fs/configfs/dir.c b/fs/configfs/dir.c index 64fdb12e6ad61c..1362208a8618f4 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -164,11 +164,38 @@ configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd) #endif /* CONFIG_LOCKDEP */ +static struct configfs_fragment *new_fragment(void) +{ + struct configfs_fragment *p; + + p = kmalloc(sizeof(struct configfs_fragment), GFP_KERNEL); + if (p) { + atomic_set(&p->frag_count, 1); + init_rwsem(&p->frag_sem); + p->frag_dead = false; + } + return p; +} + +void put_fragment(struct configfs_fragment *frag) +{ + if (frag && atomic_dec_and_test(&frag->frag_count)) + kfree(frag); +} + +struct configfs_fragment *get_fragment(struct configfs_fragment *frag) +{ + if (likely(frag)) + atomic_inc(&frag->frag_count); + return frag; +} + /* * Allocates a new configfs_dirent and links it to the parent configfs_dirent */ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *parent_sd, - void *element, int type) + void *element, int type, + struct configfs_fragment *frag) { struct configfs_dirent * sd; @@ -188,6 +215,7 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *paren kmem_cache_free(configfs_dir_cachep, sd); return ERR_PTR(-ENOENT); } + sd->s_frag = get_fragment(frag); list_add(&sd->s_sibling, &parent_sd->s_children); spin_unlock(&configfs_dirent_lock); @@ -222,11 +250,11 @@ static int configfs_dirent_exists(struct configfs_dirent *parent_sd, int configfs_make_dirent(struct configfs_dirent * parent_sd, struct dentry * dentry, void * element, - umode_t mode, int type) + umode_t mode, int type, struct configfs_fragment *frag) { struct configfs_dirent * sd; - sd = configfs_new_dirent(parent_sd, element, type); + sd = configfs_new_dirent(parent_sd, element, type, frag); if (IS_ERR(sd)) return PTR_ERR(sd); @@ -273,7 +301,8 @@ static void init_symlink(struct inode * inode) * until it is validated by configfs_dir_set_ready() */ -static int configfs_create_dir(struct config_item *item, struct dentry *dentry) +static int configfs_create_dir(struct config_item *item, struct dentry *dentry, + struct configfs_fragment *frag) { int error; umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; @@ -286,7 +315,8 @@ static int configfs_create_dir(struct config_item *item, struct dentry *dentry) return error; error = configfs_make_dirent(p->d_fsdata, dentry, item, mode, - CONFIGFS_DIR | CONFIGFS_USET_CREATING); + CONFIGFS_DIR | CONFIGFS_USET_CREATING, + frag); if (unlikely(error)) return error; @@ -351,9 +381,10 @@ int configfs_create_link(struct configfs_symlink *sl, { int err = 0; umode_t mode = S_IFLNK | S_IRWXUGO; + struct configfs_dirent *p = parent->d_fsdata; - err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode, - CONFIGFS_ITEM_LINK); + err = configfs_make_dirent(p, dentry, sl, mode, + CONFIGFS_ITEM_LINK, p->s_frag); if (!err) { err = configfs_create(dentry, mode, init_symlink); if (err) { @@ -612,7 +643,8 @@ static int populate_attrs(struct config_item *item) static int configfs_attach_group(struct config_item *parent_item, struct config_item *item, - struct dentry *dentry); + struct dentry *dentry, + struct configfs_fragment *frag); static void configfs_detach_group(struct config_item *item); static void detach_groups(struct config_group *group) @@ -660,7 +692,8 @@ static void detach_groups(struct config_group *group) * try using vfs_mkdir. Just a thought. 
*/ static int create_default_group(struct config_group *parent_group, - struct config_group *group) + struct config_group *group, + struct configfs_fragment *frag) { int ret; struct configfs_dirent *sd; @@ -676,7 +709,7 @@ static int create_default_group(struct config_group *parent_group, d_add(child, NULL); ret = configfs_attach_group(&parent_group->cg_item, - &group->cg_item, child); + &group->cg_item, child, frag); if (!ret) { sd = child->d_fsdata; sd->s_type |= CONFIGFS_USET_DEFAULT; @@ -690,13 +723,14 @@ static int create_default_group(struct config_group *parent_group, return ret; } -static int populate_groups(struct config_group *group) +static int populate_groups(struct config_group *group, + struct configfs_fragment *frag) { struct config_group *new_group; int ret = 0; list_for_each_entry(new_group, &group->default_groups, group_entry) { - ret = create_default_group(group, new_group); + ret = create_default_group(group, new_group, frag); if (ret) { detach_groups(group); break; @@ -810,11 +844,12 @@ static void link_group(struct config_group *parent_group, struct config_group *g */ static int configfs_attach_item(struct config_item *parent_item, struct config_item *item, - struct dentry *dentry) + struct dentry *dentry, + struct configfs_fragment *frag) { int ret; - ret = configfs_create_dir(item, dentry); + ret = configfs_create_dir(item, dentry, frag); if (!ret) { ret = populate_attrs(item); if (ret) { @@ -844,12 +879,13 @@ static void configfs_detach_item(struct config_item *item) static int configfs_attach_group(struct config_item *parent_item, struct config_item *item, - struct dentry *dentry) + struct dentry *dentry, + struct configfs_fragment *frag) { int ret; struct configfs_dirent *sd; - ret = configfs_attach_item(parent_item, item, dentry); + ret = configfs_attach_item(parent_item, item, dentry, frag); if (!ret) { sd = dentry->d_fsdata; sd->s_type |= CONFIGFS_USET_DIR; @@ -865,7 +901,7 @@ static int configfs_attach_group(struct config_item *parent_item, */ inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); configfs_adjust_dir_dirent_depth_before_populate(sd); - ret = populate_groups(to_config_group(item)); + ret = populate_groups(to_config_group(item), frag); if (ret) { configfs_detach_item(item); d_inode(dentry)->i_flags |= S_DEAD; @@ -1260,6 +1296,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode struct configfs_dirent *sd; struct config_item_type *type; struct module *subsys_owner = NULL, *new_item_owner = NULL; + struct configfs_fragment *frag; char *name; sd = dentry->d_parent->d_fsdata; @@ -1278,6 +1315,12 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode goto out; } + frag = new_fragment(); + if (!frag) { + ret = -ENOMEM; + goto out; + } + /* Get a working ref for the duration of this function */ parent_item = configfs_get_config_item(dentry->d_parent); type = parent_item->ci_type; @@ -1380,9 +1423,9 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode spin_unlock(&configfs_dirent_lock); if (group) - ret = configfs_attach_group(parent_item, item, dentry); + ret = configfs_attach_group(parent_item, item, dentry, frag); else - ret = configfs_attach_item(parent_item, item, dentry); + ret = configfs_attach_item(parent_item, item, dentry, frag); spin_lock(&configfs_dirent_lock); sd->s_type &= ~CONFIGFS_USET_IN_MKDIR; @@ -1419,6 +1462,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode * reference. 
*/ config_item_put(parent_item); + put_fragment(frag); out: return ret; @@ -1587,7 +1631,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) */ err = -ENOENT; if (configfs_dirent_is_ready(parent_sd)) { - file->private_data = configfs_new_dirent(parent_sd, NULL, 0); + file->private_data = configfs_new_dirent(parent_sd, NULL, 0, NULL); if (IS_ERR(file->private_data)) err = PTR_ERR(file->private_data); else @@ -1743,8 +1787,13 @@ int configfs_register_group(struct config_group *parent_group, { struct configfs_subsystem *subsys = parent_group->cg_subsys; struct dentry *parent; + struct configfs_fragment *frag; int ret; + frag = new_fragment(); + if (!frag) + return -ENOMEM; + mutex_lock(&subsys->su_mutex); link_group(parent_group, group); mutex_unlock(&subsys->su_mutex); @@ -1752,7 +1801,7 @@ int configfs_register_group(struct config_group *parent_group, parent = parent_group->cg_item.ci_dentry; inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); - ret = create_default_group(parent_group, group); + ret = create_default_group(parent_group, group, frag); if (ret) goto err_out; @@ -1760,12 +1809,14 @@ int configfs_register_group(struct config_group *parent_group, configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata); spin_unlock(&configfs_dirent_lock); inode_unlock(d_inode(parent)); + put_fragment(frag); return 0; err_out: inode_unlock(d_inode(parent)); mutex_lock(&subsys->su_mutex); unlink_group(group); mutex_unlock(&subsys->su_mutex); + put_fragment(frag); return ret; } EXPORT_SYMBOL(configfs_register_group); @@ -1852,10 +1903,17 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) struct dentry *dentry; struct dentry *root; struct configfs_dirent *sd; + struct configfs_fragment *frag; + + frag = new_fragment(); + if (!frag) + return -ENOMEM; root = configfs_pin_fs(); - if (IS_ERR(root)) + if (IS_ERR(root)) { + put_fragment(frag); return PTR_ERR(root); + } if (!group->cg_item.ci_name) group->cg_item.ci_name = group->cg_item.ci_namebuf; @@ -1871,7 +1929,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) d_add(dentry, NULL); err = configfs_attach_group(sd->s_element, &group->cg_item, - dentry); + dentry, frag); if (err) { BUG_ON(d_inode(dentry)); d_drop(dentry); @@ -1889,6 +1947,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) unlink_group(group); configfs_release_fs(); } + put_fragment(frag); return err; } diff --git a/fs/configfs/file.c b/fs/configfs/file.c index c05ffda74a9137..a2b7944db12e62 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -502,7 +502,7 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib inode_lock_nested(d_inode(dir), I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, - CONFIGFS_ITEM_ATTR); + CONFIGFS_ITEM_ATTR, parent_sd->s_frag); inode_unlock(d_inode(dir)); return error; @@ -524,7 +524,7 @@ int configfs_create_bin_file(struct config_item *item, inode_lock_nested(dir->d_inode, I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) bin_attr, mode, - CONFIGFS_ITEM_BIN_ATTR); + CONFIGFS_ITEM_BIN_ATTR, parent_sd->s_frag); inode_unlock(dir->d_inode); return error; From 09e21253d17f53bdb5aac0e0dbd057a29fcbe8d1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 31 Aug 2019 09:43:43 +0200 Subject: [PATCH 043/155] configfs: provide exclusion between IO and removals commit b0841eefd9693827afb9888235e26ddd098f9cef upstream. 
Make sure that attribute methods are not called after the item has been removed from the tree. To do so, we * at the point of no return in removals, grab ->frag_sem exclusive and mark the fragment dead. * call the methods of attributes with ->frag_sem taken shared and only after having verified that the fragment is still alive. The main benefit is for method instances - they are guaranteed that the objects they are accessing *and* all ancestors are still there. Another win is that we don't need to bother with extra refcount on config_item when opening a file - the item will be alive for as long as it stays in the tree, and we won't touch it/attributes/any associated data after it's been removed from the tree. Signed-off-by: Al Viro Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- fs/configfs/dir.c | 23 ++++++++++++++ fs/configfs/file.c | 75 +++++++++++++++++++++++++++++++++++----------- 2 files changed, 80 insertions(+), 18 deletions(-) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 1362208a8618f4..c2ef617d2f97db 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1474,6 +1474,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) struct config_item *item; struct configfs_subsystem *subsys; struct configfs_dirent *sd; + struct configfs_fragment *frag; struct module *subsys_owner = NULL, *dead_item_owner = NULL; int ret; @@ -1531,6 +1532,16 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) } } while (ret == -EAGAIN); + frag = sd->s_frag; + if (down_write_killable(&frag->frag_sem)) { + spin_lock(&configfs_dirent_lock); + configfs_detach_rollback(dentry); + spin_unlock(&configfs_dirent_lock); + return -EINTR; + } + frag->frag_dead = true; + up_write(&frag->frag_sem); + /* Get a working ref for the duration of this function */ item = configfs_get_config_item(dentry); @@ -1832,6 +1843,12 @@ void configfs_unregister_group(struct config_group *group) struct configfs_subsystem *subsys = group->cg_subsys; struct dentry *dentry = group->cg_item.ci_dentry; struct dentry *parent = group->cg_item.ci_parent->ci_dentry; + struct configfs_dirent *sd = dentry->d_fsdata; + struct configfs_fragment *frag = sd->s_frag; + + down_write(&frag->frag_sem); + frag->frag_dead = true; + up_write(&frag->frag_sem); inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); spin_lock(&configfs_dirent_lock); @@ -1957,12 +1974,18 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) struct config_group *group = &subsys->su_group; struct dentry *dentry = group->cg_item.ci_dentry; struct dentry *root = dentry->d_sb->s_root; + struct configfs_dirent *sd = dentry->d_fsdata; + struct configfs_fragment *frag = sd->s_frag; if (dentry->d_parent != root) { pr_err("Tried to unregister non-subsystem!\n"); return; } + down_write(&frag->frag_sem); + frag->frag_dead = true; + up_write(&frag->frag_sem); + inode_lock_nested(d_inode(root), I_MUTEX_PARENT); inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); diff --git a/fs/configfs/file.c b/fs/configfs/file.c index a2b7944db12e62..bb0a427517e92c 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -62,22 +62,32 @@ struct configfs_buffer { }; }; +static inline struct configfs_fragment *to_frag(struct file *file) +{ + struct configfs_dirent *sd = file->f_path.dentry->d_fsdata; + + return sd->s_frag; +} -static int fill_read_buffer(struct configfs_buffer * buffer) +static int fill_read_buffer(struct file *file, struct configfs_buffer *buffer) { - ssize_t count; + struct configfs_fragment *frag = 
to_frag(file); + ssize_t count = -ENOENT; if (!buffer->page) buffer->page = (char *) get_zeroed_page(GFP_KERNEL); if (!buffer->page) return -ENOMEM; - count = buffer->attr->show(buffer->item, buffer->page); + down_read(&frag->frag_sem); + if (!frag->frag_dead) + count = buffer->attr->show(buffer->item, buffer->page); + up_read(&frag->frag_sem); + if (count < 0) return count; if (WARN_ON_ONCE(count > (ssize_t)SIMPLE_ATTR_SIZE)) return -EIO; - buffer->needs_read_fill = 0; buffer->count = count; return 0; @@ -110,7 +120,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp mutex_lock(&buffer->mutex); if (buffer->needs_read_fill) { - retval = fill_read_buffer(buffer); + retval = fill_read_buffer(file, buffer); if (retval) goto out; } @@ -147,6 +157,7 @@ static ssize_t configfs_read_bin_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { + struct configfs_fragment *frag = to_frag(file); struct configfs_buffer *buffer = file->private_data; ssize_t retval = 0; ssize_t len = min_t(size_t, count, PAGE_SIZE); @@ -162,7 +173,12 @@ configfs_read_bin_file(struct file *file, char __user *buf, if (buffer->needs_read_fill) { /* perform first read with buf == NULL to get extent */ - len = buffer->bin_attr->read(buffer->item, NULL, 0); + down_read(&frag->frag_sem); + if (!frag->frag_dead) + len = buffer->bin_attr->read(buffer->item, NULL, 0); + else + len = -ENOENT; + up_read(&frag->frag_sem); if (len <= 0) { retval = len; goto out; @@ -182,8 +198,13 @@ configfs_read_bin_file(struct file *file, char __user *buf, buffer->bin_buffer_size = len; /* perform second read to fill buffer */ - len = buffer->bin_attr->read(buffer->item, - buffer->bin_buffer, len); + down_read(&frag->frag_sem); + if (!frag->frag_dead) + len = buffer->bin_attr->read(buffer->item, + buffer->bin_buffer, len); + else + len = -ENOENT; + up_read(&frag->frag_sem); if (len < 0) { retval = len; vfree(buffer->bin_buffer); @@ -234,9 +255,16 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size } static int -flush_write_buffer(struct configfs_buffer *buffer, size_t count) +flush_write_buffer(struct file *file, struct configfs_buffer *buffer, size_t count) { - return buffer->attr->store(buffer->item, buffer->page, count); + struct configfs_fragment *frag = to_frag(file); + int res = -ENOENT; + + down_read(&frag->frag_sem); + if (!frag->frag_dead) + res = buffer->attr->store(buffer->item, buffer->page, count); + up_read(&frag->frag_sem); + return res; } @@ -266,7 +294,7 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof mutex_lock(&buffer->mutex); len = fill_write_buffer(buffer, buf, count); if (len > 0) - len = flush_write_buffer(buffer, len); + len = flush_write_buffer(file, buffer, len); if (len > 0) *ppos += len; mutex_unlock(&buffer->mutex); @@ -342,6 +370,7 @@ configfs_write_bin_file(struct file *file, const char __user *buf, static int __configfs_open_file(struct inode *inode, struct file *file, int type) { struct dentry *dentry = file->f_path.dentry; + struct configfs_fragment *frag = to_frag(file); struct configfs_attribute *attr; struct configfs_buffer *buffer; int error; @@ -351,8 +380,13 @@ static int __configfs_open_file(struct inode *inode, struct file *file, int type if (!buffer) goto out; + error = -ENOENT; + down_read(&frag->frag_sem); + if (unlikely(frag->frag_dead)) + goto out_free_buffer; + error = -EINVAL; - buffer->item = configfs_get_config_item(dentry->d_parent); + buffer->item = to_item(dentry->d_parent); if 
(!buffer->item) goto out_free_buffer; @@ -410,6 +444,7 @@ static int __configfs_open_file(struct inode *inode, struct file *file, int type buffer->read_in_progress = false; buffer->write_in_progress = false; file->private_data = buffer; + up_read(&frag->frag_sem); return 0; out_put_module: @@ -417,6 +452,7 @@ static int __configfs_open_file(struct inode *inode, struct file *file, int type out_put_item: config_item_put(buffer->item); out_free_buffer: + up_read(&frag->frag_sem); kfree(buffer); out: return error; @@ -426,8 +462,6 @@ static int configfs_release(struct inode *inode, struct file *filp) { struct configfs_buffer *buffer = filp->private_data; - if (buffer->item) - config_item_put(buffer->item); module_put(buffer->owner); if (buffer->page) free_page((unsigned long)buffer->page); @@ -453,12 +487,17 @@ static int configfs_release_bin_file(struct inode *inode, struct file *file) buffer->read_in_progress = false; if (buffer->write_in_progress) { + struct configfs_fragment *frag = to_frag(file); buffer->write_in_progress = false; - /* result of ->release() is ignored */ - buffer->bin_attr->write(buffer->item, buffer->bin_buffer, - buffer->bin_buffer_size); - + down_read(&frag->frag_sem); + if (!frag->frag_dead) { + /* result of ->release() is ignored */ + buffer->bin_attr->write(buffer->item, + buffer->bin_buffer, + buffer->bin_buffer_size); + } + up_read(&frag->frag_sem); /* vfree on NULL is safe */ vfree(buffer->bin_buffer); buffer->bin_buffer = NULL; From e49b85dd3c7f2bb13f5bd8dd4869e30a82acbba7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 3 Aug 2019 11:51:18 -0400 Subject: [PATCH 044/155] configfs: fix a deadlock in configfs_symlink() commit 351e5d869e5ac10cb40c78b5f2d7dfc816ad4587 upstream. Configfs abuses symlink(2). Unlike the normal filesystems, it wants the target resolved at symlink(2) time, like link(2) would've done. The problem is that ->symlink() is called with the parent directory locked exclusive, so resolving the target inside the ->symlink() is easily deadlocked. Short of really ugly games in sys_symlink() itself, all we can do is to unlock the parent before resolving the target and relock it after. However, that invalidates the checks done by the caller of ->symlink(), so we have to * check that dentry is still where it used to be (it couldn't have been moved, but it could've been unhashed) * recheck that it's still negative (somebody else might've successfully created a symlink with the same name while we were looking the target up) * recheck the permissions on the parent directory. Cc: stable@vger.kernel.org Signed-off-by: Al Viro Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- fs/configfs/symlink.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 9993cdb81e7d70..147a6b779ab995 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -157,11 +157,42 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna !type->ct_item_ops->allow_link) goto out_put; + /* + * This is really sick. What they wanted was a hybrid of + * link(2) and symlink(2) - they wanted the target resolved + * at syscall time (as link(2) would've done), be a directory + * (which link(2) would've refused to do) *AND* be a deep + * fucking magic, making the target busy from rmdir POV. + * symlink(2) is nothing of that sort, and the locking it + * gets matches the normal symlink(2) semantics. 
Without + * attempts to resolve the target (which might very well + * not even exist yet) done prior to locking the parent + * directory. This perversion, OTOH, needs to resolve + * the target, which would lead to obvious deadlocks if + * attempted with any directories locked. + * + * Unfortunately, that garbage is userland ABI and we should've + * said "no" back in 2005. Too late now, so we get to + * play very ugly games with locking. + * + * Try *ANYTHING* of that sort in new code, and you will + * really regret it. Just ask yourself - what could a BOFH + * do to me and do I want to find it out first-hand? + * + * AV, a thoroughly annoyed bastard. + */ + inode_unlock(dir); ret = get_target(symname, &path, &target_item, dentry->d_sb); + inode_lock(dir); if (ret) goto out_put; - ret = type->ct_item_ops->allow_link(parent_item, target_item); + if (dentry->d_inode || d_unhashed(dentry)) + ret = -EEXIST; + else + ret = inode_permission(dir, MAY_WRITE | MAY_EXEC); + if (!ret) + ret = type->ct_item_ops->allow_link(parent_item, target_item); if (!ret) { mutex_lock(&configfs_symlink_mutex); ret = create_link(parent_item, target_item, dentry); From c813b4467a2291d14dd9c90f9542ad55eb1ec315 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Thu, 5 Sep 2019 10:17:45 -0600 Subject: [PATCH 045/155] usb: dwc3: Allow disabling of metastability workaround commit 42bf02ec6e420e541af9a47437d0bdf961ca2972 upstream Some platforms (e.g. TI's DRA7 USB2 instance) have more trouble with the metastability workaround as it supports only a High-Speed PHY and the PHY can enter into an Erratic state [1] when the controller is set in SuperSpeed mode as part of the metastability workaround. This causes upto 2 seconds delay in enumeration on DRA7's USB2 instance in gadget mode. If these platforms can be better off without the workaround, provide a device tree property to suggest that so the workaround is avoided. [1] Device mode enumeration trace showing PHY Erratic Error. irq/90-dwc3-969 [000] d... 52.323145: dwc3_event: event (00000901): Erratic Error [U0] irq/90-dwc3-969 [000] d... 52.560646: dwc3_event: event (00000901): Erratic Error [U0] irq/90-dwc3-969 [000] d... 52.798144: dwc3_event: event (00000901): Erratic Error [U0] Signed-off-by: Roger Quadros Signed-off-by: Felipe Balbi Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/usb/dwc3.txt | 2 ++ drivers/usb/dwc3/core.c | 3 +++ drivers/usb/dwc3/core.h | 3 +++ drivers/usb/dwc3/gadget.c | 6 ++++-- 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/usb/dwc3.txt b/Documentation/devicetree/bindings/usb/dwc3.txt index 52fb41046b342b..44e8bab159adce 100644 --- a/Documentation/devicetree/bindings/usb/dwc3.txt +++ b/Documentation/devicetree/bindings/usb/dwc3.txt @@ -47,6 +47,8 @@ Optional properties: from P0 to P1/P2/P3 without delay. - snps,dis-tx-ipgap-linecheck-quirk: when set, disable u2mac linestate check during HS transmit. + - snps,dis_metastability_quirk: when set, disable metastability workaround. + CAUTION: use only if you are absolutely sure of it. 
- snps,is-utmi-l1-suspend: true when DWC3 asserts output signal utmi_l1_suspend_n, false when asserts utmi_sleep_n - snps,hird-threshold: HIRD threshold diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index 945330ea8d5c53..9b093978bd24d0 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -1115,6 +1115,9 @@ static void dwc3_get_properties(struct dwc3 *dwc) device_property_read_u32(dev, "snps,quirk-frame-length-adjustment", &dwc->fladj); + dwc->dis_metastability_quirk = device_property_read_bool(dev, + "snps,dis_metastability_quirk"); + dwc->lpm_nyet_threshold = lpm_nyet_threshold; dwc->tx_de_emphasis = tx_de_emphasis; diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h index abd1142c9e4d95..40bf0e0768d968 100644 --- a/drivers/usb/dwc3/core.h +++ b/drivers/usb/dwc3/core.h @@ -869,6 +869,7 @@ struct dwc3_scratchpad_array { * 1 - -3.5dB de-emphasis * 2 - No de-emphasis * 3 - Reserved + * @dis_metastability_quirk: set to disable metastability quirk. * @imod_interval: set the interrupt moderation interval in 250ns * increments or 0 to disable. */ @@ -1025,6 +1026,8 @@ struct dwc3 { unsigned tx_de_emphasis_quirk:1; unsigned tx_de_emphasis:2; + unsigned dis_metastability_quirk:1; + u16 imod_interval; }; diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 1b99d44e52b9ac..5916340c41621c 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -2034,7 +2034,8 @@ static void dwc3_gadget_set_speed(struct usb_gadget *g, * STAR#9000525659: Clock Domain Crossing on DCTL in * USB 2.0 Mode */ - if (dwc->revision < DWC3_REVISION_220A) { + if (dwc->revision < DWC3_REVISION_220A && + !dwc->dis_metastability_quirk) { reg |= DWC3_DCFG_SUPERSPEED; } else { switch (speed) { @@ -3265,7 +3266,8 @@ int dwc3_gadget_init(struct dwc3 *dwc) * is less than super speed because we don't have means, yet, to tell * composite.c that we are USB 2.0 + LPM ECN. */ - if (dwc->revision < DWC3_REVISION_220A) + if (dwc->revision < DWC3_REVISION_220A && + !dwc->dis_metastability_quirk) dev_info(dwc->dev, "changing max_speed on rev %08x\n", dwc->revision); From 1863e7fef65ca61bf10f8fccebc2f123bd5f10f9 Mon Sep 17 00:00:00 2001 From: Keerthy Date: Thu, 5 Sep 2019 10:17:46 -0600 Subject: [PATCH 046/155] mfd: palmas: Assign the right powerhold mask for tps65917 commit 572ff4d560be3784205b224cd67d6715620092d7 upstream The powerhold mask for TPS65917 is different when compared to the other palmas versions. Hence assign the right mask so that the TPS65917 PMIC can be powered off correctly.
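For reference, the POWERHOLD function sits on GPIO_5 of the PRIMARY_SECONDARY_PAD2 register on TPS65917 (mask 0xC) rather than on GPIO_7 as on the other Palmas variants, so a different bit field has to be cleared to release POWERHOLD. A minimal sketch of the selection, condensed from the diff below:

	u8 powerhold_mask;

	if (of_device_is_compatible(np, "ti,tps65917"))
		powerhold_mask = TPS65917_PRIMARY_SECONDARY_PAD2_GPIO_5_MASK; /* 0xC */
	else
		powerhold_mask = PALMAS_PRIMARY_SECONDARY_PAD2_GPIO_7_MASK;

	/* Clear the POWERHOLD field so the power-off request can take effect */
	ret = regmap_update_bits(palmas_dev->regmap[slave], addr, powerhold_mask, 0);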
Signed-off-by: Keerthy Signed-off-by: Lee Jones Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman --- drivers/mfd/palmas.c | 10 +++++++++- include/linux/mfd/palmas.h | 3 +++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/mfd/palmas.c b/drivers/mfd/palmas.c index 3922a93f9f9242..663a2398b6b1c8 100644 --- a/drivers/mfd/palmas.c +++ b/drivers/mfd/palmas.c @@ -430,6 +430,7 @@ static void palmas_power_off(void) { unsigned int addr; int ret, slave; + u8 powerhold_mask; struct device_node *np = palmas_dev->dev->of_node; if (of_property_read_bool(np, "ti,palmas-override-powerhold")) { @@ -437,8 +438,15 @@ static void palmas_power_off(void) PALMAS_PRIMARY_SECONDARY_PAD2); slave = PALMAS_BASE_TO_SLAVE(PALMAS_PU_PD_OD_BASE); + if (of_device_is_compatible(np, "ti,tps65917")) + powerhold_mask = + TPS65917_PRIMARY_SECONDARY_PAD2_GPIO_5_MASK; + else + powerhold_mask = + PALMAS_PRIMARY_SECONDARY_PAD2_GPIO_7_MASK; + ret = regmap_update_bits(palmas_dev->regmap[slave], addr, - PALMAS_PRIMARY_SECONDARY_PAD2_GPIO_7_MASK, 0); + powerhold_mask, 0); if (ret) dev_err(palmas_dev->dev, "Unable to write PRIMARY_SECONDARY_PAD2 %d\n", diff --git a/include/linux/mfd/palmas.h b/include/linux/mfd/palmas.h index 6dec4382630311..cffb23b8bd70c3 100644 --- a/include/linux/mfd/palmas.h +++ b/include/linux/mfd/palmas.h @@ -3733,6 +3733,9 @@ enum usb_irq_events { #define TPS65917_REGEN3_CTRL_MODE_ACTIVE 0x01 #define TPS65917_REGEN3_CTRL_MODE_ACTIVE_SHIFT 0x00 +/* POWERHOLD Mask field for PRIMARY_SECONDARY_PAD2 register */ +#define TPS65917_PRIMARY_SECONDARY_PAD2_GPIO_5_MASK 0xC + /* Registers for function RESOURCE */ #define TPS65917_REGEN1_CTRL 0x2 #define TPS65917_PLLEN_CTRL 0x3 From c3a387268218a4c5b4d6c7a5d49bdcdc19ade57e Mon Sep 17 00:00:00 2001 From: "Andrew F. Davis" Date: Thu, 5 Sep 2019 10:17:47 -0600 Subject: [PATCH 047/155] ASoC: tlv320aic31xx: Handle inverted BCLK in non-DSP modes commit dcb407b257af06fa58b0544ec01ec9e0d3927e02 upstream Currently BCLK inverting is only handled when the DAI format is DSP, but the BCLK may be inverted in any supported mode. Without this using this CODEC in any other mode than DSP with the BCLK inverted leads to bad sampling timing and very poor audio quality. Signed-off-by: Andrew F. 
Davis Signed-off-by: Mark Brown Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/tlv320aic31xx.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/sound/soc/codecs/tlv320aic31xx.c b/sound/soc/codecs/tlv320aic31xx.c index 54a87a905eb63a..d3bd0bf15ddb19 100644 --- a/sound/soc/codecs/tlv320aic31xx.c +++ b/sound/soc/codecs/tlv320aic31xx.c @@ -924,6 +924,18 @@ static int aic31xx_set_dai_fmt(struct snd_soc_dai *codec_dai, return -EINVAL; } + /* signal polarity */ + switch (fmt & SND_SOC_DAIFMT_INV_MASK) { + case SND_SOC_DAIFMT_NB_NF: + break; + case SND_SOC_DAIFMT_IB_NF: + iface_reg2 |= AIC31XX_BCLKINV_MASK; + break; + default: + dev_err(codec->dev, "Invalid DAI clock signal polarity\n"); + return -EINVAL; + } + /* interface format */ switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) { case SND_SOC_DAIFMT_I2S: @@ -931,16 +943,12 @@ static int aic31xx_set_dai_fmt(struct snd_soc_dai *codec_dai, case SND_SOC_DAIFMT_DSP_A: dsp_a_val = 0x1; case SND_SOC_DAIFMT_DSP_B: - /* NOTE: BCLKINV bit value 1 equas NB and 0 equals IB */ - switch (fmt & SND_SOC_DAIFMT_INV_MASK) { - case SND_SOC_DAIFMT_NB_NF: - iface_reg2 |= AIC31XX_BCLKINV_MASK; - break; - case SND_SOC_DAIFMT_IB_NF: - break; - default: - return -EINVAL; - } + /* + * NOTE: This CODEC samples on the falling edge of BCLK in + * DSP mode, this is inverted compared to what most DAIs + * expect, so we invert for this mode + */ + iface_reg2 ^= AIC31XX_BCLKINV_MASK; iface_reg1 |= (AIC31XX_DSP_MODE << AIC31XX_IFACE1_DATATYPE_SHIFT); break; From 6442c97b5d846bf5ef6987e0ba5d35d31e58bc29 Mon Sep 17 00:00:00 2001 From: Roman Yeryomin Date: Thu, 5 Sep 2019 10:17:48 -0600 Subject: [PATCH 048/155] mtd: spi-nor: enable 4B opcodes for mx66l51235l commit d342b6a973af459f6104cad6effc8efc71a0558d upstream Signed-off-by: Roman Yeryomin Signed-off-by: Cyrille Pitchen Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman --- drivers/mtd/spi-nor/spi-nor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index 34ecc12ee3d93b..6c013341ef0921 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -1030,7 +1030,7 @@ static const struct flash_info spi_nor_ids[] = { { "mx25l25635e", INFO(0xc22019, 0, 64 * 1024, 512, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) }, { "mx25u25635f", INFO(0xc22539, 0, 64 * 1024, 512, SECT_4K | SPI_NOR_4B_OPCODES) }, { "mx25l25655e", INFO(0xc22619, 0, 64 * 1024, 512, 0) }, - { "mx66l51235l", INFO(0xc2201a, 0, 64 * 1024, 1024, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) }, + { "mx66l51235l", INFO(0xc2201a, 0, 64 * 1024, 1024, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ | SPI_NOR_4B_OPCODES) }, { "mx66u51235f", INFO(0xc2253a, 0, 64 * 1024, 1024, SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ | SPI_NOR_4B_OPCODES) }, { "mx66l1g45g", INFO(0xc2201b, 0, 64 * 1024, 2048, SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) }, { "mx66l1g55g", INFO(0xc2261b, 0, 64 * 1024, 2048, SPI_NOR_QUAD_READ) }, From cb47222af149a0959d2536b0868ea6fab59bee5b Mon Sep 17 00:00:00 2001 From: Vignesh R Date: Thu, 5 Sep 2019 10:17:49 -0600 Subject: [PATCH 049/155] mtd: spi-nor: cadence-quadspi: add a delay in write sequence commit 61dc8493bae9ba82a1c72edbc6c6065f6a94456a upstream As per 66AK2G02 TRM[1] SPRUHY8F section 11.15.5.3 Indirect Access Controller programming sequence, a delay equal to couple of QSPI master clock(~5ns) is required after setting CQSPI_REG_INDIRECTWR_START bit and writing data to the 
flash. Introduce a quirk flag CQSPI_NEEDS_WR_DELAY to handle this and set this flag for TI 66AK2G SoC. [1]http://www.ti.com/lit/ug/spruhy8f/spruhy8f.pdf Signed-off-by: Vignesh R Acked-by: Marek Vasut Signed-off-by: Cyrille Pitchen Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman --- drivers/mtd/spi-nor/cadence-quadspi.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/spi-nor/cadence-quadspi.c b/drivers/mtd/spi-nor/cadence-quadspi.c index f22dd34f4f8342..ff4edf4bb23c5f 100644 --- a/drivers/mtd/spi-nor/cadence-quadspi.c +++ b/drivers/mtd/spi-nor/cadence-quadspi.c @@ -38,6 +38,9 @@ #define CQSPI_NAME "cadence-qspi" #define CQSPI_MAX_CHIPSELECT 16 +/* Quirks */ +#define CQSPI_NEEDS_WR_DELAY BIT(0) + struct cqspi_st; struct cqspi_flash_pdata { @@ -76,6 +79,7 @@ struct cqspi_st { u32 fifo_depth; u32 fifo_width; u32 trigger_address; + u32 wr_delay; struct cqspi_flash_pdata f_pdata[CQSPI_MAX_CHIPSELECT]; }; @@ -623,6 +627,15 @@ static int cqspi_indirect_write_execute(struct spi_nor *nor, reinit_completion(&cqspi->transfer_complete); writel(CQSPI_REG_INDIRECTWR_START_MASK, reg_base + CQSPI_REG_INDIRECTWR); + /* + * As per 66AK2G02 TRM SPRUHY8F section 11.15.5.3 Indirect Access + * Controller programming sequence, couple of cycles of + * QSPI_REF_CLK delay is required for the above bit to + * be internally synchronized by the QSPI module. Provide 5 + * cycles of delay. + */ + if (cqspi->wr_delay) + ndelay(cqspi->wr_delay); while (remaining > 0) { size_t write_words, mod_bytes; @@ -1184,6 +1197,7 @@ static int cqspi_probe(struct platform_device *pdev) struct cqspi_st *cqspi; struct resource *res; struct resource *res_ahb; + unsigned long data; int ret; int irq; @@ -1241,6 +1255,10 @@ static int cqspi_probe(struct platform_device *pdev) } cqspi->master_ref_clk_hz = clk_get_rate(cqspi->clk); + data = (unsigned long)of_device_get_match_data(dev); + if (data & CQSPI_NEEDS_WR_DELAY) + cqspi->wr_delay = 5 * DIV_ROUND_UP(NSEC_PER_SEC, + cqspi->master_ref_clk_hz); ret = devm_request_irq(dev, irq, cqspi_irq_handler, 0, pdev->name, cqspi); @@ -1312,7 +1330,14 @@ static const struct dev_pm_ops cqspi__dev_pm_ops = { #endif static const struct of_device_id cqspi_dt_ids[] = { - {.compatible = "cdns,qspi-nor",}, + { + .compatible = "cdns,qspi-nor", + .data = (void *)0, + }, + { + .compatible = "ti,k2g-qspi", + .data = (void *)CQSPI_NEEDS_WR_DELAY, + }, { /* end of table */ } }; From e8cff9cabe821dc20d4aa7b1c95719bfae656ca8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 5 Sep 2019 10:17:50 -0600 Subject: [PATCH 050/155] misc: pci_endpoint_test: Prevent some integer overflows commit 378f79cab12b669928f3a4037f023837ead2ce0c upstream "size + max" can have an arithmetic overflow when we're allocating: orig_src_addr = dma_alloc_coherent(dev, size + alignment, ... I've added a few checks to prevent that. 
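The guard is the usual open-coded overflow check: instead of computing "size + alignment" and testing the (possibly wrapped) sum, compare size against SIZE_MAX first. A minimal sketch of the pattern applied in all three paths below:

	/* size + alignment would wrap around if size > SIZE_MAX - alignment */
	if (size > SIZE_MAX - alignment)
		goto err;

	orig_addr = dma_alloc_coherent(dev, size + alignment, &orig_phys_addr,
				       GFP_KERNEL);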
Fixes: 13107c60681f ("misc: pci_endpoint_test: Add support to provide aligned buffer addresses") Signed-off-by: Dan Carpenter Signed-off-by: Greg Kroah-Hartman Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman --- drivers/misc/pci_endpoint_test.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c index 9849bf18329953..504fa680825df9 100644 --- a/drivers/misc/pci_endpoint_test.c +++ b/drivers/misc/pci_endpoint_test.c @@ -226,6 +226,9 @@ static bool pci_endpoint_test_copy(struct pci_endpoint_test *test, size_t size) u32 src_crc32; u32 dst_crc32; + if (size > SIZE_MAX - alignment) + goto err; + orig_src_addr = dma_alloc_coherent(dev, size + alignment, &orig_src_phys_addr, GFP_KERNEL); if (!orig_src_addr) { @@ -311,6 +314,9 @@ static bool pci_endpoint_test_write(struct pci_endpoint_test *test, size_t size) size_t alignment = test->alignment; u32 crc32; + if (size > SIZE_MAX - alignment) + goto err; + orig_addr = dma_alloc_coherent(dev, size + alignment, &orig_phys_addr, GFP_KERNEL); if (!orig_addr) { @@ -369,6 +375,9 @@ static bool pci_endpoint_test_read(struct pci_endpoint_test *test, size_t size) size_t alignment = test->alignment; u32 crc32; + if (size > SIZE_MAX - alignment) + goto err; + orig_addr = dma_alloc_coherent(dev, size + alignment, &orig_phys_addr, GFP_KERNEL); if (!orig_addr) { From 354d644056094689337ecb6dd29b30ee7cf3a23f Mon Sep 17 00:00:00 2001 From: Keerthy Date: Thu, 5 Sep 2019 10:17:51 -0600 Subject: [PATCH 051/155] PCI: dra7xx: Add shutdown handler to cleanly turn off clocks commit 9c049bea083fea21373b8baf51fe49acbe24e105 upstream Add shutdown handler to cleanly turn off clocks. This will help in cases of kexec, wherein a new kernel can boot abruptly. Signed-off-by: Keerthy Signed-off-by: Bjorn Helgaas Acked-by: Kishon Vijay Abraham I Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman --- drivers/pci/dwc/pci-dra7xx.c | 17 +++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/pci/dwc/pci-dra7xx.c b/drivers/pci/dwc/pci-dra7xx.c index 7f5dfa169d0f42..2e0d0b29cdcbe8 100644 --- a/drivers/pci/dwc/pci-dra7xx.c +++ b/drivers/pci/dwc/pci-dra7xx.c @@ -817,6 +817,22 @@ static int dra7xx_pcie_resume_noirq(struct device *dev) } #endif +void dra7xx_pcie_shutdown(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct dra7xx_pcie *dra7xx = dev_get_drvdata(dev); + int ret; + + dra7xx_pcie_stop_link(dra7xx->pci); + + ret = pm_runtime_put_sync(dev); + if (ret < 0) + dev_dbg(dev, "pm_runtime_put_sync failed\n"); + + pm_runtime_disable(dev); + dra7xx_pcie_disable_phy(dra7xx); +} + static const struct dev_pm_ops dra7xx_pcie_pm_ops = { SET_SYSTEM_SLEEP_PM_OPS(dra7xx_pcie_suspend, dra7xx_pcie_resume) SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(dra7xx_pcie_suspend_noirq, @@ -830,5 +846,6 @@ static struct platform_driver dra7xx_pcie_driver = { .suppress_bind_attrs = true, .pm = &dra7xx_pcie_pm_ops, }, + .shutdown = dra7xx_pcie_shutdown, }; builtin_platform_driver_probe(dra7xx_pcie_driver, dra7xx_pcie_probe); From 3f2beab56a2b6c5eb91d8fb089efef1635c6c059 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 5 Sep 2019 10:17:52 -0600 Subject: [PATCH 052/155] misc: pci_endpoint_test: Fix BUG_ON error during pci_disable_msi() commit b7636e816adcb52bc96b6fb7bc9d141cbfd17ddb upstream pci_disable_msi() throws a kernel BUG if the driver has successfully requested an IRQ and not released it. Fix it here by freeing IRQs before invoking pci_disable_msi().
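The teardown order matters: every vector obtained via pci_alloc_irq_vectors() and bound with devm_request_irq() has to be freed first, otherwise pci_disable_msi() hits the BUG_ON for still-requested IRQs. A condensed sketch of the remove path from the diff below (num_irqs is the vector count saved at probe time):

	for (i = 0; i < test->num_irqs; i++)
		devm_free_irq(&pdev->dev, pdev->irq + i, test);

	pci_disable_msi(pdev);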
Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Bjorn Helgaas Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman --- drivers/misc/pci_endpoint_test.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c index 504fa680825df9..230f1e8538dc9c 100644 --- a/drivers/misc/pci_endpoint_test.c +++ b/drivers/misc/pci_endpoint_test.c @@ -92,6 +92,7 @@ struct pci_endpoint_test { void __iomem *bar[6]; struct completion irq_raised; int last_irq; + int num_irqs; /* mutex to protect the ioctls */ struct mutex mutex; struct miscdevice miscdev; @@ -514,6 +515,7 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev, irq = pci_alloc_irq_vectors(pdev, 1, 32, PCI_IRQ_MSI); if (irq < 0) dev_err(dev, "failed to get MSI interrupts\n"); + test->num_irqs = irq; } err = devm_request_irq(dev, pdev->irq, pci_endpoint_test_irqhandler, @@ -581,6 +583,9 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev, pci_iounmap(pdev, test->bar[bar]); } + for (i = 0; i < irq; i++) + devm_free_irq(dev, pdev->irq + i, test); + err_disable_msi: pci_disable_msi(pdev); pci_release_regions(pdev); @@ -594,6 +599,7 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev, static void pci_endpoint_test_remove(struct pci_dev *pdev) { int id; + int i; enum pci_barno bar; struct pci_endpoint_test *test = pci_get_drvdata(pdev); struct miscdevice *misc_device = &test->miscdev; @@ -609,6 +615,8 @@ static void pci_endpoint_test_remove(struct pci_dev *pdev) if (test->bar[bar]) pci_iounmap(pdev, test->bar[bar]); } + for (i = 0; i < test->num_irqs; i++) + devm_free_irq(&pdev->dev, pdev->irq + i, test); pci_disable_msi(pdev); pci_release_regions(pdev); pci_disable_device(pdev); From e36f4a7c8822fd5bcb95a4aecfd92f6129be1a5c Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 5 Sep 2019 10:17:53 -0600 Subject: [PATCH 053/155] mailbox: reset txdone_method TXDONE_BY_POLL if client knows_txdone commit 33cd7123ac0ba5360656ae27db453de5b9aa711f upstream Currently the mailbox framework sets txdone_method to TXDONE_BY_POLL if the controller sets txdone_by_poll. However some clients can have a mechanism to do TXDONE_BY_ACK which they can specify by knows_txdone. However, we end up setting both TXDONE_BY_POLL and TXDONE_BY_ACK in that case. In such a scenario, we may end up with the warnings below, as the tx ticker is run both by the mailbox framework and the client. WARNING: CPU: 1 PID: 0 at kernel/time/hrtimer.c:805 hrtimer_forward+0x88/0xd8 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.12.0-rc5 #242 Hardware name: ARM LTD ARM Juno Development Platform task: ffff8009768ca700 task.stack: ffff8009768f8000 PC is at hrtimer_forward+0x88/0xd8 LR is at txdone_hrtimer+0xd4/0xf8 Call trace: hrtimer_forward+0x88/0xd8 __hrtimer_run_queues+0xe4/0x158 hrtimer_interrupt+0xa4/0x220 arch_timer_handler_phys+0x30/0x40 handle_percpu_devid_irq+0x78/0x130 generic_handle_irq+0x24/0x38 __handle_domain_irq+0x5c/0xb8 gic_handle_irq+0x54/0xa8 This patch fixes the issue by resetting TXDONE_BY_POLL if the client has set knows_txdone.
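In other words, TXDONE_BY_ACK must replace TXDONE_BY_POLL rather than be OR-ed into it, so that exactly one party runs the tx tick; on channel release the controller's polling method is restored. Condensed from the diff below:

	if (chan->txdone_method == TXDONE_BY_POLL && cl->knows_txdone)
		chan->txdone_method = TXDONE_BY_ACK;	/* was: |= TXDONE_BY_ACK */

	/* ... and in mbox_free_channel() / pcc_mbox_free_channel(): */
	if (chan->txdone_method == TXDONE_BY_ACK)
		chan->txdone_method = TXDONE_BY_POLL;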
Cc: Alexey Klimov Signed-off-by: Sudeep Holla Signed-off-by: Jassi Brar Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman --- drivers/mailbox/mailbox.c | 4 ++-- drivers/mailbox/pcc.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c index 44b49a2676f03f..055c90b8253cbe 100644 --- a/drivers/mailbox/mailbox.c +++ b/drivers/mailbox/mailbox.c @@ -351,7 +351,7 @@ struct mbox_chan *mbox_request_channel(struct mbox_client *cl, int index) init_completion(&chan->tx_complete); if (chan->txdone_method == TXDONE_BY_POLL && cl->knows_txdone) - chan->txdone_method |= TXDONE_BY_ACK; + chan->txdone_method = TXDONE_BY_ACK; spin_unlock_irqrestore(&chan->lock, flags); @@ -420,7 +420,7 @@ void mbox_free_channel(struct mbox_chan *chan) spin_lock_irqsave(&chan->lock, flags); chan->cl = NULL; chan->active_req = NULL; - if (chan->txdone_method == (TXDONE_BY_POLL | TXDONE_BY_ACK)) + if (chan->txdone_method == TXDONE_BY_ACK) chan->txdone_method = TXDONE_BY_POLL; module_put(chan->mbox->dev->driver->owner); diff --git a/drivers/mailbox/pcc.c b/drivers/mailbox/pcc.c index 9b7005e1345e1a..27c2294be51afe 100644 --- a/drivers/mailbox/pcc.c +++ b/drivers/mailbox/pcc.c @@ -266,7 +266,7 @@ struct mbox_chan *pcc_mbox_request_channel(struct mbox_client *cl, init_completion(&chan->tx_complete); if (chan->txdone_method == TXDONE_BY_POLL && cl->knows_txdone) - chan->txdone_method |= TXDONE_BY_ACK; + chan->txdone_method = TXDONE_BY_ACK; spin_unlock_irqrestore(&chan->lock, flags); @@ -312,7 +312,7 @@ void pcc_mbox_free_channel(struct mbox_chan *chan) spin_lock_irqsave(&chan->lock, flags); chan->cl = NULL; chan->active_req = NULL; - if (chan->txdone_method == (TXDONE_BY_POLL | TXDONE_BY_ACK)) + if (chan->txdone_method == TXDONE_BY_ACK) chan->txdone_method = TXDONE_BY_POLL; spin_unlock_irqrestore(&chan->lock, flags); From a0128f369072548e4f3a0e9862a268fedf32c618 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 5 Sep 2019 10:17:54 -0600 Subject: [PATCH 054/155] ASoC: tlv320dac31xx: mark expected switch fall-through commit 09fc38c1af4cb888255e9ecf267bf9757c12885d upstream In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Addresses-Coverity-ID: 1195220 Signed-off-by: Gustavo A. R. Silva Signed-off-by: Mark Brown Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/tlv320aic31xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/tlv320aic31xx.c b/sound/soc/codecs/tlv320aic31xx.c index d3bd0bf15ddb19..cc95c15ceceb1a 100644 --- a/sound/soc/codecs/tlv320aic31xx.c +++ b/sound/soc/codecs/tlv320aic31xx.c @@ -941,7 +941,7 @@ static int aic31xx_set_dai_fmt(struct snd_soc_dai *codec_dai, case SND_SOC_DAIFMT_I2S: break; case SND_SOC_DAIFMT_DSP_A: - dsp_a_val = 0x1; + dsp_a_val = 0x1; /* fall through */ case SND_SOC_DAIFMT_DSP_B: /* * NOTE: This CODEC samples on the falling edge of BCLK in From 6a65aa55cb0ec892c40ab907af83452f7dea2a4a Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 5 Sep 2019 10:17:55 -0600 Subject: [PATCH 055/155] ASoC: davinci-mcasp: Handle return value of devm_kasprintf commit 0c8b794c4a10aaf7ac0d4a49be2b2638e2038adb upstream devm_kasprintf() can fail here and we must check its return value. 
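devm_kasprintf() returns NULL on allocation failure, so each of the three interrupt-name allocations needs a NULL check before the name is handed to devm_request_threaded_irq(). The pattern, as applied below:

	irq_name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "%s_common",
				  dev_name(&pdev->dev));
	if (!irq_name) {
		ret = -ENOMEM;
		goto err;
	}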
Signed-off-by: Arvind Yadav Acked-by: Peter Ujfalusi Signed-off-by: Mark Brown Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman --- sound/soc/davinci/davinci-mcasp.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sound/soc/davinci/davinci-mcasp.c b/sound/soc/davinci/davinci-mcasp.c index 0480ec4c803541..af6cd8b874f5c1 100644 --- a/sound/soc/davinci/davinci-mcasp.c +++ b/sound/soc/davinci/davinci-mcasp.c @@ -1894,6 +1894,10 @@ static int davinci_mcasp_probe(struct platform_device *pdev) if (irq >= 0) { irq_name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "%s_common", dev_name(&pdev->dev)); + if (!irq_name) { + ret = -ENOMEM; + goto err; + } ret = devm_request_threaded_irq(&pdev->dev, irq, NULL, davinci_mcasp_common_irq_handler, IRQF_ONESHOT | IRQF_SHARED, @@ -1911,6 +1915,10 @@ static int davinci_mcasp_probe(struct platform_device *pdev) if (irq >= 0) { irq_name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "%s_rx", dev_name(&pdev->dev)); + if (!irq_name) { + ret = -ENOMEM; + goto err; + } ret = devm_request_threaded_irq(&pdev->dev, irq, NULL, davinci_mcasp_rx_irq_handler, IRQF_ONESHOT, irq_name, mcasp); @@ -1926,6 +1934,10 @@ static int davinci_mcasp_probe(struct platform_device *pdev) if (irq >= 0) { irq_name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "%s_tx", dev_name(&pdev->dev)); + if (!irq_name) { + ret = -ENOMEM; + goto err; + } ret = devm_request_threaded_irq(&pdev->dev, irq, NULL, davinci_mcasp_tx_irq_handler, IRQF_ONESHOT, irq_name, mcasp); From d74abcff60ea82b15fee3c2298ba6cf761c4ad6c Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 5 Sep 2019 10:17:56 -0600 Subject: [PATCH 056/155] ASoC: davinci: Kill BUG_ON() usage commit befff4fbc27e19b14b343eb4a65d8f75d38b6230 upstream Don't use BUG_ON() for a non-critical sanity check on production systems. This patch replaces with a softer WARN_ON() and an error path. Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman --- sound/soc/davinci/davinci-mcasp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/soc/davinci/davinci-mcasp.c b/sound/soc/davinci/davinci-mcasp.c index af6cd8b874f5c1..b4e6f4a04cb6cf 100644 --- a/sound/soc/davinci/davinci-mcasp.c +++ b/sound/soc/davinci/davinci-mcasp.c @@ -1748,7 +1748,8 @@ static int davinci_mcasp_get_dma_type(struct davinci_mcasp *mcasp) PTR_ERR(chan)); return PTR_ERR(chan); } - BUG_ON(!chan->device || !chan->device->dev); + if (WARN_ON(!chan->device || !chan->device->dev)) + return -EINVAL; if (chan->device->dev->of_node) ret = of_property_read_string(chan->device->dev->of_node, From f5eca5cfb3a48e1699574ab3f8df2304f355758d Mon Sep 17 00:00:00 2001 From: Christophe Jaillet Date: Thu, 5 Sep 2019 10:17:57 -0600 Subject: [PATCH 057/155] ASoC: davinci-mcasp: Fix an error handling path in 'davinci_mcasp_probe()' commit 1b8b68b05d1868404316d32e20782b00442aba90 upstream All error handling paths in this function 'goto err' except this one. If one of the 2 previous memory allocations fails, we should go through the existing error handling path. Otherwise there is an unbalanced pm_runtime_enable()/pm_runtime_disable(). 
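Taking the shared error label keeps the runtime-PM reference count balanced, since a bare return skips the unwinding (including pm_runtime_disable()) performed after that label. Condensed from the diff below:

	if (!mcasp->chconstr[SNDRV_PCM_STREAM_PLAYBACK].list ||
	    !mcasp->chconstr[SNDRV_PCM_STREAM_CAPTURE].list) {
		ret = -ENOMEM;
		goto err;	/* unwind instead of returning directly */
	}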
Fixes: dd55ff8346a9 ("ASoC: davinci-mcasp: Add set_tdm_slots() support") Signed-off-by: Christophe JAILLET Acked-by: Peter Ujfalusi Signed-off-by: Mark Brown Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman --- sound/soc/davinci/davinci-mcasp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sound/soc/davinci/davinci-mcasp.c b/sound/soc/davinci/davinci-mcasp.c index b4e6f4a04cb6cf..07bac9ea65c41a 100644 --- a/sound/soc/davinci/davinci-mcasp.c +++ b/sound/soc/davinci/davinci-mcasp.c @@ -2022,8 +2022,10 @@ static int davinci_mcasp_probe(struct platform_device *pdev) GFP_KERNEL); if (!mcasp->chconstr[SNDRV_PCM_STREAM_PLAYBACK].list || - !mcasp->chconstr[SNDRV_PCM_STREAM_CAPTURE].list) - return -ENOMEM; + !mcasp->chconstr[SNDRV_PCM_STREAM_CAPTURE].list) { + ret = -ENOMEM; + goto err; + } ret = davinci_mcasp_set_ch_constraints(mcasp); if (ret) From dcb86e921dc7e5c2ed524404ceee77175bca5727 Mon Sep 17 00:00:00 2001 From: Claudio Foellmi Date: Thu, 5 Sep 2019 10:17:58 -0600 Subject: [PATCH 058/155] i2c: omap: Trigger bus recovery in lockup case commit 93367bfca98f36cece57c01dbce6ea1b4ac58245 upstream A very conservative check for bus activity (to prevent interference in multimaster setups) prevented the bus recovery methods from being triggered in the case that SDA or SCL was stuck low. This defeats the purpose of the recovery mechanism, which was introduced for exactly this situation (a slave device keeping SDA pulled down). Also added a check to make sure SDA is low before attempting recovery. If SDA is not stuck low, recovery will not help, so we can skip it. Note that bus lockups can persist across reboots. The only other options are to reset or power cycle the offending slave device, and many i2c slaves do not even have a reset pin. If we see that one of the lines is low for the entire timeout duration, we can actually be sure that there is no other master driving the bus. It is therefore safe for us to attempt a bus recovery. Signed-off-by: Claudio Foellmi Tested-by: Vignesh R Reviewed-by: Grygorii Strashko [wsa: fixed one return code to -EBUSY] Signed-off-by: Wolfram Sang Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-omap.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-omap.c b/drivers/i2c/busses/i2c-omap.c index 12ba183693d692..a03564f41ad0a2 100644 --- a/drivers/i2c/busses/i2c-omap.c +++ b/drivers/i2c/busses/i2c-omap.c @@ -486,6 +486,22 @@ static int omap_i2c_init(struct omap_i2c_dev *omap) return 0; } +/* + * Try bus recovery, but only if SDA is actually low. 
+ */ +static int omap_i2c_recover_bus(struct omap_i2c_dev *omap) +{ + u16 systest; + + systest = omap_i2c_read_reg(omap, OMAP_I2C_SYSTEST_REG); + if ((systest & OMAP_I2C_SYSTEST_SCL_I_FUNC) && + (systest & OMAP_I2C_SYSTEST_SDA_I_FUNC)) + return 0; /* bus seems to already be fine */ + if (!(systest & OMAP_I2C_SYSTEST_SCL_I_FUNC)) + return -EBUSY; /* recovery would not fix SCL */ + return i2c_recover_bus(&omap->adapter); +} + /* * Waiting on Bus Busy */ @@ -496,7 +512,7 @@ static int omap_i2c_wait_for_bb(struct omap_i2c_dev *omap) timeout = jiffies + OMAP_I2C_TIMEOUT; while (omap_i2c_read_reg(omap, OMAP_I2C_STAT_REG) & OMAP_I2C_STAT_BB) { if (time_after(jiffies, timeout)) - return i2c_recover_bus(&omap->adapter); + return omap_i2c_recover_bus(omap); msleep(1); } @@ -577,8 +593,13 @@ static int omap_i2c_wait_for_bb_valid(struct omap_i2c_dev *omap) } if (time_after(jiffies, timeout)) { + /* + * SDA or SCL were low for the entire timeout without + * any activity detected. Most likely, a slave is + * locking up the bus with no master driving the clock. + */ dev_warn(omap->dev, "timeout waiting for bus ready\n"); - return -ETIMEDOUT; + return omap_i2c_recover_bus(omap); } msleep(1); From 185c5fa6b2ad67b811c1886cc5b8013a04747a87 Mon Sep 17 00:00:00 2001 From: Zumeng Chen Date: Thu, 5 Sep 2019 10:17:59 -0600 Subject: [PATCH 059/155] cpufreq: ti-cpufreq: add missing of_node_put() commit 248aefdcc3a7e0cfbd014946b4dead63e750e71b upstream call of_node_put to release the refcount of np. Signed-off-by: Zumeng Chen Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman --- drivers/cpufreq/ti-cpufreq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/ti-cpufreq.c b/drivers/cpufreq/ti-cpufreq.c index 4bf47de6101fac..cadc324bedb48a 100644 --- a/drivers/cpufreq/ti-cpufreq.c +++ b/drivers/cpufreq/ti-cpufreq.c @@ -205,6 +205,7 @@ static int ti_cpufreq_init(void) np = of_find_node_by_path("/"); match = of_match_node(ti_cpufreq_of_match, np); + of_node_put(np); if (!match) return -ENODEV; From 508806ef60b5101e74868a723b34f2083071231e Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Tue, 31 Oct 2017 15:26:00 +0200 Subject: [PATCH 060/155] ARM: dts: dra7: Disable USB metastability workaround for USB2 commit b8c9c6fa2002b8fd4a9710f76f80f99c6046d48c upstream. The metastability workaround causes Erratic errors [1] on the HighSpeed USB PHY which can cause upto 2 seconds delay in enumerating to a USB host while in Gadget mode. Disable the Run/Stop metastability workaround to avoid this ill effect. We are aware that this opens up the opportunity for Run/Stop metastability, however this issue has never been observed in TI releases so we think that Run/Stop metastability is a lesser evil than the PHY Erratic errors. So disable it. [1] USB controller trace during gadget enumeration irq/90-dwc3-969 [000] d... 52.323145: dwc3_event: event (00000901): Erratic Error [U0] irq/90-dwc3-969 [000] d... 52.560646: dwc3_event: event (00000901): Erratic Error [U0] irq/90-dwc3-969 [000] d... 
52.798144: dwc3_event: event (00000901): Erratic Error [U0] Signed-off-by: Roger Quadros Acked-by: Felipe Balbi Signed-off-by: Tony Lindgren Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/dra7.dtsi | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/boot/dts/dra7.dtsi b/arch/arm/boot/dts/dra7.dtsi index 0bf354024ef55c..09686d73f94792 100644 --- a/arch/arm/boot/dts/dra7.dtsi +++ b/arch/arm/boot/dts/dra7.dtsi @@ -1540,6 +1540,7 @@ dr_mode = "otg"; snps,dis_u3_susphy_quirk; snps,dis_u2_susphy_quirk; + snps,dis_metastability_quirk; }; }; From 3dec71e388f95382d83ebb5589f0016eac4a6d2b Mon Sep 17 00:00:00 2001 From: Dave Chiluk Date: Tue, 23 Jul 2019 11:44:26 -0500 Subject: [PATCH 061/155] sched/fair: Fix low cpu usage with high throttling by removing expiration of cpu-local slices commit de53fd7aedb100f03e5d2231cfce0e4993282425 upstream. It has been observed that highly-threaded, non-cpu-bound applications running under cpu.cfs_quota_us constraints can hit a high percentage of periods throttled while simultaneously not consuming the allocated amount of quota. This use case is typical of user-interactive non-cpu bound applications, such as those running in kubernetes or mesos when run on multiple cpu cores. This has been root caused to cpu-local run queues being allocated per-cpu bandwidth slices, and then not fully using that slice within the period, at which point the slice and quota expire. This expiration of unused slice results in applications not being able to utilize the quota for which they are allocated. The non-expiration of per-cpu slices was recently fixed by 'commit 512ac999d275 ("sched/fair: Fix bandwidth timer clock drift condition")'. Prior to that it appears that this had been broken since at least 'commit 51f2176d74ac ("sched/fair: Fix unlocked reads of some cfs_b->quota/period")' which was introduced in v3.16-rc1 in 2014. That added the following conditional which resulted in slices never being expired. if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { /* extend local deadline, drift is bounded above by 2 ticks */ cfs_rq->runtime_expires += TICK_NSEC; Because this was broken for nearly 5 years, and has recently been fixed and is now being noticed by many users running kubernetes (https://github.com/kubernetes/kubernetes/issues/67577) it is my opinion that the mechanisms around expiring runtime should be removed altogether. This allows quota already allocated to per-cpu run-queues to live longer than the period boundary. This allows threads on runqueues that do not use much CPU to continue to use their remaining slice over a longer period of time than cpu.cfs_period_us. However, this helps prevent the above condition of hitting throttling while also not fully utilizing your cpu quota. This theoretically allows a machine to use slightly more than its allotted quota in some periods. This overflow would be bounded by the remaining quota left on each per-cpu runqueue. This is typically no more than min_cfs_rq_runtime=1ms per cpu. For CPU bound tasks this will change nothing, as they should theoretically fully utilize all of their quota in each period. For user-interactive tasks as described above this provides a much better user/application experience as their cpu utilization will more closely match the amount they requested when they hit throttling. This means that cpu limits no longer strictly apply per period for non-cpu bound applications, but that they are still accurate over longer timeframes. 
This greatly improves performance of high-thread-count, non-cpu bound applications with low cfs_quota_us allocation on high-core-count machines. In the case of an artificial testcase (10ms/100ms of quota on 80 CPU machine), this commit resulted in almost 30x performance improvement, while still maintaining correct cpu quota restrictions. That testcase is available at https://github.com/indeedeng/fibtest. Fixes: 512ac999d275 ("sched/fair: Fix bandwidth timer clock drift condition") Signed-off-by: Dave Chiluk Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Ben Segall Cc: Ingo Molnar Cc: John Hammond Cc: Jonathan Corbet Cc: Kyle Anderson Cc: Gabriel Munos Cc: Peter Oskolkov Cc: Cong Wang Cc: Brendan Gregg Link: https://lkml.kernel.org/r/1563900266-19734-2-git-send-email-chiluk+linux@indeed.com Signed-off-by: Greg Kroah-Hartman --- Documentation/scheduler/sched-bwc.txt | 45 +++++++++++++++++ kernel/sched/fair.c | 70 +++------------------------ kernel/sched/sched.h | 4 -- 3 files changed, 52 insertions(+), 67 deletions(-) diff --git a/Documentation/scheduler/sched-bwc.txt b/Documentation/scheduler/sched-bwc.txt index f6b1873f68abc6..de583fbbfe4238 100644 --- a/Documentation/scheduler/sched-bwc.txt +++ b/Documentation/scheduler/sched-bwc.txt @@ -90,6 +90,51 @@ There are two ways in which a group may become throttled: In case b) above, even though the child may have runtime remaining it will not be allowed to until the parent's runtime is refreshed. +CFS Bandwidth Quota Caveats +--------------------------- +Once a slice is assigned to a cpu it does not expire. However all but 1ms of +the slice may be returned to the global pool if all threads on that cpu become +unrunnable. This is configured at compile time by the min_cfs_rq_runtime +variable. This is a performance tweak that helps prevent added contention on +the global lock. + +The fact that cpu-local slices do not expire results in some interesting corner +cases that should be understood. + +For cgroup cpu constrained applications that are cpu limited this is a +relatively moot point because they will naturally consume the entirety of their +quota as well as the entirety of each cpu-local slice in each period. As a +result it is expected that nr_periods roughly equal nr_throttled, and that +cpuacct.usage will increase roughly equal to cfs_quota_us in each period. + +For highly-threaded, non-cpu bound applications this non-expiration nuance +allows applications to briefly burst past their quota limits by the amount of +unused slice on each cpu that the task group is running on (typically at most +1ms per cpu or as defined by min_cfs_rq_runtime). This slight burst only +applies if quota had been assigned to a cpu and then not fully used or returned +in previous periods. This burst amount will not be transferred between cores. +As a result, this mechanism still strictly limits the task group to quota +average usage, albeit over a longer time window than a single period. This +also limits the burst ability to no more than 1ms per cpu. This provides +better more predictable user experience for highly threaded applications with +small quota limits on high core count machines. It also eliminates the +propensity to throttle these applications while simultanously using less than +quota amounts of cpu. 
Another way to say this, is that by allowing the unused +portion of a slice to remain valid across periods we have decreased the +possibility of wastefully expiring quota on cpu-local silos that don't need a +full slice's amount of cpu time. + +The interaction between cpu-bound and non-cpu-bound-interactive applications +should also be considered, especially when single core usage hits 100%. If you +gave each of these applications half of a cpu-core and they both got scheduled +on the same CPU it is theoretically possible that the non-cpu bound application +will use up to 1ms additional quota in some periods, thereby preventing the +cpu-bound application from fully using its quota by that same amount. In these +instances it will be up to the CFS algorithm (see sched-design-CFS.rst) to +decide which application is chosen to run, as they will both be runnable and +have remaining quota. This runtime discrepancy will be made up in the following +periods when the interactive application idles. + Examples -------- 1. Limit a group to 1 CPU worth of runtime. diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 55a33009f9a545..d5c032ec193d6c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4106,8 +4106,6 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) now = sched_clock_cpu(smp_processor_id()); cfs_b->runtime = cfs_b->quota; - cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); - cfs_b->expires_seq++; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4129,8 +4127,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount, expires; - int expires_seq; + u64 amount = 0, min_amount; /* note: this is a positive sum as runtime_remaining <= 0 */ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -4147,61 +4144,17 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } - expires_seq = cfs_b->expires_seq; - expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; - /* - * we may have advanced our local expiration to account for allowed - * spread between our sched_clock and the one on which runtime was - * issued. - */ - if (cfs_rq->expires_seq != expires_seq) { - cfs_rq->expires_seq = expires_seq; - cfs_rq->runtime_expires = expires; - } return cfs_rq->runtime_remaining > 0; } -/* - * Note: This depends on the synchronization provided by sched_clock and the - * fact that rq->clock snapshots this value. - */ -static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - - /* if the deadline is ahead of our clock, nothing to do */ - if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) - return; - - if (cfs_rq->runtime_remaining < 0) - return; - - /* - * If the local deadline has passed we have to consider the - * possibility that our sched_clock is 'fast' and the global deadline - * has not truly expired. - * - * Fortunately we can check determine whether this the case by checking - * whether the global deadline(cfs_b->expires_seq) has advanced. 
- */ - if (cfs_rq->expires_seq == cfs_b->expires_seq) { - /* extend local deadline, drift is bounded above by 2 ticks */ - cfs_rq->runtime_expires += TICK_NSEC; - } else { - /* global deadline is ahead, expiration has passed */ - cfs_rq->runtime_remaining = 0; - } -} - static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; - expire_cfs_rq_runtime(cfs_rq); if (likely(cfs_rq->runtime_remaining > 0)) return; @@ -4387,8 +4340,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) resched_curr(rq); } -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, - u64 remaining, u64 expires) +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) { struct cfs_rq *cfs_rq; u64 runtime; @@ -4413,7 +4365,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, remaining -= runtime; cfs_rq->runtime_remaining += runtime; - cfs_rq->runtime_expires = expires; /* we check whether we're throttled above */ if (cfs_rq->runtime_remaining > 0) @@ -4438,7 +4389,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, */ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) { - u64 runtime, runtime_expires; + u64 runtime; int throttled; /* no need to continue the timer with no bandwidth constraint */ @@ -4466,8 +4417,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) /* account preceding periods in which throttling occurred */ cfs_b->nr_throttled += overrun; - runtime_expires = cfs_b->runtime_expires; - /* * This check is repeated as we are holding onto the new bandwidth while * we unthrottle. This can potentially race with an unthrottled group @@ -4480,8 +4429,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) cfs_b->distribute_running = 1; raw_spin_unlock(&cfs_b->lock); /* we can't nest cfs_b->lock while distributing bandwidth */ - runtime = distribute_cfs_runtime(cfs_b, runtime, - runtime_expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock(&cfs_b->lock); cfs_b->distribute_running = 0; @@ -4558,8 +4506,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) return; raw_spin_lock(&cfs_b->lock); - if (cfs_b->quota != RUNTIME_INF && - cfs_rq->runtime_expires == cfs_b->runtime_expires) { + if (cfs_b->quota != RUNTIME_INF) { cfs_b->runtime += slack_runtime; /* we are under rq->lock, defer unthrottling using a timer */ @@ -4591,7 +4538,6 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); - u64 expires; /* confirm we're still not at a refresh boundary */ raw_spin_lock(&cfs_b->lock); @@ -4608,7 +4554,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - expires = cfs_b->runtime_expires; if (runtime) cfs_b->distribute_running = 1; @@ -4617,11 +4562,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (!runtime) return; - runtime = distribute_cfs_runtime(cfs_b, runtime, expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock(&cfs_b->lock); - if (expires == cfs_b->runtime_expires) - cfs_b->runtime -= min(runtime, cfs_b->runtime); + cfs_b->runtime -= min(runtime, cfs_b->runtime); cfs_b->distribute_running = 0; 
raw_spin_unlock(&cfs_b->lock); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 452b56923c6dfd..268f560ec99867 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -280,8 +280,6 @@ struct cfs_bandwidth { ktime_t period; u64 quota, runtime; s64 hierarchical_quota; - u64 runtime_expires; - int expires_seq; short idle, period_active; struct hrtimer period_timer, slack_timer; @@ -489,8 +487,6 @@ struct cfs_rq { #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; - int expires_seq; - u64 runtime_expires; s64 runtime_remaining; u64 throttled_clock, throttled_clock_task; From da73a178efaccdfffe440047471cc4d40320a811 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 20 Aug 2019 14:40:55 -0400 Subject: [PATCH 062/155] sched/fair: Fix -Wunused-but-set-variable warnings commit 763a9ec06c409dcde2a761aac4bb83ff3938e0b3 upstream. Commit: de53fd7aedb1 ("sched/fair: Fix low cpu usage with high throttling by removing expiration of cpu-local slices") introduced a few compilation warnings: kernel/sched/fair.c: In function '__refill_cfs_bandwidth_runtime': kernel/sched/fair.c:4365:6: warning: variable 'now' set but not used [-Wunused-but-set-variable] kernel/sched/fair.c: In function 'start_cfs_bandwidth': kernel/sched/fair.c:4992:6: warning: variable 'overrun' set but not used [-Wunused-but-set-variable] Also, __refill_cfs_bandwidth_runtime() does no longer update the expiration time, so fix the comments accordingly. Signed-off-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Reviewed-by: Dave Chiluk Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: pauld@redhat.com Fixes: de53fd7aedb1 ("sched/fair: Fix low cpu usage with high throttling by removing expiration of cpu-local slices") Link: https://lkml.kernel.org/r/1566326455-8038-1-git-send-email-cai@lca.pw Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched/fair.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d5c032ec193d6c..feeb52880d3533 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4091,21 +4091,16 @@ static inline u64 sched_cfs_bandwidth_slice(void) } /* - * Replenish runtime according to assigned quota and update expiration time. - * We use sched_clock_cpu directly instead of rq->clock to avoid adding - * additional synchronization around rq->lock. + * Replenish runtime according to assigned quota. We use sched_clock_cpu + * directly instead of rq->clock to avoid adding additional synchronization + * around rq->lock. * * requires cfs_b->lock */ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { - u64 now; - - if (cfs_b->quota == RUNTIME_INF) - return; - - now = sched_clock_cpu(smp_processor_id()); - cfs_b->runtime = cfs_b->quota; + if (cfs_b->quota != RUNTIME_INF) + cfs_b->runtime = cfs_b->quota; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) From 75644ed66fd2985c7119309c1c48090736e71851 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 5 Jan 2018 08:26:46 -0800 Subject: [PATCH 063/155] lib/scatterlist: Introduce sgl_alloc() and sgl_free() commit e80a0af4759a164214f02da157a3800753ce135f upstream. Many kernel drivers contain code that allocates and frees both a scatterlist and the pages that populate that scatterlist. Introduce functions in lib/scatterlist.c that perform these tasks instead of duplicating this functionality in multiple drivers. 
Only include these functions in the build if CONFIG_SGL_ALLOC=y to avoid that the kernel size increases if this functionality is not used. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- include/linux/scatterlist.h | 10 ++++ lib/Kconfig | 4 ++ lib/scatterlist.c | 105 ++++++++++++++++++++++++++++++++++++ 3 files changed, 119 insertions(+) diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index d87dfa41142d4d..8b7bce207229cb 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -267,6 +267,16 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, unsigned long offset, unsigned long size, gfp_t gfp_mask); +#ifdef CONFIG_SGL_ALLOC +struct scatterlist *sgl_alloc_order(unsigned long long length, + unsigned int order, bool chainable, + gfp_t gfp, unsigned int *nent_p); +struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp, + unsigned int *nent_p); +void sgl_free_order(struct scatterlist *sgl, int order); +void sgl_free(struct scatterlist *sgl); +#endif /* CONFIG_SGL_ALLOC */ + size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip, bool to_buffer); diff --git a/lib/Kconfig b/lib/Kconfig index b1445b22a6def4..8396c4cfa1aba3 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -413,6 +413,10 @@ config HAS_DMA depends on !NO_DMA default y +config SGL_ALLOC + bool + default n + config DMA_NOOP_OPS bool depends on HAS_DMA && (!64BIT || ARCH_DMA_ADDR_T_64BIT) diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 355f2e90b72c83..11fce289d11667 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -433,6 +433,111 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, } EXPORT_SYMBOL(sg_alloc_table_from_pages); +#ifdef CONFIG_SGL_ALLOC + +/** + * sgl_alloc_order - allocate a scatterlist and its pages + * @length: Length in bytes of the scatterlist. Must be at least one + * @order: Second argument for alloc_pages() + * @chainable: Whether or not to allocate an extra element in the scatterlist + * for scatterlist chaining purposes + * @gfp: Memory allocation flags + * @nent_p: [out] Number of entries in the scatterlist that have pages + * + * Returns: A pointer to an initialized scatterlist or %NULL upon failure. 
+ */ +struct scatterlist *sgl_alloc_order(unsigned long long length, + unsigned int order, bool chainable, + gfp_t gfp, unsigned int *nent_p) +{ + struct scatterlist *sgl, *sg; + struct page *page; + unsigned int nent, nalloc; + u32 elem_len; + + nent = round_up(length, PAGE_SIZE << order) >> (PAGE_SHIFT + order); + /* Check for integer overflow */ + if (length > (nent << (PAGE_SHIFT + order))) + return NULL; + nalloc = nent; + if (chainable) { + /* Check for integer overflow */ + if (nalloc + 1 < nalloc) + return NULL; + nalloc++; + } + sgl = kmalloc_array(nalloc, sizeof(struct scatterlist), + (gfp & ~GFP_DMA) | __GFP_ZERO); + if (!sgl) + return NULL; + + sg_init_table(sgl, nent); + sg = sgl; + while (length) { + elem_len = min_t(u64, length, PAGE_SIZE << order); + page = alloc_pages(gfp, order); + if (!page) { + sgl_free(sgl); + return NULL; + } + + sg_set_page(sg, page, elem_len, 0); + length -= elem_len; + sg = sg_next(sg); + } + WARN_ON_ONCE(sg); + if (nent_p) + *nent_p = nent; + return sgl; +} +EXPORT_SYMBOL(sgl_alloc_order); + +/** + * sgl_alloc - allocate a scatterlist and its pages + * @length: Length in bytes of the scatterlist + * @gfp: Memory allocation flags + * @nent_p: [out] Number of entries in the scatterlist + * + * Returns: A pointer to an initialized scatterlist or %NULL upon failure. + */ +struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp, + unsigned int *nent_p) +{ + return sgl_alloc_order(length, 0, false, gfp, nent_p); +} +EXPORT_SYMBOL(sgl_alloc); + +/** + * sgl_free_order - free a scatterlist and its pages + * @sgl: Scatterlist with one or more elements + * @order: Second argument for __free_pages() + */ +void sgl_free_order(struct scatterlist *sgl, int order) +{ + struct scatterlist *sg; + struct page *page; + + for (sg = sgl; sg; sg = sg_next(sg)) { + page = sg_page(sg); + if (page) + __free_pages(page, order); + } + kfree(sgl); +} +EXPORT_SYMBOL(sgl_free_order); + +/** + * sgl_free - free a scatterlist and its pages + * @sgl: Scatterlist with one or more elements + */ +void sgl_free(struct scatterlist *sgl) +{ + sgl_free_order(sgl, 0); +} +EXPORT_SYMBOL(sgl_free); + +#endif /* CONFIG_SGL_ALLOC */ + void __sg_page_iter_start(struct sg_page_iter *piter, struct scatterlist *sglist, unsigned int nents, unsigned long pgoffset) From 21e952b4495a5bfb7f65467dc2e36393782a0fa8 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 24 Jan 2019 14:46:42 -0700 Subject: [PATCH 064/155] usbip: Fix vhci_urb_enqueue() URB null transfer buffer error path commit 2c904963b1dd2acd4bc785b6c72e10a6283c2081 upstream. Fix vhci_urb_enqueue() to print debug msg and return error instead of failing with BUG_ON. 
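A null transfer buffer with a non-zero length is malformed input from the peer, not a broken kernel invariant, so it is now rejected gracefully instead of taking the machine down. Condensed from the diff below:

	if (!urb->transfer_buffer && urb->transfer_buffer_length) {
		dev_dbg(dev, "Null URB transfer buffer\n");
		return -EINVAL;
	}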
Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/vhci_hcd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c index 84e2d7edaa5c7b..14b17ea81deb87 100644 --- a/drivers/usb/usbip/vhci_hcd.c +++ b/drivers/usb/usbip/vhci_hcd.c @@ -716,8 +716,10 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag } vdev = &vhci_hcd->vdev[portnum-1]; - /* patch to usb_sg_init() is in 2.5.60 */ - BUG_ON(!urb->transfer_buffer && urb->transfer_buffer_length); + if (!urb->transfer_buffer && urb->transfer_buffer_length) { + dev_dbg(dev, "Null URB transfer buffer\n"); + return -EINVAL; + } spin_lock_irqsave(&vhci->lock, flags); From a422263cee459e50be15d6a93a299e5503e849df Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Fri, 15 Dec 2017 10:05:15 -0700 Subject: [PATCH 065/155] usbip: stub_rx: fix static checker warning on unnecessary checks commit 10c90120930628e8b959bf58d4a0aaef3ae5d945 upstream. Fix the following static checker warnings: The patch c6688ef9f297: "usbip: fix stub_rx: harden CMD_SUBMIT path to handle malicious input" from Dec 7, 2017, leads to the following static checker warning: drivers/usb/usbip/stub_rx.c:346 get_pipe() warn: impossible condition '(pdu->u.cmd_submit.transfer_buffer_length > ((~0 >> 1))) => (s32min-s32max > s32max)' drivers/usb/usbip/stub_rx.c:486 stub_recv_cmd_submit() warn: always true condition '(pdu->u.cmd_submit.transfer_buffer_length <= ((~0 >> 1))) => (s32min-s32max <= s32max)' Reported-by: Dan Carpenter Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/stub_rx.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c index 777a4058c407be..d47176f9c31031 100644 --- a/drivers/usb/usbip/stub_rx.c +++ b/drivers/usb/usbip/stub_rx.c @@ -353,14 +353,6 @@ static int get_pipe(struct stub_device *sdev, struct usbip_header *pdu) epd = &ep->desc; - /* validate transfer_buffer_length */ - if (pdu->u.cmd_submit.transfer_buffer_length > INT_MAX) { - dev_err(&sdev->udev->dev, - "CMD_SUBMIT: -EMSGSIZE transfer_buffer_length %d\n", - pdu->u.cmd_submit.transfer_buffer_length); - return -1; - } - if (usb_endpoint_xfer_control(epd)) { if (dir == USBIP_DIR_OUT) return usb_sndctrlpipe(udev, epnum); @@ -487,8 +479,7 @@ static void stub_recv_cmd_submit(struct stub_device *sdev, } /* allocate urb transfer buffer, if needed */ - if (pdu->u.cmd_submit.transfer_buffer_length > 0 && - pdu->u.cmd_submit.transfer_buffer_length <= INT_MAX) { + if (pdu->u.cmd_submit.transfer_buffer_length > 0) { priv->urb->transfer_buffer = kzalloc(pdu->u.cmd_submit.transfer_buffer_length, GFP_KERNEL); From cf05a68eba4016cd8497022d6e6c3b19666fbcc4 Mon Sep 17 00:00:00 2001 From: Suwan Kim Date: Wed, 28 Aug 2019 12:27:41 +0900 Subject: [PATCH 066/155] usbip: Implement SG support to vhci-hcd and stub driver commit ea44d190764b4422af4d1c29eaeb9e69e353b406 upstream. There are bugs on vhci with usb 3.0 storage device. In USB, each SG list entry buffer should be divisible by the bulk max packet size. But with native SG support, this problem doesn't matter because the SG buffer is treated as contiguous buffer. But without native SG support, USB storage driver breaks SG list into several URBs and the error occurs because of a buffer size of URB that cannot be divided by the bulk max packet size. The error situation is as follows. 
When the USB storage driver requests 31.5 KB of data with an SG list made of a 3584-byte buffer followed by seven 4096-byte buffers, the USB storage driver splits this SG list into several URBs, because vhci doesn't support SG, and sends them separately. So the first URB buffer size is 3584 bytes. When returning data, the USB 3.0 device sends packets of 1024 bytes, because the max packet size of the BULK pipe is 1024 bytes; so the device sends 4096 bytes. But the first URB buffer is only 3584 bytes, so the host controller terminates the transfer even though there is more data to receive. So vhci needs to support SG transfer to prevent this error.

In this patch, vhci supports SG regardless of whether the server's host controller supports SG or not, because the stub driver splits the SG list into several URBs if the server's host controller doesn't support SG. To support SG, vhci sets the URB_DMA_MAP_SG flag in urb->transfer_flags if the URB has an SG list, and this flag tells the stub driver to use the SG list. After receiving the URB back from the stub driver, vhci clears URB_DMA_MAP_SG to avoid unnecessary DMA unmapping in the HCD. vhci sends each SG list entry to the stub driver. The stub driver then takes the total length of the buffer and allocates an SG table and pages for that total length by calling sgl_alloc(). After the stub driver receives the completed URB, it again sends each SG list entry back to vhci.

If the server's host controller doesn't support SG, the stub driver breaks the single SG request into several URBs and submits them to the server's host controller. When all the split URBs have completed, the stub driver reassembles them into a single return command and sends it to vhci.

Moreover, in the situation where vhci supports SG but the stub driver does not, or vice versa, usbip works normally: because there is no protocol modification, there is no problem in communication between server and client even if one side runs a kernel without SG support. In the case where vhci supports SG and the stub driver doesn't, vhci sends only the total length of the buffer to the stub driver, as it did before this patch, so the stub driver only needs to allocate the required length of buffer using plain kmalloc(), regardless of whether vhci supports SG or not. But the stub driver then has to allocate with kmalloc() a buffer as large as the total length of the SG buffer, which is quite big when vhci sends an SG request, so buffer allocation has overhead in this situation. If the stub driver needs to send a data buffer to vhci because of an IN pipe, it likewise sends only the total length of the buffer as metadata and then sends the real data, as vhci does; vhci then receives the data from the stub driver and stores it into the corresponding buffer of each SG list entry. And for the case where the stub driver supports SG and vhci doesn't, the USB storage driver sees that vhci doesn't support SG and sends the request to the stub driver split into multiple URBs, so the stub driver allocates a buffer for each URB with kmalloc() as it did before this patch.

* Test environment

Tests use two different machines and two different kernel versions to create the mismatch situation between the client and the server where vhci supports SG but the stub driver does not, or vice versa. All tests are conducted both with full SG support, where both vhci and stub support SG, and with half SG support, the mismatch situation. The SG-capable test kernel is 5.3-rc6 with commit "usb: add a HCD_DMA flag instead of guestimating DMA capabilities" applied, to avoid unnecessary DMA mapping and unmapping.
- Test kernel versions
  - 5.3-rc6 with SG support
  - 5.1.20-200.fc29.x86_64 without SG support

* SG support test
- Test devices
  - Super-speed storage device: SanDisk Ultra USB 3.0
  - High-speed storage device: SMI corporation USB 2.0 flash drive
- Test description
  Test read and write operations on a mass storage device, which uses
  BULK transfers. In the test, the client reads and writes files larger
  than 1G, and everything works normally.

* Regression test
- Test devices
  - Super-speed device: Logitech Brio webcam
  - High-speed device: Logitech C920 HD Pro webcam
  - Full-speed devices: Logitech bluetooth mouse, Britz BR-Orion speaker
  - Low-speed device: Logitech wired mouse
- Test description
  Move-and-click tests for the mice. To test the webcams, use
  gnome-cheese. To test the speaker, play music and video on the client.
  All work normally.

* VUDC compatibility test
VUDC also works well with this patch. Tests are done with two USB
gadgets created via the CONFIGFS USB gadget; both use the BULK pipe.
1. Serial gadget
2. Mass storage gadget
- Serial gadget test
  The serial gadget on the host sends and receives data using the cat
  command on /dev/ttyGS. The client uses minicom to communicate with
  the serial gadget.
- Mass storage gadget test
  After connecting the gadget with vhci, use "dd" to test read and
  write operations on the client side.
  Read - dd if=/dev/sd iflag=direct of=/dev/null bs=1G count=1
  Write - dd if= iflag=direct of=/dev/sd bs=1G count=1

Signed-off-by: Suwan Kim
Acked-by: Shuah Khan
Link: https://lore.kernel.org/r/20190828032741.12234-1-suwan.kim027@gmail.com
Signed-off-by: Greg Kroah-Hartman
---
 drivers/usb/usbip/Kconfig | 1 +
 drivers/usb/usbip/stub.h | 7 +-
 drivers/usb/usbip/stub_main.c | 57 ++++++---
 drivers/usb/usbip/stub_rx.c | 204 ++++++++++++++++++++++---------
 drivers/usb/usbip/stub_tx.c | 99 +++++++++++----
 drivers/usb/usbip/usbip_common.c | 59 ++++++---
 drivers/usb/usbip/vhci_hcd.c | 12 +-
 drivers/usb/usbip/vhci_rx.c | 3 +
 drivers/usb/usbip/vhci_tx.c | 66 ++++++++--
 9 files changed, 381 insertions(+), 127 deletions(-)

diff --git a/drivers/usb/usbip/Kconfig b/drivers/usb/usbip/Kconfig
index a20b65cb6678f9..8276a20ecea7e4 100644
--- a/drivers/usb/usbip/Kconfig
+++ b/drivers/usb/usbip/Kconfig
@@ -2,6 +2,7 @@ config USBIP_CORE
 	tristate "USB/IP support"
 	depends on NET
 	select USB_COMMON
+	select SGL_ALLOC
 	---help---
 	  This enables pushing USB packets over IP to allow remote
 	  machines direct access to USB devices.
It provides the diff --git a/drivers/usb/usbip/stub.h b/drivers/usb/usbip/stub.h index 84c0599b45b7bc..d9d14d87594992 100644 --- a/drivers/usb/usbip/stub.h +++ b/drivers/usb/usbip/stub.h @@ -66,7 +66,11 @@ struct stub_priv { unsigned long seqnum; struct list_head list; struct stub_device *sdev; - struct urb *urb; + struct urb **urbs; + struct scatterlist *sgl; + int num_urbs; + int completed_urbs; + int urb_status; int unlinking; }; @@ -100,6 +104,7 @@ extern struct usb_device_driver stub_driver; struct bus_id_priv *get_busid_priv(const char *busid); void put_busid_priv(struct bus_id_priv *bid); int del_match_busid(char *busid); +void stub_free_priv_and_urb(struct stub_priv *priv); void stub_device_cleanup_urbs(struct stub_device *sdev); /* stub_rx.c */ diff --git a/drivers/usb/usbip/stub_main.c b/drivers/usb/usbip/stub_main.c index 108dd65fbfbc58..2dc662cf06943e 100644 --- a/drivers/usb/usbip/stub_main.c +++ b/drivers/usb/usbip/stub_main.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "usbip_common.h" #include "stub.h" @@ -297,13 +298,49 @@ static struct stub_priv *stub_priv_pop_from_listhead(struct list_head *listhead) struct stub_priv *priv, *tmp; list_for_each_entry_safe(priv, tmp, listhead, list) { - list_del(&priv->list); + list_del_init(&priv->list); return priv; } return NULL; } +void stub_free_priv_and_urb(struct stub_priv *priv) +{ + struct urb *urb; + int i; + + for (i = 0; i < priv->num_urbs; i++) { + urb = priv->urbs[i]; + + if (!urb) + return; + + kfree(urb->setup_packet); + urb->setup_packet = NULL; + + + if (urb->transfer_buffer && !priv->sgl) { + kfree(urb->transfer_buffer); + urb->transfer_buffer = NULL; + } + + if (urb->num_sgs) { + sgl_free(urb->sg); + urb->sg = NULL; + urb->num_sgs = 0; + } + + usb_free_urb(urb); + } + if (!list_empty(&priv->list)) + list_del(&priv->list); + if (priv->sgl) + sgl_free(priv->sgl); + kfree(priv->urbs); + kmem_cache_free(stub_priv_cache, priv); +} + static struct stub_priv *stub_priv_pop(struct stub_device *sdev) { unsigned long flags; @@ -330,25 +367,15 @@ static struct stub_priv *stub_priv_pop(struct stub_device *sdev) void stub_device_cleanup_urbs(struct stub_device *sdev) { struct stub_priv *priv; - struct urb *urb; + int i; dev_dbg(&sdev->udev->dev, "Stub device cleaning up urbs\n"); while ((priv = stub_priv_pop(sdev))) { - urb = priv->urb; - dev_dbg(&sdev->udev->dev, "free urb seqnum %lu\n", - priv->seqnum); - usb_kill_urb(urb); - - kmem_cache_free(stub_priv_cache, priv); + for (i = 0; i < priv->num_urbs; i++) + usb_kill_urb(priv->urbs[i]); - kfree(urb->transfer_buffer); - urb->transfer_buffer = NULL; - - kfree(urb->setup_packet); - urb->setup_packet = NULL; - - usb_free_urb(urb); + stub_free_priv_and_urb(priv); } } diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c index d47176f9c31031..8812d3edade153 100644 --- a/drivers/usb/usbip/stub_rx.c +++ b/drivers/usb/usbip/stub_rx.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "usbip_common.h" #include "stub.h" @@ -215,7 +216,7 @@ static void tweak_special_requests(struct urb *urb) static int stub_recv_cmd_unlink(struct stub_device *sdev, struct usbip_header *pdu) { - int ret; + int ret, i; unsigned long flags; struct stub_priv *priv; @@ -260,12 +261,14 @@ static int stub_recv_cmd_unlink(struct stub_device *sdev, * so a driver in a client host will know the failure * of the unlink request ? 
 */
-	ret = usb_unlink_urb(priv->urb);
-	if (ret != -EINPROGRESS)
-		dev_err(&priv->urb->dev->dev,
-			"failed to unlink a urb # %lu, ret %d\n",
-			priv->seqnum, ret);
-
+	for (i = priv->completed_urbs; i < priv->num_urbs; i++) {
+		ret = usb_unlink_urb(priv->urbs[i]);
+		if (ret != -EINPROGRESS)
+			dev_err(&priv->urbs[i]->dev->dev,
+				"failed to unlink %d/%d urb of seqnum %lu, ret %d\n",
+				i + 1, priv->num_urbs,
+				priv->seqnum, ret);
+	}
 	return 0;
 }

@@ -450,14 +453,36 @@ static void masking_bogus_flags(struct urb *urb)
 	urb->transfer_flags &= allowed;
 }

+static int stub_recv_xbuff(struct usbip_device *ud, struct stub_priv *priv)
+{
+	int ret;
+	int i;
+
+	for (i = 0; i < priv->num_urbs; i++) {
+		ret = usbip_recv_xbuff(ud, priv->urbs[i]);
+		if (ret < 0)
+			break;
+	}
+
+	return ret;
+}
+
 static void stub_recv_cmd_submit(struct stub_device *sdev,
				 struct usbip_header *pdu)
 {
-	int ret;
 	struct stub_priv *priv;
 	struct usbip_device *ud = &sdev->ud;
 	struct usb_device *udev = sdev->udev;
+	struct scatterlist *sgl = NULL, *sg;
+	void *buffer = NULL;
+	unsigned long long buf_len;
+	int nents;
+	int num_urbs = 1;
 	int pipe = get_pipe(sdev, pdu);
+	int use_sg = pdu->u.cmd_submit.transfer_flags & URB_DMA_MAP_SG;
+	int support_sg = 1;
+	int np = 0;
+	int ret, i;

 	if (pipe == -1)
 		return;
@@ -466,76 +491,139 @@ static void stub_recv_cmd_submit(struct stub_device *sdev,
 	if (!priv)
 		return;

-	/* setup a urb */
-	if (usb_pipeisoc(pipe))
-		priv->urb = usb_alloc_urb(pdu->u.cmd_submit.number_of_packets,
-			GFP_KERNEL);
-	else
-		priv->urb = usb_alloc_urb(0, GFP_KERNEL);
+	buf_len = (unsigned long long)pdu->u.cmd_submit.transfer_buffer_length;

-	if (!priv->urb) {
-		usbip_event_add(ud, SDEV_EVENT_ERROR_MALLOC);
-		return;
+	/* allocate urb transfer buffer, if needed */
+	if (buf_len) {
+		if (use_sg) {
+			sgl = sgl_alloc(buf_len, GFP_KERNEL, &nents);
+			if (!sgl)
+				goto err_malloc;
+		} else {
+			buffer = kzalloc(buf_len, GFP_KERNEL);
+			if (!buffer)
+				goto err_malloc;
+		}
 	}

-	/* allocate urb transfer buffer, if needed */
-	if (pdu->u.cmd_submit.transfer_buffer_length > 0) {
-		priv->urb->transfer_buffer =
-			kzalloc(pdu->u.cmd_submit.transfer_buffer_length,
-				GFP_KERNEL);
-		if (!priv->urb->transfer_buffer) {
+	/* Check if the server's HCD supports SG */
+	if (use_sg && !udev->bus->sg_tablesize) {
+		/*
+		 * If the server's HCD doesn't support SG, break a single SG
+		 * request into several URBs and map each SG list entry to
+		 * corresponding URB buffer. The previously allocated SG
+		 * list is stored in priv->sgl (If the server's HCD supports SG,
+		 * SG list is stored only in urb->sg) and it is used as an
+		 * indicator that the server split a single SG request into
+		 * several URBs. Later, priv->sgl is used by stub_complete() and
+		 * stub_send_ret_submit() to reassemble the divided URBs.
+ */ + support_sg = 0; + num_urbs = nents; + priv->completed_urbs = 0; + pdu->u.cmd_submit.transfer_flags &= ~URB_DMA_MAP_SG; + } + + /* allocate urb array */ + priv->num_urbs = num_urbs; + priv->urbs = kmalloc_array(num_urbs, sizeof(*priv->urbs), GFP_KERNEL); + if (!priv->urbs) + goto err_urbs; + + /* setup a urb */ + if (support_sg) { + if (usb_pipeisoc(pipe)) + np = pdu->u.cmd_submit.number_of_packets; + + priv->urbs[0] = usb_alloc_urb(np, GFP_KERNEL); + if (!priv->urbs[0]) + goto err_urb; + + if (buf_len) { + if (use_sg) { + priv->urbs[0]->sg = sgl; + priv->urbs[0]->num_sgs = nents; + priv->urbs[0]->transfer_buffer = NULL; + } else { + priv->urbs[0]->transfer_buffer = buffer; + } + } + + /* copy urb setup packet */ + priv->urbs[0]->setup_packet = kmemdup(&pdu->u.cmd_submit.setup, + 8, GFP_KERNEL); + if (!priv->urbs[0]->setup_packet) { usbip_event_add(ud, SDEV_EVENT_ERROR_MALLOC); return; } - } - /* copy urb setup packet */ - priv->urb->setup_packet = kmemdup(&pdu->u.cmd_submit.setup, 8, - GFP_KERNEL); - if (!priv->urb->setup_packet) { - dev_err(&udev->dev, "allocate setup_packet\n"); - usbip_event_add(ud, SDEV_EVENT_ERROR_MALLOC); - return; + usbip_pack_pdu(pdu, priv->urbs[0], USBIP_CMD_SUBMIT, 0); + } else { + for_each_sg(sgl, sg, nents, i) { + priv->urbs[i] = usb_alloc_urb(0, GFP_KERNEL); + /* The URBs which is previously allocated will be freed + * in stub_device_cleanup_urbs() if error occurs. + */ + if (!priv->urbs[i]) + goto err_urb; + + usbip_pack_pdu(pdu, priv->urbs[i], USBIP_CMD_SUBMIT, 0); + priv->urbs[i]->transfer_buffer = sg_virt(sg); + priv->urbs[i]->transfer_buffer_length = sg->length; + } + priv->sgl = sgl; } - /* set other members from the base header of pdu */ - priv->urb->context = (void *) priv; - priv->urb->dev = udev; - priv->urb->pipe = pipe; - priv->urb->complete = stub_complete; + for (i = 0; i < num_urbs; i++) { + /* set other members from the base header of pdu */ + priv->urbs[i]->context = (void *) priv; + priv->urbs[i]->dev = udev; + priv->urbs[i]->pipe = pipe; + priv->urbs[i]->complete = stub_complete; - usbip_pack_pdu(pdu, priv->urb, USBIP_CMD_SUBMIT, 0); + /* no need to submit an intercepted request, but harmless? */ + tweak_special_requests(priv->urbs[i]); + masking_bogus_flags(priv->urbs[i]); + } - if (usbip_recv_xbuff(ud, priv->urb) < 0) + if (stub_recv_xbuff(ud, priv) < 0) return; - if (usbip_recv_iso(ud, priv->urb) < 0) + if (usbip_recv_iso(ud, priv->urbs[0]) < 0) return; - /* no need to submit an intercepted request, but harmless? */ - tweak_special_requests(priv->urb); - - masking_bogus_flags(priv->urb); /* urb is now ready to submit */ - ret = usb_submit_urb(priv->urb, GFP_KERNEL); - - if (ret == 0) - usbip_dbg_stub_rx("submit urb ok, seqnum %u\n", - pdu->base.seqnum); - else { - dev_err(&udev->dev, "submit_urb error, %d\n", ret); - usbip_dump_header(pdu); - usbip_dump_urb(priv->urb); - - /* - * Pessimistic. - * This connection will be discarded. - */ - usbip_event_add(ud, SDEV_EVENT_ERROR_SUBMIT); + for (i = 0; i < priv->num_urbs; i++) { + ret = usb_submit_urb(priv->urbs[i], GFP_KERNEL); + + if (ret == 0) + usbip_dbg_stub_rx("submit urb ok, seqnum %u\n", + pdu->base.seqnum); + else { + dev_err(&udev->dev, "submit_urb error, %d\n", ret); + usbip_dump_header(pdu); + usbip_dump_urb(priv->urbs[i]); + + /* + * Pessimistic. + * This connection will be discarded. 
+ */ + usbip_event_add(ud, SDEV_EVENT_ERROR_SUBMIT); + break; + } } usbip_dbg_stub_rx("Leave\n"); + return; + +err_urb: + kfree(priv->urbs); +err_urbs: + kfree(buffer); + sgl_free(sgl); +err_malloc: + usbip_event_add(ud, SDEV_EVENT_ERROR_MALLOC); } /* recv a pdu */ diff --git a/drivers/usb/usbip/stub_tx.c b/drivers/usb/usbip/stub_tx.c index 96aa375b80d9cc..45c34a37432e07 100644 --- a/drivers/usb/usbip/stub_tx.c +++ b/drivers/usb/usbip/stub_tx.c @@ -19,25 +19,11 @@ #include #include +#include #include "usbip_common.h" #include "stub.h" -static void stub_free_priv_and_urb(struct stub_priv *priv) -{ - struct urb *urb = priv->urb; - - kfree(urb->setup_packet); - urb->setup_packet = NULL; - - kfree(urb->transfer_buffer); - urb->transfer_buffer = NULL; - - list_del(&priv->list); - kmem_cache_free(stub_priv_cache, priv); - usb_free_urb(urb); -} - /* be in spin_lock_irqsave(&sdev->priv_lock, flags) */ void stub_enqueue_ret_unlink(struct stub_device *sdev, __u32 seqnum, __u32 status) @@ -99,6 +85,22 @@ void stub_complete(struct urb *urb) break; } + /* + * If the server breaks single SG request into the several URBs, the + * URBs must be reassembled before sending completed URB to the vhci. + * Don't wake up the tx thread until all the URBs are completed. + */ + if (priv->sgl) { + priv->completed_urbs++; + + /* Only save the first error status */ + if (urb->status && !priv->urb_status) + priv->urb_status = urb->status; + + if (priv->completed_urbs < priv->num_urbs) + return; + } + /* link a urb to the queue of tx. */ spin_lock_irqsave(&sdev->priv_lock, flags); if (sdev->ud.tcp_socket == NULL) { @@ -170,18 +172,22 @@ static int stub_send_ret_submit(struct stub_device *sdev) size_t total_size = 0; while ((priv = dequeue_from_priv_tx(sdev)) != NULL) { - int ret; - struct urb *urb = priv->urb; + struct urb *urb = priv->urbs[0]; struct usbip_header pdu_header; struct usbip_iso_packet_descriptor *iso_buffer = NULL; struct kvec *iov = NULL; + struct scatterlist *sg; + u32 actual_length = 0; int iovnum = 0; + int ret; + int i; txsize = 0; memset(&pdu_header, 0, sizeof(pdu_header)); memset(&msg, 0, sizeof(msg)); - if (urb->actual_length > 0 && !urb->transfer_buffer) { + if (urb->actual_length > 0 && !urb->transfer_buffer && + !urb->num_sgs) { dev_err(&sdev->udev->dev, "urb: actual_length %d transfer_buffer null\n", urb->actual_length); @@ -190,6 +196,11 @@ static int stub_send_ret_submit(struct stub_device *sdev) if (usb_pipetype(urb->pipe) == PIPE_ISOCHRONOUS) iovnum = 2 + urb->number_of_packets; + else if (usb_pipein(urb->pipe) && urb->actual_length > 0 && + urb->num_sgs) + iovnum = 1 + urb->num_sgs; + else if (usb_pipein(urb->pipe) && priv->sgl) + iovnum = 1 + priv->num_urbs; else iovnum = 2; @@ -206,6 +217,15 @@ static int stub_send_ret_submit(struct stub_device *sdev) setup_ret_submit_pdu(&pdu_header, urb); usbip_dbg_stub_tx("setup txdata seqnum: %d\n", pdu_header.base.seqnum); + + if (priv->sgl) { + for (i = 0; i < priv->num_urbs; i++) + actual_length += priv->urbs[i]->actual_length; + + pdu_header.u.ret_submit.status = priv->urb_status; + pdu_header.u.ret_submit.actual_length = actual_length; + } + usbip_header_correct_endian(&pdu_header, 1); iov[iovnum].iov_base = &pdu_header; @@ -214,12 +234,47 @@ static int stub_send_ret_submit(struct stub_device *sdev) txsize += sizeof(pdu_header); /* 2. 
setup transfer buffer */ - if (usb_pipein(urb->pipe) && + if (usb_pipein(urb->pipe) && priv->sgl) { + /* If the server split a single SG request into several + * URBs because the server's HCD doesn't support SG, + * reassemble the split URB buffers into a single + * return command. + */ + for (i = 0; i < priv->num_urbs; i++) { + iov[iovnum].iov_base = + priv->urbs[i]->transfer_buffer; + iov[iovnum].iov_len = + priv->urbs[i]->actual_length; + iovnum++; + } + txsize += actual_length; + } else if (usb_pipein(urb->pipe) && usb_pipetype(urb->pipe) != PIPE_ISOCHRONOUS && urb->actual_length > 0) { - iov[iovnum].iov_base = urb->transfer_buffer; - iov[iovnum].iov_len = urb->actual_length; - iovnum++; + if (urb->num_sgs) { + unsigned int copy = urb->actual_length; + int size; + + for_each_sg(urb->sg, sg, urb->num_sgs, i) { + if (copy == 0) + break; + + if (copy < sg->length) + size = copy; + else + size = sg->length; + + iov[iovnum].iov_base = sg_virt(sg); + iov[iovnum].iov_len = size; + + iovnum++; + copy -= size; + } + } else { + iov[iovnum].iov_base = urb->transfer_buffer; + iov[iovnum].iov_len = urb->actual_length; + iovnum++; + } txsize += urb->actual_length; } else if (usb_pipein(urb->pipe) && usb_pipetype(urb->pipe) == PIPE_ISOCHRONOUS) { diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c index 7f0d2213112111..da03451328cd14 100644 --- a/drivers/usb/usbip/usbip_common.c +++ b/drivers/usb/usbip/usbip_common.c @@ -695,8 +695,12 @@ EXPORT_SYMBOL_GPL(usbip_pad_iso); /* some members of urb must be substituted before. */ int usbip_recv_xbuff(struct usbip_device *ud, struct urb *urb) { - int ret; + struct scatterlist *sg; + int ret = 0; + int recv; int size; + int copy; + int i; if (ud->side == USBIP_STUB || ud->side == USBIP_VUDC) { /* the direction of urb must be OUT. 
*/ @@ -716,29 +720,48 @@ int usbip_recv_xbuff(struct usbip_device *ud, struct urb *urb) if (!(size > 0)) return 0; - if (size > urb->transfer_buffer_length) { + if (size > urb->transfer_buffer_length) /* should not happen, probably malicious packet */ - if (ud->side == USBIP_STUB) { - usbip_event_add(ud, SDEV_EVENT_ERROR_TCP); - return 0; - } else { - usbip_event_add(ud, VDEV_EVENT_ERROR_TCP); - return -EPIPE; - } - } + goto error; - ret = usbip_recv(ud->tcp_socket, urb->transfer_buffer, size); - if (ret != size) { - dev_err(&urb->dev->dev, "recv xbuf, %d\n", ret); - if (ud->side == USBIP_STUB || ud->side == USBIP_VUDC) { - usbip_event_add(ud, SDEV_EVENT_ERROR_TCP); - } else { - usbip_event_add(ud, VDEV_EVENT_ERROR_TCP); - return -EPIPE; + if (urb->num_sgs) { + copy = size; + for_each_sg(urb->sg, sg, urb->num_sgs, i) { + int recv_size; + + if (copy < sg->length) + recv_size = copy; + else + recv_size = sg->length; + + recv = usbip_recv(ud->tcp_socket, sg_virt(sg), + recv_size); + + if (recv != recv_size) + goto error; + + copy -= recv; + ret += recv; } + + if (ret != size) + goto error; + } else { + ret = usbip_recv(ud->tcp_socket, urb->transfer_buffer, size); + if (ret != size) + goto error; } return ret; + +error: + dev_err(&urb->dev->dev, "recv xbuf, %d\n", ret); + if (ud->side == USBIP_STUB || ud->side == USBIP_VUDC) + usbip_event_add(ud, SDEV_EVENT_ERROR_TCP); + else + usbip_event_add(ud, VDEV_EVENT_ERROR_TCP); + + return -EPIPE; } EXPORT_SYMBOL_GPL(usbip_recv_xbuff); diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c index 14b17ea81deb87..253e0affd3965d 100644 --- a/drivers/usb/usbip/vhci_hcd.c +++ b/drivers/usb/usbip/vhci_hcd.c @@ -716,7 +716,8 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag } vdev = &vhci_hcd->vdev[portnum-1]; - if (!urb->transfer_buffer && urb->transfer_buffer_length) { + if (!urb->transfer_buffer && !urb->num_sgs && + urb->transfer_buffer_length) { dev_dbg(dev, "Null URB transfer buffer\n"); return -EINVAL; } @@ -1162,6 +1163,15 @@ static int vhci_setup(struct usb_hcd *hcd) hcd->speed = HCD_USB3; hcd->self.root_hub->speed = USB_SPEED_SUPER; } + + /* + * Support SG. + * sg_tablesize is an arbitrary value to alleviate memory pressure + * on the host. 
+ */ + hcd->self.sg_tablesize = 32; + hcd->self.no_sg_constraint = 1; + return 0; } diff --git a/drivers/usb/usbip/vhci_rx.c b/drivers/usb/usbip/vhci_rx.c index 1343037d00f9a9..3f998b605f0391 100644 --- a/drivers/usb/usbip/vhci_rx.c +++ b/drivers/usb/usbip/vhci_rx.c @@ -104,6 +104,9 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev, if (usbip_dbg_flag_vhci_rx) usbip_dump_urb(urb); + if (urb->num_sgs) + urb->transfer_flags &= ~URB_DMA_MAP_SG; + usbip_dbg_vhci_rx("now giveback urb %u\n", pdu->base.seqnum); spin_lock_irqsave(&vhci->lock, flags); diff --git a/drivers/usb/usbip/vhci_tx.c b/drivers/usb/usbip/vhci_tx.c index a9a663a578b623..93c139d884f342 100644 --- a/drivers/usb/usbip/vhci_tx.c +++ b/drivers/usb/usbip/vhci_tx.c @@ -19,6 +19,7 @@ #include #include +#include #include "usbip_common.h" #include "vhci.h" @@ -64,19 +65,23 @@ static struct vhci_priv *dequeue_from_priv_tx(struct vhci_device *vdev) static int vhci_send_cmd_submit(struct vhci_device *vdev) { + struct usbip_iso_packet_descriptor *iso_buffer = NULL; struct vhci_priv *priv = NULL; + struct scatterlist *sg; struct msghdr msg; - struct kvec iov[3]; + struct kvec *iov; size_t txsize; size_t total_size = 0; + int iovnum; + int err = -ENOMEM; + int i; while ((priv = dequeue_from_priv_tx(vdev)) != NULL) { int ret; struct urb *urb = priv->urb; struct usbip_header pdu_header; - struct usbip_iso_packet_descriptor *iso_buffer = NULL; txsize = 0; memset(&pdu_header, 0, sizeof(pdu_header)); @@ -86,18 +91,45 @@ static int vhci_send_cmd_submit(struct vhci_device *vdev) usbip_dbg_vhci_tx("setup txdata urb seqnum %lu\n", priv->seqnum); + if (urb->num_sgs && usb_pipeout(urb->pipe)) + iovnum = 2 + urb->num_sgs; + else + iovnum = 3; + + iov = kcalloc(iovnum, sizeof(*iov), GFP_KERNEL); + if (!iov) { + usbip_event_add(&vdev->ud, SDEV_EVENT_ERROR_MALLOC); + return -ENOMEM; + } + + if (urb->num_sgs) + urb->transfer_flags |= URB_DMA_MAP_SG; + /* 1. setup usbip_header */ setup_cmd_submit_pdu(&pdu_header, urb); usbip_header_correct_endian(&pdu_header, 1); + iovnum = 0; - iov[0].iov_base = &pdu_header; - iov[0].iov_len = sizeof(pdu_header); + iov[iovnum].iov_base = &pdu_header; + iov[iovnum].iov_len = sizeof(pdu_header); txsize += sizeof(pdu_header); + iovnum++; /* 2. 
setup transfer buffer */
 		if (!usb_pipein(urb->pipe) && urb->transfer_buffer_length > 0) {
-			iov[1].iov_base = urb->transfer_buffer;
-			iov[1].iov_len = urb->transfer_buffer_length;
+			if (urb->num_sgs &&
+			    !usb_endpoint_xfer_isoc(&urb->ep->desc)) {
+				for_each_sg(urb->sg, sg, urb->num_sgs, i) {
+					iov[iovnum].iov_base = sg_virt(sg);
+					iov[iovnum].iov_len = sg->length;
+					iovnum++;
+				}
+			} else {
+				iov[iovnum].iov_base = urb->transfer_buffer;
+				iov[iovnum].iov_len =
+					urb->transfer_buffer_length;
+				iovnum++;
+			}
 			txsize += urb->transfer_buffer_length;
 		}

@@ -109,23 +141,26 @@ static int vhci_send_cmd_submit(struct vhci_device *vdev)
 			if (!iso_buffer) {
 				usbip_event_add(&vdev->ud, SDEV_EVENT_ERROR_MALLOC);
-				return -1;
+				goto err_iso_buffer;
 			}

-			iov[2].iov_base = iso_buffer;
-			iov[2].iov_len = len;
+			iov[iovnum].iov_base = iso_buffer;
+			iov[iovnum].iov_len = len;
+			iovnum++;
 			txsize += len;
 		}

-		ret = kernel_sendmsg(vdev->ud.tcp_socket, &msg, iov, 3, txsize);
+		ret = kernel_sendmsg(vdev->ud.tcp_socket, &msg, iov, iovnum,
+				     txsize);
 		if (ret != txsize) {
 			pr_err("sendmsg failed!, ret=%d for %zd\n", ret, txsize);
-			kfree(iso_buffer);
 			usbip_event_add(&vdev->ud, VDEV_EVENT_ERROR_TCP);
-			return -1;
+			err = -EPIPE;
+			goto err_tx;
 		}

+		kfree(iov);
 		kfree(iso_buffer);
 		usbip_dbg_vhci_tx("send txdata\n");

@@ -133,6 +168,13 @@ static int vhci_send_cmd_submit(struct vhci_device *vdev)
 	}

 	return total_size;
+
+err_tx:
+	kfree(iso_buffer);
+err_iso_buffer:
+	kfree(iov);
+
+	return err;
 }

 static struct vhci_unlink *dequeue_from_unlink_tx(struct vhci_device *vdev)

From 2db8e5be929ea01266e24c757a0a81d2768c5d20 Mon Sep 17 00:00:00 2001
From: Vidya Sagar
Date: Thu, 4 Jul 2019 20:34:28 +0530
Subject: [PATCH 067/155] PCI: tegra: Enable Relaxed Ordering only for Tegra20 & Tegra30

commit 7be142caabc4780b13a522c485abc806de5c4114 upstream.

The PCI Tegra controller conversion to a device tree configurable driver in commit d1523b52bff3 ("PCI: tegra: Move PCIe driver to drivers/pci/host") implied that code for the driver can be compiled in for a kernel supporting multiple platforms. Unfortunately, a blind move of the code did not check that some of the quirks that were applied in arch/arm (eg enabling Relaxed Ordering on all PCI devices, since the quirk hook erroneously matches PCI_ANY_ID for both Vendor-ID and Device-ID) are now applied in all kernels that compile the PCI Tegra controller driver, DT and ACPI alike.

This is completely wrong, in that enablement of Relaxed Ordering is only required by default on Tegra20 platforms, as described in the Tegra20 Technical Reference Manual (available at https://developer.nvidia.com/embedded/downloads#?search=tegra%202 in Section 34.1, where it is mentioned that the Relaxed Ordering bit needs to be enabled in its root ports to avoid a deadlock in hardware), and on Tegra30 platforms for the same reasons (unfortunately not documented in the TRM). There is no other strict requirement to enable Relaxed Ordering on PCI devices on any other Tegra platform or PCI host bridge driver.

Fix this quite upsetting situation by limiting the vendor and device IDs to which the Relaxed Ordering quirk applies to the root ports in question, reported above.
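For context on the matching rule being corrected: a fixup declared with PCI_ANY_ID for both fields is invoked for every PCI device the kernel enumerates, not just Tegra root ports. A sketch of the two declarations side by side (the function name is illustrative; the quirk body is the single capability write the driver actually uses):

	static void example_relax_enable(struct pci_dev *dev)
	{
		/* Set the Relaxed Ordering enable bit in Device Control. */
		pcie_capability_set_word(dev, PCI_EXP_DEVCTL,
					 PCI_EXP_DEVCTL_RELAX_EN);
	}

	/* Before: matches every vendor and device, runs on all PCI devices. */
	DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, example_relax_enable);

	/* After: matches one specific root port, e.g. 0x0bf0 on Tegra20. */
	DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, 0x0bf0, example_relax_enable);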
Signed-off-by: Vidya Sagar [lorenzo.pieralisi@arm.com: completely rewrote the commit log/fixes tag] Signed-off-by: Lorenzo Pieralisi Acked-by: Thierry Reding Signed-off-by: Greg Kroah-Hartman --- drivers/pci/host/pci-tegra.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/pci/host/pci-tegra.c b/drivers/pci/host/pci-tegra.c index 8efd086c57c96e..5bf874f30466bd 100644 --- a/drivers/pci/host/pci-tegra.c +++ b/drivers/pci/host/pci-tegra.c @@ -607,12 +607,15 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0bf1, tegra_pcie_fixup_class); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0e1c, tegra_pcie_fixup_class); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0e1d, tegra_pcie_fixup_class); -/* Tegra PCIE requires relaxed ordering */ +/* Tegra20 and Tegra30 PCIE requires relaxed ordering */ static void tegra_pcie_relax_enable(struct pci_dev *dev) { pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_RELAX_EN); } -DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, tegra_pcie_relax_enable); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, 0x0bf0, tegra_pcie_relax_enable); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, 0x0bf1, tegra_pcie_relax_enable); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, 0x0e1c, tegra_pcie_relax_enable); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, 0x0e1d, tegra_pcie_relax_enable); static int tegra_pcie_request_resources(struct tegra_pcie *pcie) { From 2406e8896e3416c5185d10d804d546e28ecf4773 Mon Sep 17 00:00:00 2001 From: Radhey Shyam Pandey Date: Thu, 26 Sep 2019 16:20:58 +0530 Subject: [PATCH 068/155] dmaengine: xilinx_dma: Fix control reg update in vdma_channel_set_config [ Upstream commit 6c6de1ddb1be3840f2ed5cc9d009a622720940c9 ] In vdma_channel_set_config clear the delay, frame count and master mask before updating their new values. It avoids programming incorrect state when input parameters are different from default. 
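The underlying discipline is the usual read-modify-write rule for multi-bit register fields: clear the field's mask before OR-ing in the new value, otherwise bits left over from a previous configuration survive the update. A generic sketch, with made-up register and field names (the driver itself uses explicit shift constants rather than FIELD_PREP()):

	#include <linux/bitfield.h>
	#include <linux/io.h>

	#define EXAMPLE_DELAY_MASK	GENMASK(31, 24)	/* illustrative field */

	static void example_set_delay(void __iomem *reg, u32 delay)
	{
		u32 val = readl(reg);

		val &= ~EXAMPLE_DELAY_MASK;	/* drop the stale value first */
		val |= FIELD_PREP(EXAMPLE_DELAY_MASK, delay);
		writel(val, reg);
	}

Without the clearing step, a previously programmed delay of 0xFF OR-ed with a new delay of 0x01 would leave 0xFF in the field, which is exactly the incorrect state the patch describes.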
Signed-off-by: Radhey Shyam Pandey Acked-by: Appana Durga Kedareswara rao Signed-off-by: Michal Simek Link: https://lore.kernel.org/r/1569495060-18117-3-git-send-email-radhey.shyam.pandey@xilinx.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin --- drivers/dma/xilinx/xilinx_dma.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c index 8722bcba489db8..2db352308e5c0a 100644 --- a/drivers/dma/xilinx/xilinx_dma.c +++ b/drivers/dma/xilinx/xilinx_dma.c @@ -72,6 +72,9 @@ #define XILINX_DMA_DMACR_CIRC_EN BIT(1) #define XILINX_DMA_DMACR_RUNSTOP BIT(0) #define XILINX_DMA_DMACR_FSYNCSRC_MASK GENMASK(6, 5) +#define XILINX_DMA_DMACR_DELAY_MASK GENMASK(31, 24) +#define XILINX_DMA_DMACR_FRAME_COUNT_MASK GENMASK(23, 16) +#define XILINX_DMA_DMACR_MASTER_MASK GENMASK(11, 8) #define XILINX_DMA_REG_DMASR 0x0004 #define XILINX_DMA_DMASR_EOL_LATE_ERR BIT(15) @@ -2057,8 +2060,10 @@ int xilinx_vdma_channel_set_config(struct dma_chan *dchan, chan->config.gen_lock = cfg->gen_lock; chan->config.master = cfg->master; + dmacr &= ~XILINX_DMA_DMACR_GENLOCK_EN; if (cfg->gen_lock && chan->genlock) { dmacr |= XILINX_DMA_DMACR_GENLOCK_EN; + dmacr &= ~XILINX_DMA_DMACR_MASTER_MASK; dmacr |= cfg->master << XILINX_DMA_DMACR_MASTER_SHIFT; } @@ -2072,11 +2077,13 @@ int xilinx_vdma_channel_set_config(struct dma_chan *dchan, chan->config.delay = cfg->delay; if (cfg->coalesc <= XILINX_DMA_DMACR_FRAME_COUNT_MAX) { + dmacr &= ~XILINX_DMA_DMACR_FRAME_COUNT_MASK; dmacr |= cfg->coalesc << XILINX_DMA_DMACR_FRAME_COUNT_SHIFT; chan->config.coalesc = cfg->coalesc; } if (cfg->delay <= XILINX_DMA_DMACR_DELAY_MAX) { + dmacr &= ~XILINX_DMA_DMACR_DELAY_MASK; dmacr |= cfg->delay << XILINX_DMA_DMACR_DELAY_SHIFT; chan->config.delay = cfg->delay; } From 54b9b5a469a5fbd16eeb68222b86eaa43dacffd0 Mon Sep 17 00:00:00 2001 From: Zhang Lixu Date: Wed, 16 Oct 2019 08:15:59 +0800 Subject: [PATCH 069/155] HID: intel-ish-hid: fix wrong error handling in ishtp_cl_alloc_tx_ring() [ Upstream commit 16ff7bf6dbcc6f77d2eec1ac9120edf44213c2f1 ] When allocating tx ring buffers failed, should free tx buffers, not rx buffers. Signed-off-by: Zhang Lixu Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina Signed-off-by: Sasha Levin --- drivers/hid/intel-ish-hid/ishtp/client-buffers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/intel-ish-hid/ishtp/client-buffers.c b/drivers/hid/intel-ish-hid/ishtp/client-buffers.c index b9b917d2d50db3..c41dbb167c91ba 100644 --- a/drivers/hid/intel-ish-hid/ishtp/client-buffers.c +++ b/drivers/hid/intel-ish-hid/ishtp/client-buffers.c @@ -90,7 +90,7 @@ int ishtp_cl_alloc_tx_ring(struct ishtp_cl *cl) return 0; out: dev_err(&cl->device->dev, "error in allocating Tx pool\n"); - ishtp_cl_free_rx_ring(cl); + ishtp_cl_free_tx_ring(cl); return -ENOMEM; } From 9f8d038bb276418cd128b7ebe0f2c8690916c446 Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Tue, 8 Oct 2019 00:07:30 +0300 Subject: [PATCH 070/155] RDMA/qedr: Fix reported firmware version MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit b806c94ee44e53233b8ce6c92d9078d9781786a5 ] Remove spaces from the reported firmware version string. Actual value: $ cat /sys/class/infiniband/qedr0/fw_ver 8. 37. 7. 
0

Expected value:
$ cat /sys/class/infiniband/qedr0/fw_ver
8.37.7.0

Fixes: ec72fce401c6 ("qedr: Add support for RoCE HW init")
Signed-off-by: Kamal Heib
Acked-by: Michal Kalderon
Link: https://lore.kernel.org/r/20191007210730.7173-1-kamalheib1@gmail.com
Signed-off-by: Doug Ledford
Signed-off-by: Sasha Levin
---
 drivers/infiniband/hw/qedr/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index ddb05b42e5e6ad..3e48ed64760b72 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -73,7 +73,7 @@ static void qedr_get_dev_fw_str(struct ib_device *ibdev, char *str)
 	struct qedr_dev *qedr = get_qedr_dev(ibdev);
 	u32 fw_ver = (u32)qedr->attr.fw_ver;

-	snprintf(str, IB_FW_VERSION_NAME_MAX, "%d. %d. %d. %d",
+	snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d.%d",
		 (fw_ver >> 24) & 0xFF, (fw_ver >> 16) & 0xFF,
		 (fw_ver >> 8) & 0xFF, fw_ver & 0xFF);
 }

From d905f0cea5e0f29b8aced5a38b1ce243051887de Mon Sep 17 00:00:00 2001
From: Navid Emamdoost
Date: Tue, 24 Sep 2019 22:20:34 -0500
Subject: [PATCH 071/155] net/mlx5: prevent memory leak in mlx5_fpga_conn_create_cq

[ Upstream commit c8c2a057fdc7de1cd16f4baa51425b932a42eb39 ]

In mlx5_fpga_conn_create_cq, if mlx5_vector2eqn() fails, the allocated memory should be released.

Fixes: 537a50574175 ("net/mlx5: FPGA, Add high-speed connection routines")
Signed-off-by: Navid Emamdoost
Signed-off-by: Saeed Mahameed
Signed-off-by: Sasha Levin
---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
index c4392f741c5fe1..5212428031a43d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
@@ -462,8 +462,10 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size)
 	}

 	err = mlx5_vector2eqn(mdev, smp_processor_id(), &eqn, &irqn);
-	if (err)
+	if (err) {
+		kvfree(in);
 		goto err_cqwq;
+	}

 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(cq_size));

From ec663d07c0d7b40b04aacf348fc648ca36c89994 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke
Date: Fri, 18 Oct 2019 16:04:58 +0200
Subject: [PATCH 072/155] scsi: qla2xxx: fixup incorrect usage of host_byte

[ Upstream commit 66cf50e65b183c863825f5c28a818e3f47a72e40 ]

DRIVER_ERROR is a driver byte setting, not a host byte. The qla2xxx driver should rather return DID_ERROR here to be in line with the other drivers.

Link: https://lore.kernel.org/r/20191018140458.108278-1-hare@suse.de
Signed-off-by: Hannes Reinecke
Acked-by: Himanshu Madhani
Signed-off-by: Martin K. Petersen
Signed-off-by: Sasha Levin
---
 drivers/scsi/qla2xxx/qla_bsg.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/qla2xxx/qla_bsg.c b/drivers/scsi/qla2xxx/qla_bsg.c
index 2ea0ef93f5cbb9..7472d3882ad41c 100644
--- a/drivers/scsi/qla2xxx/qla_bsg.c
+++ b/drivers/scsi/qla2xxx/qla_bsg.c
@@ -258,7 +258,7 @@ qla2x00_process_els(struct bsg_job *bsg_job)
 	srb_t *sp;
 	const char *type;
 	int req_sg_cnt, rsp_sg_cnt;
-	int rval = (DRIVER_ERROR << 16);
+	int rval = (DID_ERROR << 16);
 	uint16_t nextlid = 0;

 	if (bsg_request->msgcode == FC_BSG_RPT_ELS) {
@@ -433,7 +433,7 @@ qla2x00_process_ct(struct bsg_job *bsg_job)
 	struct Scsi_Host *host = fc_bsg_to_shost(bsg_job);
 	scsi_qla_host_t *vha = shost_priv(host);
 	struct qla_hw_data *ha = vha->hw;
-	int rval = (DRIVER_ERROR << 16);
+	int rval = (DID_ERROR << 16);
 	int req_sg_cnt, rsp_sg_cnt;
 	uint16_t loop_id;
 	struct fc_port *fcport;
@@ -1951,7 +1951,7 @@ qlafx00_mgmt_cmd(struct bsg_job *bsg_job)
 	struct Scsi_Host *host = fc_bsg_to_shost(bsg_job);
 	scsi_qla_host_t *vha = shost_priv(host);
 	struct qla_hw_data *ha = vha->hw;
-	int rval = (DRIVER_ERROR << 16);
+	int rval = (DID_ERROR << 16);
 	struct qla_mt_iocb_rqst_fx00 *piocb_rqst;
 	srb_t *sp;
 	int req_sg_cnt = 0, rsp_sg_cnt = 0;

From 8466db95431f39fd8eec0ce057eab5e4711307de Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Fri, 11 Oct 2019 16:34:19 +0300
Subject: [PATCH 073/155] RDMA/uverbs: Prevent potential underflow

[ Upstream commit a9018adfde809d44e71189b984fa61cc89682b5e ]

The issue is in drivers/infiniband/core/uverbs_std_types_cq.c in the UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE) function. We check that:

	if (attr.comp_vector >= attrs->ufile->device->num_comp_vectors) {

But we don't check if "attr.comp_vector" is negative. It could potentially lead to an array underflow. My concern would be where cq->vector is used in the create_cq() function from the cxgb4 driver. And really "attr.comp_vector" appears as a u32 to user space, so that's the right type to use.

Fixes: 9ee79fce3642 ("IB/core: Add completion queue (cq) object actions")
Link: https://lore.kernel.org/r/20191011133419.GA22905@mwanda
Signed-off-by: Dan Carpenter
Reviewed-by: Jason Gunthorpe
Signed-off-by: Jason Gunthorpe
Signed-off-by: Sasha Levin
---
 drivers/infiniband/core/uverbs.h | 2 +-
 include/rdma/ib_verbs.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 37c8903e7fd0c0..8d79a48ccd3885 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -87,7 +87,7 @@ struct ib_uverbs_device {
 	atomic_t refcount;
-	int num_comp_vectors;
+	u32 num_comp_vectors;
 	struct completion comp;
 	struct device *dev;
 	struct ib_device __rcu *ib_dev;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index b8a5118b6a4285..4a431933198945 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -306,7 +306,7 @@ enum ib_cq_creation_flags {
 struct ib_cq_init_attr {
 	unsigned int cqe;
-	int comp_vector;
+	u32 comp_vector;
 	u32 flags;
 };

From 107e5b0b9ed11d99e409e4a3e120237710c39e95 Mon Sep 17 00:00:00 2001
From: Hillf Danton
Date: Mon, 21 Oct 2019 12:01:57 +0200
Subject: [PATCH 074/155] net: openvswitch: free vport unless register_netdevice() succeeds

[ Upstream commit 9464cc37f3671ee69cb1c00662b5e1f113a96b23 ]

syzbot found the following crash on:

HEAD commit: 1e78030e Merge tag 'mmc-v5.3-rc1' of git://git.kernel.org/..
git tree: upstream console output: https://syzkaller.appspot.com/x/log.txt?x=148d3d1a600000 kernel config: https://syzkaller.appspot.com/x/.config?x=30cef20daf3e9977 dashboard link: https://syzkaller.appspot.com/bug?extid=13210896153522fe1ee5 compiler: gcc (GCC) 9.0.0 20181231 (experimental) syz repro: https://syzkaller.appspot.com/x/repro.syz?x=136aa8c4600000 C reproducer: https://syzkaller.appspot.com/x/repro.c?x=109ba792600000 ===================================================================== BUG: memory leak unreferenced object 0xffff8881207e4100 (size 128): comm "syz-executor032", pid 7014, jiffies 4294944027 (age 13.830s) hex dump (first 32 bytes): 00 70 16 18 81 88 ff ff 80 af 8c 22 81 88 ff ff .p.........".... 00 b6 23 17 81 88 ff ff 00 00 00 00 00 00 00 00 ..#............. backtrace: [<000000000eb78212>] kmemleak_alloc_recursive include/linux/kmemleak.h:43 [inline] [<000000000eb78212>] slab_post_alloc_hook mm/slab.h:522 [inline] [<000000000eb78212>] slab_alloc mm/slab.c:3319 [inline] [<000000000eb78212>] kmem_cache_alloc_trace+0x145/0x2c0 mm/slab.c:3548 [<00000000006ea6c6>] kmalloc include/linux/slab.h:552 [inline] [<00000000006ea6c6>] kzalloc include/linux/slab.h:748 [inline] [<00000000006ea6c6>] ovs_vport_alloc+0x37/0xf0 net/openvswitch/vport.c:130 [<00000000f9a04a7d>] internal_dev_create+0x24/0x1d0 net/openvswitch/vport-internal_dev.c:164 [<0000000056ee7c13>] ovs_vport_add+0x81/0x190 net/openvswitch/vport.c:199 [<000000005434efc7>] new_vport+0x19/0x80 net/openvswitch/datapath.c:194 [<00000000b7b253f1>] ovs_dp_cmd_new+0x22f/0x410 net/openvswitch/datapath.c:1614 [<00000000e0988518>] genl_family_rcv_msg+0x2ab/0x5b0 net/netlink/genetlink.c:629 [<00000000d0cc9347>] genl_rcv_msg+0x54/0x9c net/netlink/genetlink.c:654 [<000000006694b647>] netlink_rcv_skb+0x61/0x170 net/netlink/af_netlink.c:2477 [<0000000088381f37>] genl_rcv+0x29/0x40 net/netlink/genetlink.c:665 [<00000000dad42a47>] netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] [<00000000dad42a47>] netlink_unicast+0x1ec/0x2d0 net/netlink/af_netlink.c:1328 [<0000000067e6b079>] netlink_sendmsg+0x270/0x480 net/netlink/af_netlink.c:1917 [<00000000aab08a47>] sock_sendmsg_nosec net/socket.c:637 [inline] [<00000000aab08a47>] sock_sendmsg+0x54/0x70 net/socket.c:657 [<000000004cb7c11d>] ___sys_sendmsg+0x393/0x3c0 net/socket.c:2311 [<00000000c4901c63>] __sys_sendmsg+0x80/0xf0 net/socket.c:2356 [<00000000c10abb2d>] __do_sys_sendmsg net/socket.c:2365 [inline] [<00000000c10abb2d>] __se_sys_sendmsg net/socket.c:2363 [inline] [<00000000c10abb2d>] __x64_sys_sendmsg+0x23/0x30 net/socket.c:2363 BUG: memory leak unreferenced object 0xffff88811723b600 (size 64): comm "syz-executor032", pid 7014, jiffies 4294944027 (age 13.830s) hex dump (first 32 bytes): 01 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 02 00 00 00 05 35 82 c1 .............5.. 
backtrace: [<00000000352f46d8>] kmemleak_alloc_recursive include/linux/kmemleak.h:43 [inline] [<00000000352f46d8>] slab_post_alloc_hook mm/slab.h:522 [inline] [<00000000352f46d8>] slab_alloc mm/slab.c:3319 [inline] [<00000000352f46d8>] __do_kmalloc mm/slab.c:3653 [inline] [<00000000352f46d8>] __kmalloc+0x169/0x300 mm/slab.c:3664 [<000000008e48f3d1>] kmalloc include/linux/slab.h:557 [inline] [<000000008e48f3d1>] ovs_vport_set_upcall_portids+0x54/0xd0 net/openvswitch/vport.c:343 [<00000000541e4f4a>] ovs_vport_alloc+0x7f/0xf0 net/openvswitch/vport.c:139 [<00000000f9a04a7d>] internal_dev_create+0x24/0x1d0 net/openvswitch/vport-internal_dev.c:164 [<0000000056ee7c13>] ovs_vport_add+0x81/0x190 net/openvswitch/vport.c:199 [<000000005434efc7>] new_vport+0x19/0x80 net/openvswitch/datapath.c:194 [<00000000b7b253f1>] ovs_dp_cmd_new+0x22f/0x410 net/openvswitch/datapath.c:1614 [<00000000e0988518>] genl_family_rcv_msg+0x2ab/0x5b0 net/netlink/genetlink.c:629 [<00000000d0cc9347>] genl_rcv_msg+0x54/0x9c net/netlink/genetlink.c:654 [<000000006694b647>] netlink_rcv_skb+0x61/0x170 net/netlink/af_netlink.c:2477 [<0000000088381f37>] genl_rcv+0x29/0x40 net/netlink/genetlink.c:665 [<00000000dad42a47>] netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] [<00000000dad42a47>] netlink_unicast+0x1ec/0x2d0 net/netlink/af_netlink.c:1328 [<0000000067e6b079>] netlink_sendmsg+0x270/0x480 net/netlink/af_netlink.c:1917 [<00000000aab08a47>] sock_sendmsg_nosec net/socket.c:637 [inline] [<00000000aab08a47>] sock_sendmsg+0x54/0x70 net/socket.c:657 [<000000004cb7c11d>] ___sys_sendmsg+0x393/0x3c0 net/socket.c:2311 [<00000000c4901c63>] __sys_sendmsg+0x80/0xf0 net/socket.c:2356 BUG: memory leak unreferenced object 0xffff8881228ca500 (size 128): comm "syz-executor032", pid 7015, jiffies 4294944622 (age 7.880s) hex dump (first 32 bytes): 00 f0 27 18 81 88 ff ff 80 ac 8c 22 81 88 ff ff ..'........".... 40 b7 23 17 81 88 ff ff 00 00 00 00 00 00 00 00 @.#............. 
backtrace:
  [<000000000eb78212>] kmemleak_alloc_recursive include/linux/kmemleak.h:43 [inline]
  [<000000000eb78212>] slab_post_alloc_hook mm/slab.h:522 [inline]
  [<000000000eb78212>] slab_alloc mm/slab.c:3319 [inline]
  [<000000000eb78212>] kmem_cache_alloc_trace+0x145/0x2c0 mm/slab.c:3548
  [<00000000006ea6c6>] kmalloc include/linux/slab.h:552 [inline]
  [<00000000006ea6c6>] kzalloc include/linux/slab.h:748 [inline]
  [<00000000006ea6c6>] ovs_vport_alloc+0x37/0xf0 net/openvswitch/vport.c:130
  [<00000000f9a04a7d>] internal_dev_create+0x24/0x1d0 net/openvswitch/vport-internal_dev.c:164
  [<0000000056ee7c13>] ovs_vport_add+0x81/0x190 net/openvswitch/vport.c:199
  [<000000005434efc7>] new_vport+0x19/0x80 net/openvswitch/datapath.c:194
  [<00000000b7b253f1>] ovs_dp_cmd_new+0x22f/0x410 net/openvswitch/datapath.c:1614
  [<00000000e0988518>] genl_family_rcv_msg+0x2ab/0x5b0 net/netlink/genetlink.c:629
  [<00000000d0cc9347>] genl_rcv_msg+0x54/0x9c net/netlink/genetlink.c:654
  [<000000006694b647>] netlink_rcv_skb+0x61/0x170 net/netlink/af_netlink.c:2477
  [<0000000088381f37>] genl_rcv+0x29/0x40 net/netlink/genetlink.c:665
  [<00000000dad42a47>] netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline]
  [<00000000dad42a47>] netlink_unicast+0x1ec/0x2d0 net/netlink/af_netlink.c:1328
  [<0000000067e6b079>] netlink_sendmsg+0x270/0x480 net/netlink/af_netlink.c:1917
  [<00000000aab08a47>] sock_sendmsg_nosec net/socket.c:637 [inline]
  [<00000000aab08a47>] sock_sendmsg+0x54/0x70 net/socket.c:657
  [<000000004cb7c11d>] ___sys_sendmsg+0x393/0x3c0 net/socket.c:2311
  [<00000000c4901c63>] __sys_sendmsg+0x80/0xf0 net/socket.c:2356
  [<00000000c10abb2d>] __do_sys_sendmsg net/socket.c:2365 [inline]
  [<00000000c10abb2d>] __se_sys_sendmsg net/socket.c:2363 [inline]
  [<00000000c10abb2d>] __x64_sys_sendmsg+0x23/0x30 net/socket.c:2363
=====================================================================

register_netdevice() in the net core may fail with the vport's destruction callback either invoked or not. After commit 309b66970ee2 ("net: openvswitch: do not free vport if register_netdevice() is failed."), the duty to destroy the vport was, on the other hand, offloaded from the driver, which ends up in the memory leak reported above.

Fix it by releasing the vport unless the device is registered successfully. To do that, the destructor callback assignment is deferred until the device has been registered.
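Stated as a rule: until register_netdevice() returns success, the caller still owns its private object and must free it on every error path; only after a successful registration should the destructor be handed over, so that the unregister path performs the cleanup. A sketch of that shape (the example_* names are stand-ins, not the actual openvswitch symbols; register_netdevice() must be called under rtnl_lock as usual):

	static void example_destructor(struct net_device *dev);	/* stand-in */

	static int example_register(struct net_device *dev)
	{
		int err = register_netdevice(dev);

		if (err)
			return err;	/* we still own the vport: caller frees it */

		/* Success: from now on the netdev core runs the destructor. */
		dev->priv_destructor = example_destructor;
		return 0;
	}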
Reported-by: syzbot+13210896153522fe1ee5@syzkaller.appspotmail.com Fixes: 309b66970ee2 ("net: openvswitch: do not free vport if register_netdevice() is failed.") Cc: Taehee Yoo Cc: Greg Rose Cc: Eric Dumazet Cc: Marcelo Ricardo Leitner Cc: Ying Xue Cc: Andrey Konovalov Signed-off-by: Hillf Danton Acked-by: Pravin B Shelar [sbrivio: this was sent to dev@openvswitch.org and never made its way to netdev -- resending original patch] Signed-off-by: Stefano Brivio Reviewed-by: Greg Rose Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/openvswitch/vport-internal_dev.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index b9377afeaba452..1c09ad457d2a94 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -156,7 +156,7 @@ static void do_setup(struct net_device *netdev) netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH | IFF_PHONY_HEADROOM | IFF_NO_QUEUE; netdev->needs_free_netdev = true; - netdev->priv_destructor = internal_dev_destructor; + netdev->priv_destructor = NULL; netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->rtnl_link_ops = &internal_dev_link_ops; @@ -178,7 +178,6 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) struct internal_dev *internal_dev; struct net_device *dev; int err; - bool free_vport = true; vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms); if (IS_ERR(vport)) { @@ -210,10 +209,9 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) rtnl_lock(); err = register_netdevice(vport->dev); - if (err) { - free_vport = false; + if (err) goto error_unlock; - } + vport->dev->priv_destructor = internal_dev_destructor; dev_set_promiscuity(vport->dev, 1); rtnl_unlock(); @@ -227,8 +225,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) error_free_netdev: free_netdev(dev); error_free_vport: - if (free_vport) - ovs_vport_free(vport); + ovs_vport_free(vport); error: return ERR_PTR(err); } From b23937d01b034acb1001a2091639de65699c0da1 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Tue, 22 Oct 2019 09:21:12 +0200 Subject: [PATCH 075/155] scsi: lpfc: Honor module parameter lpfc_use_adisc [ Upstream commit 0fd103ccfe6a06e40e2d9d8c91d96332cc9e1239 ] The initial lpfc_desc_set_adisc implementation in commit dea3101e0a5c ("lpfc: add Emulex FC driver version 8.0.28") enabled ADISC if cfg_use_adisc && RSCN_MODE && FCP_2_DEVICE In commit 92d7f7b0cde3 ("[SCSI] lpfc: NPIV: add NPIV support on top of SLI-3") this changed to (cfg_use_adisc && RSC_MODE) || FCP_2_DEVICE and later in commit ffc954936b13 ("[SCSI] lpfc 8.3.13: FC Discovery Fixes and enhancements.") to (cfg_use_adisc && RSC_MODE) || (FCP_2_DEVICE && FCP_TARGET) A customer reports that after a devloss, an ADISC failure is logged. It turns out the ADISC flag is set even the user explicitly set lpfc_use_adisc = 0. [Sat Dec 22 22:55:58 2018] lpfc 0000:82:00.0: 2:(0):0203 Devloss timeout on WWPN 50:01:43:80:12:8e:40:20 NPort x05df00 Data: x82000000 x8 xa [Sat Dec 22 23:08:20 2018] lpfc 0000:82:00.0: 2:(0):2755 ADISC failure DID:05DF00 Status:x9/x70000 [mkp: fixed Hannes' email] Fixes: 92d7f7b0cde3 ("[SCSI] lpfc: NPIV: add NPIV support on top of SLI-3") Cc: Dick Kennedy Cc: James Smart Link: https://lore.kernel.org/r/20191022072112.132268-1-dwagner@suse.de Reviewed-by: Hannes Reinecke Reviewed-by: James Smart Signed-off-by: Daniel Wagner Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin --- drivers/scsi/lpfc/lpfc_nportdisc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_nportdisc.c b/drivers/scsi/lpfc/lpfc_nportdisc.c index 36fb549eb4e86f..a0658d15822872 100644 --- a/drivers/scsi/lpfc/lpfc_nportdisc.c +++ b/drivers/scsi/lpfc/lpfc_nportdisc.c @@ -809,9 +809,9 @@ lpfc_disc_set_adisc(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) if (!(vport->fc_flag & FC_PT2PT)) { /* Check config parameter use-adisc or FCP-2 */ - if ((vport->cfg_use_adisc && (vport->fc_flag & FC_RSCN_MODE)) || + if (vport->cfg_use_adisc && ((vport->fc_flag & FC_RSCN_MODE) || ((ndlp->nlp_fcp_info & NLP_FCP_2_DEVICE) && - (ndlp->nlp_type & NLP_FCP_TARGET))) { + (ndlp->nlp_type & NLP_FCP_TARGET)))) { spin_lock_irq(shost->host_lock); ndlp->nlp_flag |= NLP_NPR_ADISC; spin_unlock_irq(shost->host_lock); From a599c315fc6118e69c98686b695eec252a62ef65 Mon Sep 17 00:00:00 2001 From: Himanshu Madhani Date: Tue, 22 Oct 2019 12:36:42 -0700 Subject: [PATCH 076/155] scsi: qla2xxx: Initialized mailbox to prevent driver load failure [ Upstream commit c2ff2a36eff60efb5e123c940115216d6bf65684 ] This patch fixes issue with Gen7 adapter in a blade environment where one of the ports will not be detected by driver. Firmware expects mailbox 11 to be set or cleared by driver for newer ISP. Following message is seen in the log file: [ 18.810892] qla2xxx [0000:d8:00.0]-1820:1: **** Failed=102 mb[0]=4005 mb[1]=37 mb[2]=20 mb[3]=8 [ 18.819596] cmd=2 **** [mkp: typos] Link: https://lore.kernel.org/r/20191022193643.7076-2-hmadhani@marvell.com Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/qla2xxx/qla_mbx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c index 929ec087b8eb30..459481ce5872d9 100644 --- a/drivers/scsi/qla2xxx/qla_mbx.c +++ b/drivers/scsi/qla2xxx/qla_mbx.c @@ -624,6 +624,7 @@ qla2x00_execute_fw(scsi_qla_host_t *vha, uint32_t risc_addr) mcp->mb[2] = LSW(risc_addr); mcp->mb[3] = 0; mcp->mb[4] = 0; + mcp->mb[11] = 0; ha->flags.using_lr_setting = 0; if (IS_QLA25XX(ha) || IS_QLA81XX(ha) || IS_QLA83XX(ha) || IS_QLA27XX(ha)) { @@ -667,7 +668,7 @@ qla2x00_execute_fw(scsi_qla_host_t *vha, uint32_t risc_addr) if (ha->flags.exchoffld_enabled) mcp->mb[4] |= ENABLE_EXCHANGE_OFFLD; - mcp->out_mb |= MBX_4|MBX_3|MBX_2|MBX_1; + mcp->out_mb |= MBX_4 | MBX_3 | MBX_2 | MBX_1 | MBX_11; mcp->in_mb |= MBX_3 | MBX_2 | MBX_1; } else { mcp->mb[1] = LSW(risc_addr); From 78ec9c409e932b0a1a198cd8e20a3a66e346d72d Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Sat, 19 Oct 2019 17:34:35 +0200 Subject: [PATCH 077/155] ipvs: don't ignore errors in case refcounting ip_vs module fails [ Upstream commit 62931f59ce9cbabb934a431f48f2f1f441c605ac ] if the IPVS module is removed while the sync daemon is starting, there is a small gap where try_module_get() might fail getting the refcount inside ip_vs_use_count_inc(). 
Then, the refcounts of IPVS module are unbalanced, and the subsequent call to stop_sync_thread() causes the following splat: WARNING: CPU: 0 PID: 4013 at kernel/module.c:1146 module_put.part.44+0x15b/0x290 Modules linked in: ip_vs(-) nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 veth ip6table_filter ip6_tables iptable_filter binfmt_misc intel_rapl_msr intel_rapl_common crct10dif_pclmul crc32_pclmul ext4 mbcache jbd2 ghash_clmulni_intel snd_hda_codec_generic ledtrig_audio snd_hda_intel snd_intel_nhlt snd_hda_codec snd_hda_core snd_hwdep snd_seq snd_seq_device snd_pcm aesni_intel crypto_simd cryptd glue_helper joydev pcspkr snd_timer virtio_balloon snd soundcore i2c_piix4 nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c ata_generic pata_acpi virtio_net net_failover virtio_blk failover virtio_console qxl drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ata_piix ttm crc32c_intel serio_raw drm virtio_pci libata virtio_ring virtio floppy dm_mirror dm_region_hash dm_log dm_mod [last unloaded: nf_defrag_ipv6] CPU: 0 PID: 4013 Comm: modprobe Tainted: G W 5.4.0-rc1.upstream+ #741 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:module_put.part.44+0x15b/0x290 Code: 04 25 28 00 00 00 0f 85 18 01 00 00 48 83 c4 68 5b 5d 41 5c 41 5d 41 5e 41 5f c3 89 44 24 28 83 e8 01 89 c5 0f 89 57 ff ff ff <0f> 0b e9 78 ff ff ff 65 8b 1d 67 83 26 4a 89 db be 08 00 00 00 48 RSP: 0018:ffff888050607c78 EFLAGS: 00010297 RAX: 0000000000000003 RBX: ffffffffc1420590 RCX: ffffffffb5db0ef9 RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffffffffc1420590 RBP: 00000000ffffffff R08: fffffbfff82840b3 R09: fffffbfff82840b3 R10: 0000000000000001 R11: fffffbfff82840b2 R12: 1ffff1100a0c0f90 R13: ffffffffc1420200 R14: ffff88804f533300 R15: ffff88804f533ca0 FS: 00007f8ea9720740(0000) GS:ffff888053800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f3245abe000 CR3: 000000004c28a006 CR4: 00000000001606f0 Call Trace: stop_sync_thread+0x3a3/0x7c0 [ip_vs] ip_vs_sync_net_cleanup+0x13/0x50 [ip_vs] ops_exit_list.isra.5+0x94/0x140 unregister_pernet_operations+0x29d/0x460 unregister_pernet_device+0x26/0x60 ip_vs_cleanup+0x11/0x38 [ip_vs] __x64_sys_delete_module+0x2d5/0x400 do_syscall_64+0xa5/0x4e0 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7f8ea8bf0db7 Code: 73 01 c3 48 8b 0d b9 80 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 b8 b0 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 89 80 2c 00 f7 d8 64 89 01 48 RSP: 002b:00007ffcd38d2fe8 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000000002436240 RCX: 00007f8ea8bf0db7 RDX: 0000000000000000 RSI: 0000000000000800 RDI: 00000000024362a8 RBP: 0000000000000000 R08: 00007f8ea8eba060 R09: 00007f8ea8c658a0 R10: 00007ffcd38d2a60 R11: 0000000000000206 R12: 0000000000000000 R13: 0000000000000001 R14: 00000000024362a8 R15: 0000000000000000 irq event stamp: 4538 hardirqs last enabled at (4537): [] quarantine_put+0x9e/0x170 hardirqs last disabled at (4538): [] trace_hardirqs_off_thunk+0x1a/0x20 softirqs last enabled at (4522): [] sk_common_release+0x169/0x2d0 softirqs last disabled at (4520): [] sk_common_release+0xbe/0x2d0 Check the return value of ip_vs_use_count_inc() and let its caller return proper error. Inside do_ip_vs_set_ctl() the module is already refcounted, we don't need refcount/derefcount there. Finally, in register_ip_vs_app() and start_sync_thread(), take the module refcount earlier and ensure it's released in the error path. 
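As an illustration of the resulting pattern (a sketch only: example_register() and do_registration() are hypothetical names, not actual ip_vs code):

/* Take the module reference before any other work.
 * ip_vs_use_count_inc() wraps try_module_get(), which can fail once
 * removal of the module has begun, so the result must be checked. */
static int example_register(void)
{
        if (!ip_vs_use_count_inc())
                return -ENOENT;         /* module is going away */

        if (do_registration() < 0) {
                /* drop the reference on every error path */
                ip_vs_use_count_dec();
                return -ENOMEM;
        }
        return 0;
}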
Change since v1: - better return values in case of failure of ip_vs_use_count_inc(), thanks to Julian Anastasov - no need to increase/decrease the module refcount in ip_vs_set_ctl(), thanks to Julian Anastasov Signed-off-by: Davide Caratti Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Sasha Levin --- net/netfilter/ipvs/ip_vs_app.c | 12 ++++++++++-- net/netfilter/ipvs/ip_vs_ctl.c | 14 ++++---------- net/netfilter/ipvs/ip_vs_pe.c | 3 ++- net/netfilter/ipvs/ip_vs_sched.c | 3 ++- net/netfilter/ipvs/ip_vs_sync.c | 13 ++++++++++--- 5 files changed, 28 insertions(+), 17 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 299edc6add5a6a..363475b246f68d 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -198,21 +198,29 @@ struct ip_vs_app *register_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app * mutex_lock(&__ip_vs_app_mutex); + /* increase the module use count */ + if (!ip_vs_use_count_inc()) { + err = -ENOENT; + goto out_unlock; + } + list_for_each_entry(a, &ipvs->app_list, a_list) { if (!strcmp(app->name, a->name)) { err = -EEXIST; + /* decrease the module use count */ + ip_vs_use_count_dec(); goto out_unlock; } } a = kmemdup(app, sizeof(*app), GFP_KERNEL); if (!a) { err = -ENOMEM; + /* decrease the module use count */ + ip_vs_use_count_dec(); goto out_unlock; } INIT_LIST_HEAD(&a->incs_list); list_add(&a->a_list, &ipvs->app_list); - /* increase the module use count */ - ip_vs_use_count_inc(); out_unlock: mutex_unlock(&__ip_vs_app_mutex); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 4648dccebf5950..6155fd6386bf46 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1197,7 +1197,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service *svc = NULL; /* increase the module use count */ - ip_vs_use_count_inc(); + if (!ip_vs_use_count_inc()) + return -ENOPROTOOPT; /* Lookup the scheduler by 'u->sched_name' */ if (strcmp(u->sched_name, "none")) { @@ -2395,9 +2396,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (copy_from_user(arg, user, len) != 0) return -EFAULT; - /* increase the module use count */ - ip_vs_use_count_inc(); - /* Handle daemons since they have another lock */ if (cmd == IP_VS_SO_SET_STARTDAEMON || cmd == IP_VS_SO_SET_STOPDAEMON) { @@ -2410,13 +2408,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) ret = -EINVAL; if (strscpy(cfg.mcast_ifn, dm->mcast_ifn, sizeof(cfg.mcast_ifn)) <= 0) - goto out_dec; + return ret; cfg.syncid = dm->syncid; ret = start_sync_thread(ipvs, &cfg, dm->state); } else { ret = stop_sync_thread(ipvs, dm->state); } - goto out_dec; + return ret; } mutex_lock(&__ip_vs_mutex); @@ -2511,10 +2509,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) out_unlock: mutex_unlock(&__ip_vs_mutex); - out_dec: - /* decrease the module use count */ - ip_vs_use_count_dec(); - return ret; } diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c index 0df17caa8af6cb..714e7e05c1021f 100644 --- a/net/netfilter/ipvs/ip_vs_pe.c +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -67,7 +67,8 @@ int register_ip_vs_pe(struct ip_vs_pe *pe) struct ip_vs_pe *tmp; /* increase the module use count */ - ip_vs_use_count_inc(); + if (!ip_vs_use_count_inc()) + return -ENOENT; mutex_lock(&ip_vs_pe_mutex); /* Make sure that the pe with this name doesn't exist diff 
--git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index a2ff7d746ebf40..3bd0ff36dc41bd 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -184,7 +184,8 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) } /* increase the module use count */ - ip_vs_use_count_inc(); + if (!ip_vs_use_count_inc()) + return -ENOENT; mutex_lock(&ip_vs_sched_mutex); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index b578ebb3d7ef60..b373e053ff9a3c 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1771,6 +1771,10 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n", sizeof(struct ip_vs_sync_conn_v0)); + /* increase the module use count */ + if (!ip_vs_use_count_inc()) + return -ENOPROTOOPT; + /* Do not hold one mutex and then to block on another */ for (;;) { rtnl_lock(); @@ -1901,9 +1905,6 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, mutex_unlock(&ipvs->sync_mutex); rtnl_unlock(); - /* increase the module use count */ - ip_vs_use_count_inc(); - return 0; out: @@ -1933,11 +1934,17 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, } kfree(ti); } + + /* decrease the module use count */ + ip_vs_use_count_dec(); return result; out_early: mutex_unlock(&ipvs->sync_mutex); rtnl_unlock(); + + /* decrease the module use count */ + ip_vs_use_count_dec(); return result; } From 3a0018ef9628b70fdb6d5a30c8d8e17f6ac1f9e2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 23 Oct 2019 09:53:03 -0700 Subject: [PATCH 078/155] ipvs: move old_secure_tcp into struct netns_ipvs [ Upstream commit c24b75e0f9239e78105f81c5f03a751641eb07ef ] syzbot reported the following issue : BUG: KCSAN: data-race in update_defense_level / update_defense_level read to 0xffffffff861a6260 of 4 bytes by task 3006 on cpu 1: update_defense_level+0x621/0xb30 net/netfilter/ipvs/ip_vs_ctl.c:177 defense_work_handler+0x3d/0xd0 net/netfilter/ipvs/ip_vs_ctl.c:225 process_one_work+0x3d4/0x890 kernel/workqueue.c:2269 worker_thread+0xa0/0x800 kernel/workqueue.c:2415 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 write to 0xffffffff861a6260 of 4 bytes by task 7333 on cpu 0: update_defense_level+0xa62/0xb30 net/netfilter/ipvs/ip_vs_ctl.c:205 defense_work_handler+0x3d/0xd0 net/netfilter/ipvs/ip_vs_ctl.c:225 process_one_work+0x3d4/0x890 kernel/workqueue.c:2269 worker_thread+0xa0/0x800 kernel/workqueue.c:2415 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 7333 Comm: kworker/0:5 Not tainted 5.4.0-rc3+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events defense_work_handler Indeed, old_secure_tcp is currently a static variable, while it needs to be a per netns variable. 
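To see why, compare the two forms (a condensed sketch of update_defense_level(), not its full logic):

static void update_defense_level(struct netns_ipvs *ipvs)
{
        /* before: one static object shared by every netns, so the
         * defense_work_handler workers of two namespaces race on it */
        /* static int old_secure_tcp; */

        /* after: per-namespace state, updated under securetcp_lock */
        spin_lock(&ipvs->securetcp_lock);
        ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp;
        spin_unlock(&ipvs->securetcp_lock);
}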
Fixes: a0840e2e165a ("IPVS: netns, ip_vs_ctl local vars moved to ipvs struct.") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Simon Horman Signed-off-by: Sasha Levin --- include/net/ip_vs.h | 1 + net/netfilter/ipvs/ip_vs_ctl.c | 15 +++++++-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 82bc9f0e8a769e..f4e5ac8aa3668e 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -891,6 +891,7 @@ struct netns_ipvs { struct delayed_work defense_work; /* Work handler */ int drop_rate; int drop_counter; + int old_secure_tcp; atomic_t dropentry; /* locks in ctl.c */ spinlock_t dropentry_lock; /* drop entry handling */ diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 6155fd6386bf46..5ec80818ace2c1 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -98,7 +98,6 @@ static bool __ip_vs_addr_is_local_v6(struct net *net, static void update_defense_level(struct netns_ipvs *ipvs) { struct sysinfo i; - static int old_secure_tcp = 0; int availmem; int nomem; int to_change = -1; @@ -179,35 +178,35 @@ static void update_defense_level(struct netns_ipvs *ipvs) spin_lock(&ipvs->securetcp_lock); switch (ipvs->sysctl_secure_tcp) { case 0: - if (old_secure_tcp >= 2) + if (ipvs->old_secure_tcp >= 2) to_change = 0; break; case 1: if (nomem) { - if (old_secure_tcp < 2) + if (ipvs->old_secure_tcp < 2) to_change = 1; ipvs->sysctl_secure_tcp = 2; } else { - if (old_secure_tcp >= 2) + if (ipvs->old_secure_tcp >= 2) to_change = 0; } break; case 2: if (nomem) { - if (old_secure_tcp < 2) + if (ipvs->old_secure_tcp < 2) to_change = 1; } else { - if (old_secure_tcp >= 2) + if (ipvs->old_secure_tcp >= 2) to_change = 0; ipvs->sysctl_secure_tcp = 1; } break; case 3: - if (old_secure_tcp < 2) + if (ipvs->old_secure_tcp < 2) to_change = 1; break; } - old_secure_tcp = ipvs->sysctl_secure_tcp; + ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp; if (to_change >= 0) ip_vs_protocol_timeout_change(ipvs, ipvs->sysctl_secure_tcp > 1); From addfc90b11df2c7fa4a8aa94ff2c113fda76710f Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 21 Oct 2019 18:47:52 +0000 Subject: [PATCH 079/155] bonding: fix unexpected IFF_BONDING bit unset [ Upstream commit 65de65d9033750d2cf1b336c9d6e9da3a8b5cc6e ] The IFF_BONDING flag indicates a bonding master or bonding slave device. ->ndo_add_slave() sets the IFF_BONDING flag and ->ndo_del_slave() unsets it. bond0<--bond1 Both bond0 and bond1 are bonding devices and should keep the IFF_BONDING flag until they are removed. But bond1 would lose IFF_BONDING at ->ndo_del_slave() because that routine does not check whether the slave device is itself a bonding device. This patch adds an interface type check before removing the IFF_BONDING flag.
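Condensed, the check added by the bond_main.c hunks below is:

/* Only clear IFF_BONDING when the departing slave is not itself a
 * bonding master, so a nested bond (bond0 <-- bond1) keeps its flag. */
if (!netif_is_bond_master(slave_dev))
        slave_dev->priv_flags &= ~IFF_BONDING;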
Test commands: ip link add bond0 type bond ip link add bond1 type bond ip link set bond1 master bond0 ip link set bond1 nomaster ip link del bond1 type bond ip link add bond1 type bond Splat looks like: [ 226.665555] proc_dir_entry 'bonding/bond1' already registered [ 226.666440] WARNING: CPU: 0 PID: 737 at fs/proc/generic.c:361 proc_register+0x2a9/0x3e0 [ 226.667571] Modules linked in: bonding af_packet sch_fq_codel ip_tables x_tables unix [ 226.668662] CPU: 0 PID: 737 Comm: ip Not tainted 5.4.0-rc3+ #96 [ 226.669508] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 226.670652] RIP: 0010:proc_register+0x2a9/0x3e0 [ 226.671612] Code: 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 39 01 00 00 48 8b 04 24 48 89 ea 48 c7 c7 a0 0b 14 9f 48 8b b0 e 0 00 00 00 e8 07 e7 88 ff <0f> 0b 48 c7 c7 40 2d a5 9f e8 59 d6 23 01 48 8b 4c 24 10 48 b8 00 [ 226.675007] RSP: 0018:ffff888050e17078 EFLAGS: 00010282 [ 226.675761] RAX: dffffc0000000008 RBX: ffff88805fdd0f10 RCX: ffffffff9dd344e2 [ 226.676757] RDX: 0000000000000001 RSI: 0000000000000008 RDI: ffff88806c9f6b8c [ 226.677751] RBP: ffff8880507160f3 R08: ffffed100d940019 R09: ffffed100d940019 [ 226.678761] R10: 0000000000000001 R11: ffffed100d940018 R12: ffff888050716008 [ 226.679757] R13: ffff8880507160f2 R14: dffffc0000000000 R15: ffffed100a0e2c1e [ 226.680758] FS: 00007fdc217cc0c0(0000) GS:ffff88806c800000(0000) knlGS:0000000000000000 [ 226.681886] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 226.682719] CR2: 00007f49313424d0 CR3: 0000000050e46001 CR4: 00000000000606f0 [ 226.683727] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 226.684725] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 226.685681] Call Trace: [ 226.687089] proc_create_seq_private+0xb3/0xf0 [ 226.687778] bond_create_proc_entry+0x1b3/0x3f0 [bonding] [ 226.691458] bond_netdev_event+0x433/0x970 [bonding] [ 226.692139] ? __module_text_address+0x13/0x140 [ 226.692779] notifier_call_chain+0x90/0x160 [ 226.693401] register_netdevice+0x9b3/0xd80 [ 226.694010] ? alloc_netdev_mqs+0x854/0xc10 [ 226.694629] ? netdev_change_features+0xa0/0xa0 [ 226.695278] ? rtnl_create_link+0x2ed/0xad0 [ 226.695849] bond_newlink+0x2a/0x60 [bonding] [ 226.696422] __rtnl_newlink+0xb9f/0x11b0 [ 226.696968] ? rtnl_link_unregister+0x220/0x220 [ ... ] Fixes: 0b680e753724 ("[PATCH] bonding: Add priv_flag to avoid event mishandling") Signed-off-by: Taehee Yoo Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin --- drivers/net/bonding/bond_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index cf8385a22de59b..5f6602cb191f24 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1779,7 +1779,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) bond_hw_addr_flush(bond_dev, slave_dev); err_close: - slave_dev->priv_flags &= ~IFF_BONDING; + if (!netif_is_bond_master(slave_dev)) + slave_dev->priv_flags &= ~IFF_BONDING; dev_close(slave_dev); err_restore_mac: @@ -1985,7 +1986,8 @@ static int __bond_release_one(struct net_device *bond_dev, else dev_set_mtu(slave_dev, slave->original_mtu); - slave_dev->priv_flags &= ~IFF_BONDING; + if (!netif_is_bond_master(slave_dev)) + slave_dev->priv_flags &= ~IFF_BONDING; bond_free_slave(slave); From 1861904a6092ed411203c6a02c75bfc45b27cc3c Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 21 Oct 2019 18:47:55 +0000 Subject: [PATCH 080/155] macsec: fix refcnt leak in module exit routine [ Upstream commit 2bce1ebed17da54c65042ec2b962e3234bad5b47 ] When a macsec interface is created, it takes a refcnt on the lower device (real device). When the macsec interface is deleted, the refcnt is decreased in macsec_free_netdev(), which is the ->priv_destructor() of the macsec interface. The problem scenario is this: when nested macsec interfaces are exiting, the exit routine of the macsec module leaks refcnts. Test commands: ip link add dummy0 type dummy ip link add macsec0 link dummy0 type macsec ip link add macsec1 link macsec0 type macsec modprobe -rv macsec [ 208.629433] unregister_netdevice: waiting for macsec0 to become free. Usage count = 1 The steps of the macsec module's exit routine are below. 1. Calls ->dellink() in __rtnl_link_unregister(). 2. Checks the refcnt in netdev_run_todo() and waits for it to reach 0 if it is not 0. 3. Calls ->priv_destructor() in netdev_run_todo(). Step 2 checks the refcnt, but step 3 is what decreases it, so step 2 waits forever. This patch makes the macsec module not hold an extra refcnt on the lower device, because it already holds one via netdev_upper_dev_link(). Fixes: c09440f7dcb3 ("macsec: introduce IEEE 802.1AE driver") Signed-off-by: Taehee Yoo Signed-off-by: David S.
Miller Signed-off-by: Sasha Levin --- drivers/net/macsec.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index aa204c98af79cb..9bcb7c3e879f39 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -2993,12 +2993,10 @@ static const struct nla_policy macsec_rtnl_policy[IFLA_MACSEC_MAX + 1] = { static void macsec_free_netdev(struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); - struct net_device *real_dev = macsec->real_dev; free_percpu(macsec->stats); free_percpu(macsec->secy.tx_sc.stats); - dev_put(real_dev); } static void macsec_setup(struct net_device *dev) @@ -3239,8 +3237,6 @@ static int macsec_newlink(struct net *net, struct net_device *dev, if (err < 0) return err; - dev_hold(real_dev); - macsec->nest_level = dev_get_nest_level(real_dev) + 1; netdev_lockdep_set_classes(dev); lockdep_set_class_and_subclass(&dev->addr_list_lock, From 0b7dad3f4eb31461ee4b3197812134dcea83759f Mon Sep 17 00:00:00 2001 From: Nikhil Badola Date: Mon, 21 Oct 2019 18:21:51 +0800 Subject: [PATCH 081/155] usb: fsl: Check memory resource before releasing it [ Upstream commit bc1e3a2dd0c9954fd956ac43ca2876bbea018c01 ] Check memory resource existence before releasing it to avoid NULL pointer dereference Signed-off-by: Nikhil Badola Reviewed-by: Ran Wang Reviewed-by: Peter Chen Signed-off-by: Felipe Balbi Signed-off-by: Sasha Levin --- drivers/usb/gadget/udc/fsl_udc_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/fsl_udc_core.c b/drivers/usb/gadget/udc/fsl_udc_core.c index 7874c112f3fd8d..ee48c7938d6196 100644 --- a/drivers/usb/gadget/udc/fsl_udc_core.c +++ b/drivers/usb/gadget/udc/fsl_udc_core.c @@ -2569,7 +2569,7 @@ static int fsl_udc_remove(struct platform_device *pdev) dma_pool_destroy(udc_controller->td_pool); free_irq(udc_controller->irq, udc_controller); iounmap(dr_regs); - if (pdata->operating_mode == FSL_USB2_DR_DEVICE) + if (res && (pdata->operating_mode == FSL_USB2_DR_DEVICE)) release_mem_region(res->start, resource_size(res)); /* free udc --wait for the release() finished */ From 7c27c19ca85127e9c802cdf2c336e10242f5e89a Mon Sep 17 00:00:00 2001 From: Cristian Birsan Date: Fri, 4 Oct 2019 20:10:54 +0300 Subject: [PATCH 082/155] usb: gadget: udc: atmel: Fix interrupt storm in FIFO mode. [ Upstream commit ba3a1a915c49cc3023e4ddfc88f21e7514e82aa4 ] Fix interrupt storm generated by endpoints when working in FIFO mode. The TX_COMPLETE interrupt is used only by control endpoints processing. Do not enable it for other types of endpoints. 
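Condensed, the change below gates both the enable and the disable of the interrupt on the endpoint type:

/* TX_COMPLETE is only used for control endpoint processing, so only
 * control endpoints should ever enable (or need to disable) it. */
if (ep_is_control(ep))
        usba_ep_writel(ep, CTL_ENB, USBA_TX_COMPLETE);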
Fixes: 914a3f3b3754 ("USB: add atmel_usba_udc driver") Signed-off-by: Cristian Birsan Signed-off-by: Felipe Balbi Signed-off-by: Sasha Levin --- drivers/usb/gadget/udc/atmel_usba_udc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.c b/drivers/usb/gadget/udc/atmel_usba_udc.c index cb66f982c313bf..39676824a2c6ff 100644 --- a/drivers/usb/gadget/udc/atmel_usba_udc.c +++ b/drivers/usb/gadget/udc/atmel_usba_udc.c @@ -488,9 +488,11 @@ static void submit_request(struct usba_ep *ep, struct usba_request *req) next_fifo_transaction(ep, req); if (req->last_transaction) { usba_ep_writel(ep, CTL_DIS, USBA_TX_PK_RDY); - usba_ep_writel(ep, CTL_ENB, USBA_TX_COMPLETE); + if (ep_is_control(ep)) + usba_ep_writel(ep, CTL_ENB, USBA_TX_COMPLETE); } else { - usba_ep_writel(ep, CTL_DIS, USBA_TX_COMPLETE); + if (ep_is_control(ep)) + usba_ep_writel(ep, CTL_DIS, USBA_TX_COMPLETE); usba_ep_writel(ep, CTL_ENB, USBA_TX_PK_RDY); } } From 6acb79f9940001954cba9374c6aba55bd096e09d Mon Sep 17 00:00:00 2001 From: Chandana Kishori Chiluveru Date: Tue, 1 Oct 2019 13:16:48 +0530 Subject: [PATCH 083/155] usb: gadget: composite: Fix possible double free memory bug [ Upstream commit 1c20c89b0421b52b2417bb0f62a611bc669eda1d ] The composite_dev_cleanup() call on the failure path of configfs_composite_bind() frees cdev->os_desc_req and cdev->req. If the previous bind and unbind calls were successful, these will carry stale values. Consider the below sequence of function calls: configfs_composite_bind() composite_dev_prepare() - Allocate cdev->req, cdev->req->buf composite_os_desc_req_prepare() - Allocate cdev->os_desc_req, cdev->os_desc_req->buf configfs_composite_unbind() composite_dev_cleanup() - frees the cdev->os_desc_req->buf and cdev->req->buf Next composition switch configfs_composite_bind() - If it fails, goto err_comp_cleanup will call the composite_dev_cleanup() function composite_dev_cleanup() - calls kfree() with the stale values of cdev->req->buf and cdev->os_desc_req from the previous configfs_composite_bind call. The free call on these stale values leads to a double free. Hence, fix this issue by setting the request and buffer pointers to NULL after kfree().
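This is the classic free-and-NULL idiom, as the composite.c hunk below shows for cdev->req:

/* NULL the pointers after freeing: kfree(NULL) and the NULL checks
 * are no-ops, so a repeated cleanup call can no longer double free. */
kfree(cdev->req->buf);
cdev->req->buf = NULL;
usb_ep_free_request(cdev->gadget->ep0, cdev->req);
cdev->req = NULL;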
Signed-off-by: Chandana Kishori Chiluveru Signed-off-by: Felipe Balbi Signed-off-by: Sasha Levin --- drivers/usb/gadget/composite.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index 75c42393b64ba9..b29cd3979391ec 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -2187,14 +2187,18 @@ void composite_dev_cleanup(struct usb_composite_dev *cdev) usb_ep_dequeue(cdev->gadget->ep0, cdev->os_desc_req); kfree(cdev->os_desc_req->buf); + cdev->os_desc_req->buf = NULL; usb_ep_free_request(cdev->gadget->ep0, cdev->os_desc_req); + cdev->os_desc_req = NULL; } if (cdev->req) { if (cdev->setup_pending) usb_ep_dequeue(cdev->gadget->ep0, cdev->req); kfree(cdev->req->buf); + cdev->req->buf = NULL; usb_ep_free_request(cdev->gadget->ep0, cdev->req); + cdev->req = NULL; } cdev->next_string_id = 0; device_remove_file(&cdev->gadget->dev, &dev_attr_suspended); From e45760e46f7bf11545f3b5347cec3744c5e631f2 Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Mon, 26 Aug 2019 15:10:55 -0400 Subject: [PATCH 084/155] usb: gadget: configfs: fix concurrent issue between composite APIs [ Upstream commit 1a1c851bbd706ea9f3a9756c2d3db28523506d3b ] We meet several NULL pointer issues if configfs_composite_unbind and composite_setup (or composite_disconnect) are running together. These issues occur when doing the function switch stress test: configfs_composite_unbind is called from user mode by echoing "" to the /sys/../UDC entry, and meanwhile, the setup interrupt or disconnect interrupt is raised by hardware. composite_setup will get the cdev from get_gadget_data, but configfs_composite_unbind will set the gadget data to NULL, so the NULL pointer issue occurs. This concurrency is hard to reproduce on a native kernel, but can be reproduced on an Android kernel. In this commit, we introduce a spinlock belonging to struct gadget_info, since we can't use the same spinlock in usb_composite_dev, as composite_setup and configfs_composite_unbind run concurrently. We also add a one-bit flag 'unbind' to indicate that the code is in the unbind routine; this bit is needed because we sometimes release the lock during configfs_composite_unbind, and composite_setup may run at that time.
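Each wrapped callback then follows the same guard (condensed from the configfs.c wrappers added below):

/* Re-read the gadget data under gi->spinlock and bail out if unbind
 * has started, so the callback never dereferences a cdev that
 * configfs_composite_unbind() is tearing down. */
spin_lock_irqsave(&gi->spinlock, flags);
cdev = get_gadget_data(gadget);
if (!cdev || gi->unbind) {
        spin_unlock_irqrestore(&gi->spinlock, flags);
        return;
}
composite_disconnect(gadget);
spin_unlock_irqrestore(&gi->spinlock, flags);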
Several oops: oops 1: android_work: sent uevent USB_STATE=CONNECTED configfs-gadget gadget: super-speed config #1: b android_work: sent uevent USB_STATE=CONFIGURED init: Received control message 'start' for 'adbd' from pid: 3515 (system_server) Unable to handle kernel NULL pointer dereference at virtual address 0000002a init: Received control message 'stop' for 'adbd' from pid: 3375 (/vendor/bin/hw/android.hardware.usb@1.1-servic) Mem abort info: Exception class = DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 Data abort info: ISV = 0, ISS = 0x00000004 CM = 0, WnR = 0 user pgtable: 4k pages, 48-bit VAs, pgd = ffff8008f1b7f000 [000000000000002a] *pgd=0000000000000000 Internal error: Oops: 96000004 [#1] PREEMPT SMP Modules linked in: CPU: 4 PID: 2457 Comm: irq/125-5b11000 Not tainted 4.14.98-07846-g0b40a9b-dirty #16 Hardware name: Freescale i.MX8QM MEK (DT) task: ffff8008f2a98000 task.stack: ffff00000b7b8000 PC is at composite_setup+0x44/0x1508 LR is at android_setup+0xb8/0x13c pc : [] lr : [] pstate: 800001c5 sp : ffff00000b7bbb80 x29: ffff00000b7bbb80 x28: ffff8008f2a3c010 x27: 0000000000000001 x26: 0000000000000000 [1232/1897] audit: audit_lost=25791 audit_rate_limit=5 audit_backlog_limit=64 x25: 00000000ffffffa1 x24: ffff8008f2a3c010 audit: rate limit exceeded x23: 0000000000000409 x22: ffff000009c8e000 x21: ffff8008f7a8b428 x20: ffff00000afae000 x19: ffff0000089ff000 x18: 0000000000000000 x17: 0000000000000000 x16: ffff0000082b7c9c x15: 0000000000000000 x14: f1866f5b952aca46 x13: e35502e30d44349c x12: 0000000000000008 x11: 0000000000000008 x10: 0000000000000a30 x9 : ffff00000b7bbd00 x8 : ffff8008f2a98a90 x7 : ffff8008f27a9c90 x6 : 0000000000000001 x5 : 0000000000000000 x4 : 0000000000000001 x3 : 0000000000000000 x2 : 0000000000000006 x1 : ffff0000089ff8d0 x0 : 732a010310b9ed00 X7: 0xffff8008f27a9c10: 9c10 00000002 00000000 00000001 00000000 13110000 ffff0000 00000002 00208040 9c30 00000000 00000000 00000000 00000000 00000000 00000005 00000029 00000000 9c50 00051778 00000001 f27a8e00 ffff8008 00000005 00000000 00000078 00000078 9c70 00000078 00000000 09031d48 ffff0000 00100000 00000000 00400000 00000000 9c90 00000001 00000000 00000000 00000000 00000000 00000000 ffefb1a0 ffff8008 9cb0 f27a9ca8 ffff8008 00000000 00000000 b9d88037 00000173 1618a3eb 00000001 9cd0 870a792a 0000002e 16188fe6 00000001 0000242b 00000000 00000000 00000000 using random self ethernet address 9cf0 019a4646 00000000 000547f3 00000000 ecfd6c33 00000002 00000000 using random host ethernet address 00000000 X8: 0xffff8008f2a98a10: 8a10 00000000 00000000 f7788d00 ffff8008 00000001 00000000 00000000 00000000 8a30 eb218000 ffff8008 f2a98000 ffff8008 f2a98000 ffff8008 09885000 ffff0000 8a50 f34df480 ffff8008 00000000 00000000 f2a98648 ffff8008 09c8e000 ffff0000 8a70 fff2c800 ffff8008 09031d48 ffff0000 0b7bbd00 ffff0000 0b7bbd00 ffff0000 8a90 080861bc ffff0000 00000000 00000000 00000000 00000000 00000000 00000000 8ab0 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 8ad0 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 8af0 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 X21: 0xffff8008f7a8b3a8: b3a8 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 b3c8 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 b3e8 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 b408 00000000 00000000 00000000 00000000 00000000 00000000 00000001 00000000 b428 00000000 00000000 00000000 00000000 
00000000 00000000 00000000 00000000 b448 0053004d 00540046 00300031 00010030 eb07b520 ffff8008 20011201 00000003 b468 e418d109 0104404e 00010302 00000000 eb07b558 ffff8008 eb07b558 ffff8008 b488 f7a8b488 ffff8008 f7a8b488 ffff8008 f7a8b300 ffff8008 00000000 00000000 X24: 0xffff8008f2a3bf90: bf90 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 bfb0 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 bfd0 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 bff0 00000000 00000000 00000000 00000000 f76c8010 ffff8008 f76c8010 ffff8008 c010 00000000 00000000 f2a3c018 ffff8008 f2a3c018 ffff8008 08a067dc ffff0000 c030 f2a5a000 ffff8008 091c3650 ffff0000 f716fd18 ffff8008 f716fe30 ffff8008 c050 f2ce4a30 ffff8008 00000000 00000005 00000000 00000000 095d1568 ffff0000 c070 f76c8010 ffff8008 f2ce4b00 ffff8008 095cac68 ffff0000 f2a5a028 ffff8008 X28: 0xffff8008f2a3bf90: bf90 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 bfb0 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 bfd0 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 bff0 00000000 00000000 00000000 00000000 f76c8010 ffff8008 f76c8010 ffff8008 c010 00000000 00000000 f2a3c018 ffff8008 f2a3c018 ffff8008 08a067dc ffff0000 c030 f2a5a000 ffff8008 091c3650 ffff0000 f716fd18 ffff8008 f716fe30 ffff8008 c050 f2ce4a30 ffff8008 00000000 00000005 00000000 00000000 095d1568 ffff0000 c070 f76c8010 ffff8008 f2ce4b00 ffff8008 095cac68 ffff0000 f2a5a028 ffff8008 Process irq/125-5b11000 (pid: 2457, stack limit = 0xffff00000b7b8000) Call trace: Exception stack(0xffff00000b7bba40 to 0xffff00000b7bbb80) ba40: 732a010310b9ed00 ffff0000089ff8d0 0000000000000006 0000000000000000 ba60: 0000000000000001 0000000000000000 0000000000000001 ffff8008f27a9c90 ba80: ffff8008f2a98a90 ffff00000b7bbd00 0000000000000a30 0000000000000008 baa0: 0000000000000008 e35502e30d44349c f1866f5b952aca46 0000000000000000 bac0: ffff0000082b7c9c 0000000000000000 0000000000000000 ffff0000089ff000 bae0: ffff00000afae000 ffff8008f7a8b428 ffff000009c8e000 0000000000000409 bb00: ffff8008f2a3c010 00000000ffffffa1 0000000000000000 0000000000000001 bb20: ffff8008f2a3c010 ffff00000b7bbb80 ffff000008a032fc ffff00000b7bbb80 bb40: ffff0000089ffb3c 00000000800001c5 ffff00000b7bbb80 732a010310b9ed00 bb60: ffffffffffffffff ffff0000080f777c ffff00000b7bbb80 ffff0000089ffb3c [] composite_setup+0x44/0x1508 [] android_setup+0xb8/0x13c [] cdns3_ep0_delegate_req+0x44/0x70 [] cdns3_check_ep0_interrupt_proceed+0x33c/0x654 [] cdns3_device_thread_irq_handler+0x4b0/0x4bc [] cdns3_thread_irq+0x48/0x68 [] irq_thread_fn+0x28/0x88 [] irq_thread+0x13c/0x228 [] kthread+0x104/0x130 [] ret_from_fork+0x10/0x18 oops2: composite_disconnect: Calling disconnect on a Gadget that is not connected android_work: did not send uevent (0 0 (null)) init: Received control message 'stop' for 'adbd' from pid: 3359 (/vendor/bin/hw/android.hardware.usb@1.1-service.imx) init: Sending signal 9 to service 'adbd' (pid 22343) process group... 
------------[ cut here ]------------ audit: audit_lost=180038 audit_rate_limit=5 audit_backlog_limit=64 audit: rate limit exceeded WARNING: CPU: 0 PID: 3468 at kernel_imx/drivers/usb/gadget/composite.c:2009 composite_disconnect+0x80/0x88 Modules linked in: CPU: 0 PID: 3468 Comm: HWC-UEvent-Thre Not tainted 4.14.98-07846-g0b40a9b-dirty #16 Hardware name: Freescale i.MX8QM MEK (DT) task: ffff8008f2349c00 task.stack: ffff00000b0a8000 PC is at composite_disconnect+0x80/0x88 LR is at composite_disconnect+0x80/0x88 pc : [] lr : [] pstate: 600001c5 sp : ffff000008003dd0 x29: ffff000008003dd0 x28: ffff8008f2349c00 x27: ffff000009885018 x26: ffff000008004000 Timeout for IPC response! x25: ffff000009885018 x24: ffff000009c8e280 x23: ffff8008f2d98010 x22: 00000000000001c0 x21: ffff8008f2d98394 x20: ffff8008f2d98010 x19: 0000000000000000 x18: 0000e3956f4f075a fxos8700 4-001e: i2c block read acc failed x17: 0000e395735727e8 x16: ffff00000829f4d4 x15: ffffffffffffffff x14: 7463656e6e6f6320 x13: 746f6e2009090920 x12: 7369207461687420 x11: 7465676461472061 x10: 206e6f207463656e x9 : 6e6f637369642067 x8 : ffff000009c8e280 x7 : ffff0000086ca6cc x6 : ffff000009f15e78 x5 : 0000000000000000 x4 : 0000000000000000 x3 : ffffffffffffffff x2 : c3f28b86000c3900 x1 : c3f28b86000c3900 x0 : 000000000000004e X20: 0xffff8008f2d97f90: 7f90 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 7fb0 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 libprocessgroup: Failed to kill process cgroup uid 0 pid 22343 in 215ms, 1 processes remain 7fd0 Timeout for IPC response! 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 using random self ethernet address 7ff0 00000000 00000000 00000000 00000000 f76c8010 ffff8008 f76c8010 ffff8008 8010 00000100 00000000 f2d98018 ffff8008 f2d98018 ffff8008 08a067dc using random host ethernet address ffff0000 8030 f206d800 ffff8008 091c3650 ffff0000 f7957b18 ffff8008 f7957730 ffff8008 8050 f716a630 ffff8008 00000000 00000005 00000000 00000000 095d1568 ffff0000 8070 f76c8010 ffff8008 f716a800 ffff8008 095cac68 ffff0000 f206d828 ffff8008 X21: 0xffff8008f2d98314: 8314 ffff8008 00000000 00000000 00000000 00000000 00000000 00000000 00000000 8334 00000000 00000000 00000000 00000000 00000000 08a04cf4 ffff0000 00000000 8354 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 8374 00000000 00000000 00000000 00001001 00000000 00000000 00000000 00000000 8394 e4bbe4bb 0f230000 ffff0000 0afae000 ffff0000 ae001000 00000000 f206d400 Timeout for IPC response! 
83b4 ffff8008 00000000 00000000 f7957b18 ffff8008 f7957718 ffff8008 f7957018 83d4 ffff8008 f7957118 ffff8008 f7957618 ffff8008 f7957818 ffff8008 f7957918 83f4 ffff8008 f7957d18 ffff8008 00000000 00000000 00000000 00000000 00000000 X23: 0xffff8008f2d97f90: 7f90 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 7fb0 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 7fd0 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 7ff0 00000000 00000000 00000000 00000000 f76c8010 ffff8008 f76c8010 ffff8008 8010 00000100 00000000 f2d98018 ffff8008 f2d98018 ffff8008 08a067dc ffff0000 8030 f206d800 ffff8008 091c3650 ffff0000 f7957b18 ffff8008 f7957730 ffff8008 8050 f716a630 ffff8008 00000000 00000005 00000000 00000000 095d1568 ffff0000 8070 f76c8010 ffff8008 f716a800 ffff8008 095cac68 ffff0000 f206d828 ffff8008 X28: 0xffff8008f2349b80: 9b80 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 9ba0 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 9bc0 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 9be0 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 9c00 00000022 00000000 ffffffff ffffffff 00010001 00000000 00000000 00000000 9c20 0b0a8000 ffff0000 00000002 00404040 00000000 00000000 00000000 00000000 9c40 00000001 00000000 00000001 00000000 001ebd44 00000001 f390b800 ffff8008 9c60 00000000 00000001 00000070 00000070 00000070 00000000 09031d48 ffff0000 Call trace: Exception stack(0xffff000008003c90 to 0xffff000008003dd0) 3c80: 000000000000004e c3f28b86000c3900 3ca0: c3f28b86000c3900 ffffffffffffffff 0000000000000000 0000000000000000 3cc0: ffff000009f15e78 ffff0000086ca6cc ffff000009c8e280 6e6f637369642067 3ce0: 206e6f207463656e 7465676461472061 7369207461687420 746f6e2009090920 3d00: 7463656e6e6f6320 ffffffffffffffff ffff00000829f4d4 0000e395735727e8 3d20: 0000e3956f4f075a 0000000000000000 ffff8008f2d98010 ffff8008f2d98394 3d40: 00000000000001c0 ffff8008f2d98010 ffff000009c8e280 ffff000009885018 3d60: ffff000008004000 ffff000009885018 ffff8008f2349c00 ffff000008003dd0 3d80: ffff0000089ff9b0 ffff000008003dd0 ffff0000089ff9b0 00000000600001c5 3da0: ffff8008f33f2cd8 0000000000000000 0000ffffffffffff 0000000000000000 init: Received control message 'start' for 'adbd' from pid: 3359 (/vendor/bin/hw/android.hardware.usb@1.1-service.imx) 3dc0: ffff000008003dd0 ffff0000089ff9b0 [] composite_disconnect+0x80/0x88 [] android_disconnect+0x3c/0x68 [] cdns3_device_irq_handler+0xfc/0x2c8 [] cdns3_irq+0x44/0x94 [] __handle_irq_event_percpu+0x60/0x24c [] handle_irq_event+0x58/0xc0 [] handle_fasteoi_irq+0x98/0x180 [] generic_handle_irq+0x24/0x38 [] __handle_domain_irq+0x60/0xac [] gic_handle_irq+0xd4/0x17c Signed-off-by: Peter Chen Signed-off-by: Felipe Balbi Signed-off-by: Sasha Levin --- drivers/usb/gadget/configfs.c | 110 ++++++++++++++++++++++++++++++++-- 1 file changed, 105 insertions(+), 5 deletions(-) diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index aeb9f3c4052157..d0143d02e2f7e6 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -60,6 +60,8 @@ struct gadget_info { bool use_os_desc; char b_vendor_code; char qw_sign[OS_STRING_QW_SIGN_LEN]; + spinlock_t spinlock; + bool unbind; }; static inline struct gadget_info *to_gadget_info(struct config_item *item) @@ -1243,6 +1245,7 @@ static int configfs_composite_bind(struct usb_gadget *gadget, int ret; /* the gi->lock is hold by the caller */ + gi->unbind = 0; 
cdev->gadget = gadget; set_gadget_data(gadget, cdev); ret = composite_dev_prepare(composite, cdev); @@ -1375,31 +1378,128 @@ static void configfs_composite_unbind(struct usb_gadget *gadget) { struct usb_composite_dev *cdev; struct gadget_info *gi; + unsigned long flags; /* the gi->lock is hold by the caller */ cdev = get_gadget_data(gadget); gi = container_of(cdev, struct gadget_info, cdev); + spin_lock_irqsave(&gi->spinlock, flags); + gi->unbind = 1; + spin_unlock_irqrestore(&gi->spinlock, flags); kfree(otg_desc[0]); otg_desc[0] = NULL; purge_configs_funcs(gi); composite_dev_cleanup(cdev); usb_ep_autoconfig_reset(cdev->gadget); + spin_lock_irqsave(&gi->spinlock, flags); cdev->gadget = NULL; set_gadget_data(gadget, NULL); + spin_unlock_irqrestore(&gi->spinlock, flags); +} + +static int configfs_composite_setup(struct usb_gadget *gadget, + const struct usb_ctrlrequest *ctrl) +{ + struct usb_composite_dev *cdev; + struct gadget_info *gi; + unsigned long flags; + int ret; + + cdev = get_gadget_data(gadget); + if (!cdev) + return 0; + + gi = container_of(cdev, struct gadget_info, cdev); + spin_lock_irqsave(&gi->spinlock, flags); + cdev = get_gadget_data(gadget); + if (!cdev || gi->unbind) { + spin_unlock_irqrestore(&gi->spinlock, flags); + return 0; + } + + ret = composite_setup(gadget, ctrl); + spin_unlock_irqrestore(&gi->spinlock, flags); + return ret; +} + +static void configfs_composite_disconnect(struct usb_gadget *gadget) +{ + struct usb_composite_dev *cdev; + struct gadget_info *gi; + unsigned long flags; + + cdev = get_gadget_data(gadget); + if (!cdev) + return; + + gi = container_of(cdev, struct gadget_info, cdev); + spin_lock_irqsave(&gi->spinlock, flags); + cdev = get_gadget_data(gadget); + if (!cdev || gi->unbind) { + spin_unlock_irqrestore(&gi->spinlock, flags); + return; + } + + composite_disconnect(gadget); + spin_unlock_irqrestore(&gi->spinlock, flags); +} + +static void configfs_composite_suspend(struct usb_gadget *gadget) +{ + struct usb_composite_dev *cdev; + struct gadget_info *gi; + unsigned long flags; + + cdev = get_gadget_data(gadget); + if (!cdev) + return; + + gi = container_of(cdev, struct gadget_info, cdev); + spin_lock_irqsave(&gi->spinlock, flags); + cdev = get_gadget_data(gadget); + if (!cdev || gi->unbind) { + spin_unlock_irqrestore(&gi->spinlock, flags); + return; + } + + composite_suspend(gadget); + spin_unlock_irqrestore(&gi->spinlock, flags); +} + +static void configfs_composite_resume(struct usb_gadget *gadget) +{ + struct usb_composite_dev *cdev; + struct gadget_info *gi; + unsigned long flags; + + cdev = get_gadget_data(gadget); + if (!cdev) + return; + + gi = container_of(cdev, struct gadget_info, cdev); + spin_lock_irqsave(&gi->spinlock, flags); + cdev = get_gadget_data(gadget); + if (!cdev || gi->unbind) { + spin_unlock_irqrestore(&gi->spinlock, flags); + return; + } + + composite_resume(gadget); + spin_unlock_irqrestore(&gi->spinlock, flags); } static const struct usb_gadget_driver configfs_driver_template = { .bind = configfs_composite_bind, .unbind = configfs_composite_unbind, - .setup = composite_setup, - .reset = composite_disconnect, - .disconnect = composite_disconnect, + .setup = configfs_composite_setup, + .reset = configfs_composite_disconnect, + .disconnect = configfs_composite_disconnect, - .suspend = composite_suspend, - .resume = composite_resume, + .suspend = configfs_composite_suspend, + .resume = configfs_composite_resume, .max_speed = USB_SPEED_SUPER, .driver = { From 5e7777e53d9043ff88109152958b22aafb6a2a8b Mon Sep 17 00:00:00 2001 
From: Yinbo Zhu Date: Mon, 29 Jul 2019 14:46:07 +0800 Subject: [PATCH 085/155] usb: dwc3: remove the call trace of USBx_GFLADJ [ Upstream commit a7d9874c6f3fbc8d25cd9ceba35b6822612c4ebf ] Layerscape boards sometimes reported a USB call trace. This happens because the kernel sends LPM tokens automatically when it has no pending transfers and thinks the link is idle enough to enter L1; that procedure requires the USB registers to be recovered, after which the kernel compares USBx_GFLADJ and sets GFLADJ_30MHZ, GFLADJ_30MHZ_REG until GFLADJ_30MHZ equals 0x20. When those conditions are met the issue occurs, but regardless of whether they are met, USB always needs GFLADJ_30MHZ kept at 0x20 (the xHCI spec uses GFLADJ_30MHZ to adjust any offset from the clock source that generates the clock driving the SOF counter; 0x20 is its default value). That is normal logic, so remove the call trace. Signed-off-by: Yinbo Zhu Signed-off-by: Felipe Balbi Signed-off-by: Sasha Levin --- drivers/usb/dwc3/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index 9b093978bd24d0..48755c501201d6 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -282,8 +282,7 @@ static void dwc3_frame_length_adjustment(struct dwc3 *dwc) reg = dwc3_readl(dwc->regs, DWC3_GFLADJ); dft = reg & DWC3_GFLADJ_30MHZ_MASK; - if (!dev_WARN_ONCE(dwc->dev, dft == dwc->fladj, - "request value same as default, ignoring\n")) { + if (dft != dwc->fladj) { reg &= ~DWC3_GFLADJ_30MHZ_MASK; reg |= DWC3_GFLADJ_30MHZ_SDBND_SEL | dwc->fladj; dwc3_writel(dwc->regs, DWC3_GFLADJ, reg); From 938449fdd37743e72eb386272360543468a2a101 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Wed, 23 Oct 2019 10:09:54 -0500 Subject: [PATCH 086/155] perf/x86/amd/ibs: Fix reading of the IBS OpData register and thus precise RIP validity [ Upstream commit 317b96bb14303c7998dbcd5bc606bd8038fdd4b4 ] The loop that reads all the IBS MSRs into *buf stopped one MSR short of reading the IbsOpData register, which contains the RipInvalid status bit. Fix the offset_max assignment so the MSR gets read, so the RIP invalid evaluation is based on what the IBS h/w output, instead of what was left in memory. Signed-off-by: Kim Phillips Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: H.
Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: d47e8238cd76 ("perf/x86-ibs: Take instruction pointer from ibs sample") Link: https://lkml.kernel.org/r/20191023150955.30292-1-kim.phillips@amd.com Signed-off-by: Ingo Molnar Signed-off-by: Sasha Levin --- arch/x86/events/amd/ibs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 7a86fbc07ddc11..4deecdb26ab307 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -625,7 +625,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) if (event->attr.sample_type & PERF_SAMPLE_RAW) offset_max = perf_ibs->offset_max; else if (check_rip) - offset_max = 2; + offset_max = 3; else offset_max = 1; do { From 09bbc0a0c399aace431e2701979730d33eb2584c Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Wed, 23 Oct 2019 10:09:55 -0500 Subject: [PATCH 087/155] perf/x86/amd/ibs: Handle erratum #420 only on the affected CPU family (10h) [ Upstream commit e431e79b60603079d269e0c2a5177943b95fa4b6 ] This saves us writing the IBS control MSR twice when disabling the event. I searched revision guides for all families since 10h, and did not find occurrence of erratum #420, nor anything remotely similar: so we isolate the secondary MSR write to family 10h only. Also unconditionally update the count mask for IBS Op implementations that have read & writeable current count (CurCnt) fields in addition to the MaxCnt field. These bits were reserved on prior implementations, and therefore shouldn't have negative impact. Signed-off-by: Kim Phillips Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: c9574fe0bdb9 ("perf/x86-ibs: Implement workaround for IBS erratum #420") Link: https://lkml.kernel.org/r/20191023150955.30292-2-kim.phillips@amd.com Signed-off-by: Ingo Molnar Signed-off-by: Sasha Levin --- arch/x86/events/amd/ibs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 4deecdb26ab307..f24e9adaa31671 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -389,7 +389,8 @@ static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, struct hw_perf_event *hwc, u64 config) { config &= ~perf_ibs->cnt_mask; - wrmsrl(hwc->config_base, config); + if (boot_cpu_data.x86 == 0x10) + wrmsrl(hwc->config_base, config); config &= ~perf_ibs->enable_mask; wrmsrl(hwc->config_base, config); } @@ -564,7 +565,8 @@ static struct perf_ibs perf_ibs_op = { }, .msr = MSR_AMD64_IBSOPCTL, .config_mask = IBS_OP_CONFIG_MASK, - .cnt_mask = IBS_OP_MAX_CNT, + .cnt_mask = IBS_OP_MAX_CNT | IBS_OP_CUR_CNT | + IBS_OP_CUR_CNT_RAND, .enable_mask = IBS_OP_ENABLE, .valid_mask = IBS_OP_VAL, .max_period = IBS_OP_MAX_CNT << 4, From faa06698c0e94279bbf03ae95cf4f8063c9235f6 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 28 Oct 2019 10:52:35 -0400 Subject: [PATCH 088/155] USB: Skip endpoints with 0 maxpacket length [ Upstream commit d482c7bb0541d19dea8bff437a9f3c5563b5b2d2 ] Endpoints with a maxpacket length of 0 are probably useless. 
They can't transfer any data, and it's not at all unlikely that an HCD will crash or hang when trying to handle an URB for such an endpoint. Currently the USB core does not check for endpoints having a maxpacket value of 0. This patch adds a check, printing a warning and skipping over any endpoints it catches. Now, the USB spec does not rule out endpoints having maxpacket = 0. But since they wouldn't have any practical use, there doesn't seem to be any good reason for us to accept them. Signed-off-by: Alan Stern Link: https://lore.kernel.org/r/Pine.LNX.4.44L0.1910281050420.1485-100000@iolanthe.rowland.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/core/config.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index d03d0e46b121d6..cfb8f1126cf828 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -348,6 +348,11 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum, /* Validate the wMaxPacketSize field */ maxp = usb_endpoint_maxp(&endpoint->desc); + if (maxp == 0) { + dev_warn(ddev, "config %d interface %d altsetting %d endpoint 0x%X has wMaxPacketSize 0, skipping\n", + cfgno, inum, asnum, d->bEndpointAddress); + goto skip_to_next_endpoint_or_interface_descriptor; + } /* Find the highest legal maxpacket size for this endpoint */ i = 0; /* additional transactions per microframe */ From 53746bf4e6220c4b40cf0f4c32a3d6a57dc43e1b Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 22 Oct 2019 16:32:03 +0200 Subject: [PATCH 089/155] USB: ldusb: use unsigned size format specifiers [ Upstream commit 88f6bf3846ee90bf33aa1ce848cd3bfb3229f4a4 ] A recent info-leak bug manifested itself along with warning about a negative buffer overflow: ldusb 1-1:0.28: Read buffer overflow, -131383859965943 bytes dropped when it was really a rather large positive one. A sanity check that prevents this has now been put in place, but let's fix up the size format specifiers, which should all be unsigned. 
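The underlying pitfall is easy to demonstrate in isolation (a userspace sketch, not driver code): an unsigned size_t printed with %zd is interpreted as a signed ssize_t, so a huge value produced by unsigned underflow shows up as negative:

#include <stdio.h>
#include <sys/types.h>

int main(void)
{
        size_t avail = 3, count = 5;
        size_t dropped = avail - count; /* unsigned underflow: huge value */

        /* what the old %zd formatting showed: a negative number */
        printf("%zd bytes dropped\n", (ssize_t)dropped);
        /* what %zu correctly shows: a very large unsigned number */
        printf("%zu bytes dropped\n", dropped);
        return 0;
}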
Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20191022143203.5260-3-johan@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/misc/ldusb.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/usb/misc/ldusb.c b/drivers/usb/misc/ldusb.c index 6387545b17ea09..e16af177d46701 100644 --- a/drivers/usb/misc/ldusb.c +++ b/drivers/usb/misc/ldusb.c @@ -490,7 +490,7 @@ static ssize_t ld_usb_read(struct file *file, char __user *buffer, size_t count, } bytes_to_read = min(count, *actual_buffer); if (bytes_to_read < *actual_buffer) - dev_warn(&dev->intf->dev, "Read buffer overflow, %zd bytes dropped\n", + dev_warn(&dev->intf->dev, "Read buffer overflow, %zu bytes dropped\n", *actual_buffer-bytes_to_read); /* copy one interrupt_in_buffer from ring_buffer into userspace */ @@ -565,8 +565,9 @@ static ssize_t ld_usb_write(struct file *file, const char __user *buffer, /* write the data into interrupt_out_buffer from userspace */ bytes_to_write = min(count, write_buffer_size*dev->interrupt_out_endpoint_size); if (bytes_to_write < count) - dev_warn(&dev->intf->dev, "Write buffer overflow, %zd bytes dropped\n", count-bytes_to_write); - dev_dbg(&dev->intf->dev, "%s: count = %zd, bytes_to_write = %zd\n", + dev_warn(&dev->intf->dev, "Write buffer overflow, %zu bytes dropped\n", + count - bytes_to_write); + dev_dbg(&dev->intf->dev, "%s: count = %zu, bytes_to_write = %zu\n", __func__, count, bytes_to_write); if (copy_from_user(dev->interrupt_out_buffer, buffer, bytes_to_write)) { From 32eb2a0a19e51c2cb0f44c18c8a6c14bfde3a29a Mon Sep 17 00:00:00 2001 From: Potnuri Bharat Teja Date: Fri, 25 Oct 2019 18:04:40 +0530 Subject: [PATCH 090/155] RDMA/iw_cxgb4: Avoid freeing skb twice in arp failure case [ Upstream commit d4934f45693651ea15357dd6c7c36be28b6da884 ] _put_ep_safe() and _put_pass_ep_safe() free the skb before it is freed by process_work(). fix double free by freeing the skb only in process_work(). 
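In ownership terms, the skb is queued for process_work(), which frees it after the handler runs, so the handler must not free it as well. The resulting handler (from the hunk below) is simply:

static int _put_ep_safe(struct c4iw_dev *dev, struct sk_buff *skb)
{
        struct c4iw_ep *ep;

        ep = *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *)));
        release_ep_resources(ep);
        /* no kfree_skb(skb) here: process_work() owns and frees it */
        return 0;
}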
Fixes: 1dad0ebeea1c ("iw_cxgb4: Avoid touch after free error in ARP failure handlers") Link: https://lore.kernel.org/r/1572006880-5800-1-git-send-email-bharat@chelsio.com Signed-off-by: Dakshaja Uppalapati Signed-off-by: Potnuri Bharat Teja Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Sasha Levin --- drivers/infiniband/hw/cxgb4/cm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index d87f08cd78ad43..bb36cdf82a8d6c 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -491,7 +491,6 @@ static int _put_ep_safe(struct c4iw_dev *dev, struct sk_buff *skb) ep = *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *))); release_ep_resources(ep); - kfree_skb(skb); return 0; } @@ -502,7 +501,6 @@ static int _put_pass_ep_safe(struct c4iw_dev *dev, struct sk_buff *skb) ep = *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *))); c4iw_put_ep(&ep->parent_ep->com); release_ep_resources(ep); - kfree_skb(skb); return 0; } From 3c7e78186d68351ef5962a593ec40945a11da8bf Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 24 Oct 2019 16:38:04 +1000 Subject: [PATCH 091/155] scsi: qla2xxx: stop timer in shutdown path [ Upstream commit d3566abb1a1e7772116e4d50fb6a58d19c9802e5 ] In shutdown/reboot paths, the timer is not stopped: qla2x00_shutdown pci_device_shutdown device_shutdown kernel_restart_prepare kernel_restart sys_reboot This causes lockups (on powerpc) when firmware config space access calls are interrupted by smp_send_stop later in reboot. Fixes: e30d1756480dc ("[SCSI] qla2xxx: Addition of shutdown callback handler.") Link: https://lore.kernel.org/r/20191024063804.14538-1-npiggin@gmail.com Signed-off-by: Nicholas Piggin Acked-by: Himanshu Madhani Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/qla2xxx/qla_os.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 7d7fb5bbb60079..343fbaa6d2a2d3 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -3437,6 +3437,10 @@ qla2x00_shutdown(struct pci_dev *pdev) /* Stop currently executing firmware. */ qla2x00_try_to_stop_firmware(vha); + /* Disable timer */ + if (vha->timer_active) + qla2x00_stop_timer(vha); + /* Turn adapter off line */ vha->flags.online = 0; From 81370ee55d9250e0744e09493186cb0e071dbcb2 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 25 Oct 2019 12:06:02 +0100 Subject: [PATCH 092/155] fjes: Handle workqueue allocation failure [ Upstream commit 85ac30fa2e24f628e9f4f9344460f4015d33fd7d ] In the highly unlikely event that we fail to allocate either of the "/txrx" or "/control" workqueues, we should bail cleanly rather than blindly march on with NULL queue pointer(s) installed in the 'fjes_adapter' instance. Cc: "David S. Miller" Reported-by: Nicolas Waisman Link: https://lore.kernel.org/lkml/CADJ_3a8WFrs5NouXNqS5WYe7rebFP+_A5CheeqAyD_p7DFJJcg@mail.gmail.com/ Signed-off-by: Will Deacon Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin --- drivers/net/fjes/fjes_main.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/net/fjes/fjes_main.c b/drivers/net/fjes/fjes_main.c index 750954be5a7403..14d6579b292a86 100644 --- a/drivers/net/fjes/fjes_main.c +++ b/drivers/net/fjes/fjes_main.c @@ -1252,8 +1252,17 @@ static int fjes_probe(struct platform_device *plat_dev) adapter->open_guard = false; adapter->txrx_wq = alloc_workqueue(DRV_NAME "/txrx", WQ_MEM_RECLAIM, 0); + if (unlikely(!adapter->txrx_wq)) { + err = -ENOMEM; + goto err_free_netdev; + } + adapter->control_wq = alloc_workqueue(DRV_NAME "/control", WQ_MEM_RECLAIM, 0); + if (unlikely(!adapter->control_wq)) { + err = -ENOMEM; + goto err_free_txrx_wq; + } INIT_WORK(&adapter->tx_stall_task, fjes_tx_stall_task); INIT_WORK(&adapter->raise_intr_rxdata_task, @@ -1270,7 +1279,7 @@ static int fjes_probe(struct platform_device *plat_dev) hw->hw_res.irq = platform_get_irq(plat_dev, 0); err = fjes_hw_init(&adapter->hw); if (err) - goto err_free_netdev; + goto err_free_control_wq; /* setup MAC address (02:00:00:00:00:[epid])*/ netdev->dev_addr[0] = 2; @@ -1292,6 +1301,10 @@ static int fjes_probe(struct platform_device *plat_dev) err_hw_exit: fjes_hw_exit(&adapter->hw); +err_free_control_wq: + destroy_workqueue(adapter->control_wq); +err_free_txrx_wq: + destroy_workqueue(adapter->txrx_wq); err_free_netdev: free_netdev(netdev); err_out: From 15c1c15be2a8fd0ef005972adc9c2a7782a2f558 Mon Sep 17 00:00:00 2001 From: Jiangfeng Xiao Date: Fri, 25 Oct 2019 21:48:22 +0800 Subject: [PATCH 093/155] net: hisilicon: Fix "Trying to free already-free IRQ" [ Upstream commit 63a41746827cb16dc6ad0d4d761ab4e7dda7a0c3 ] When rmmod hip04_eth.ko, we can get the following warning: Task track: rmmod(1623)>bash(1591)>login(1581)>init(1) ------------[ cut here ]------------ WARNING: CPU: 0 PID: 1623 at kernel/irq/manage.c:1557 __free_irq+0xa4/0x2ac() Trying to free already-free IRQ 200 Modules linked in: ping(O) pramdisk(O) cpuinfo(O) rtos_snapshot(O) interrupt_ctrl(O) mtdblock mtd_blkdevrtfs nfs_acl nfs lockd grace sunrpc xt_tcpudp ipt_REJECT iptable_filter ip_tables x_tables nf_reject_ipv CPU: 0 PID: 1623 Comm: rmmod Tainted: G O 4.4.193 #1 Hardware name: Hisilicon A15 [] (rtos_unwind_backtrace) from [] (show_stack+0x10/0x14) [] (show_stack) from [] (dump_stack+0xa0/0xd8) [] (dump_stack) from [] (warn_slowpath_common+0x84/0xb0) [] (warn_slowpath_common) from [] (warn_slowpath_fmt+0x3c/0x68) [] (warn_slowpath_fmt) from [] (__free_irq+0xa4/0x2ac) [] (__free_irq) from [] (free_irq+0x60/0x7c) [] (free_irq) from [] (release_nodes+0x1c4/0x1ec) [] (release_nodes) from [] (__device_release_driver+0xa8/0x104) [] (__device_release_driver) from [] (driver_detach+0xd0/0xf8) [] (driver_detach) from [] (bus_remove_driver+0x64/0x8c) [] (bus_remove_driver) from [] (SyS_delete_module+0x198/0x1e0) [] (SyS_delete_module) from [] (__sys_trace_return+0x0/0x10) ---[ end trace bb25d6123d849b44 ]--- Currently "rmmod hip04_eth.ko" call free_irq more than once as devres_release_all and hip04_remove both call free_irq. This results in a 'Trying to free already-free IRQ' warning. To solve the problem free_irq has been moved out of hip04_remove. Signed-off-by: Jiangfeng Xiao Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/hisilicon/hip04_eth.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c index 17cbe8145dcd98..ebc056b9a0fd25 100644 --- a/drivers/net/ethernet/hisilicon/hip04_eth.c +++ b/drivers/net/ethernet/hisilicon/hip04_eth.c @@ -945,7 +945,6 @@ static int hip04_remove(struct platform_device *pdev) hip04_free_ring(ndev, d); unregister_netdev(ndev); - free_irq(ndev->irq, ndev); of_node_put(priv->phy_node); cancel_work_sync(&priv->tx_timeout_task); free_netdev(ndev); From 904e41c939f48a6ab38db3009ac3f59805e56a86 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Wed, 30 Oct 2019 15:32:13 +0000 Subject: [PATCH 094/155] hv_netvsc: Fix error handling in netvsc_attach() [ Upstream commit 719b85c336ed35565d0f3982269d6f684087bb00 ] If rndis_filter_open() fails, we need to remove the rndis device created in earlier steps, before returning an error code. Otherwise, the retry of netvsc_attach() from its callers will fail and hang. Fixes: 7b2ee50c0cd5 ("hv_netvsc: common detach logic") Signed-off-by: Haiyang Zhang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/hyperv/netvsc_drv.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 33c1f6548fb795..5a44b979526690 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -969,7 +969,7 @@ static int netvsc_attach(struct net_device *ndev, if (netif_running(ndev)) { ret = rndis_filter_open(nvdev); if (ret) - return ret; + goto err; rdev = nvdev->extension; if (!rdev->link_state) @@ -977,6 +977,13 @@ static int netvsc_attach(struct net_device *ndev, } return 0; + +err: + netif_device_detach(ndev); + + rndis_filter_device_remove(hdev, nvdev); + + return ret; } static int netvsc_set_channels(struct net_device *net, From 640b29e01858c3cf28c04ca3eaee5380c6e7f6b1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 31 Oct 2019 18:40:32 -0400 Subject: [PATCH 095/155] NFSv4: Don't allow a cached open with a revoked delegation [ Upstream commit be3df3dd4c70ee020587a943a31b98a0fb4b6424 ] If the delegation is marked as being revoked, we must not use it for cached opens. 
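The shape of this fix is a classic one: replace scattered flag checks with a single accessor that refuses to hand out the object unless it is still usable. A minimal user-space sketch of the pattern follows; the struct and field names are simplified stand-ins, not the NFS client's real types:

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

/* Simplified stand-in for struct nfs_delegation and its state bits. */
struct delegation {
	bool revoked;    /* server revoked the delegation */
	bool returning;  /* delegation return already in progress */
};

/*
 * Central validity check: callers receive NULL instead of a stale
 * object, so a revoked delegation can never satisfy a cached open.
 */
static struct delegation *get_valid_delegation(struct delegation *d)
{
	if (d == NULL || d->revoked || d->returning)
		return NULL;
	return d;
}

int main(void)
{
	struct delegation d = { .revoked = true, .returning = false };

	if (get_valid_delegation(&d) == NULL)
		printf("revoked delegation ignored; doing a full OPEN\n");
	return 0;
}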
Fixes: 869f9dfa4d6d ("NFSv4: Fix races between nfs_remove_bad_delegation() and delegation return") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin --- fs/nfs/delegation.c | 10 ++++++++++ fs/nfs/delegation.h | 1 + fs/nfs/nfs4proc.c | 7 ++----- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 606dd3871f66b0..61bc0a6ba08b19 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -52,6 +52,16 @@ nfs4_is_valid_delegation(const struct nfs_delegation *delegation, return false; } +struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode) +{ + struct nfs_delegation *delegation; + + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (nfs4_is_valid_delegation(delegation, 0)) + return delegation; + return NULL; +} + static int nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) { diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index ddaf2644cf13a1..df41d16dc6ab43 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -63,6 +63,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid); bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, struct rpc_cred **cred); +struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode); void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); int nfs4_have_delegation(struct inode *inode, fmode_t flags); int nfs4_check_delegation(struct inode *inode, fmode_t flags); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index af062e9f45803f..f1526f65cc580c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1355,8 +1355,6 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode, return 0; if ((delegation->type & fmode) != fmode) return 0; - if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) - return 0; switch (claim) { case NFS4_OPEN_CLAIM_NULL: case NFS4_OPEN_CLAIM_FH: @@ -1615,7 +1613,6 @@ static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmo static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) { struct nfs4_state *state = opendata->state; - struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_delegation *delegation; int open_mode = opendata->o_arg.open_flags; fmode_t fmode = opendata->o_arg.fmode; @@ -1632,7 +1629,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) } spin_unlock(&state->owner->so_lock); rcu_read_lock(); - delegation = rcu_dereference(nfsi->delegation); + delegation = nfs4_get_valid_delegation(state->inode); if (!can_open_delegated(delegation, fmode, claim)) { rcu_read_unlock(); break; @@ -2153,7 +2150,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) data->o_arg.open_flags, claim)) goto out_no_action; rcu_read_lock(); - delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); + delegation = nfs4_get_valid_delegation(data->state->inode); if (can_open_delegated(delegation, data->o_arg.fmode, claim)) goto unlock_no_action; rcu_read_unlock(); From 93905fb6dc7efbec5f4c278da5ee7bb062657d4a Mon Sep 17 00:00:00 2001 From: Chuhong Yuan Date: Fri, 1 Nov 2019 20:17:25 +0800 Subject: [PATCH 096/155] net: ethernet: arc: add the missed clk_disable_unprepare [ Upstream commit 4202e219edd6cc164c042e16fa327525410705ae ] The remove misses 
to disable and unprepare priv->macclk like what is done when probe fails. Add the missed call in remove. Signed-off-by: Chuhong Yuan Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/arc/emac_rockchip.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/arc/emac_rockchip.c b/drivers/net/ethernet/arc/emac_rockchip.c index c770ca37c9b21e..a7d30731d376f1 100644 --- a/drivers/net/ethernet/arc/emac_rockchip.c +++ b/drivers/net/ethernet/arc/emac_rockchip.c @@ -261,6 +261,9 @@ static int emac_rockchip_remove(struct platform_device *pdev) if (priv->regulator) regulator_disable(priv->regulator); + if (priv->soc_data->need_div_macclk) + clk_disable_unprepare(priv->macclk); + free_netdev(ndev); return err; } From 4701d19f3e40813f4afbab27a379c2f62aa04cdf Mon Sep 17 00:00:00 2001 From: Manfred Rudigier Date: Thu, 15 Aug 2019 13:55:20 -0700 Subject: [PATCH 097/155] igb: Fix constant media auto sense switching when no cable is connected [ Upstream commit 8d5cfd7f76a2414e23c74bb8858af7540365d985 ] At least on the i350 there is an annoying behavior that is maybe also present on 82580 devices, but was probably not noticed yet as MAS is not widely used. If no cable is connected on both fiber/copper ports the media auto sense code will constantly swap between them as part of the watchdog task and produce many unnecessary kernel log messages. The swap code responsible for this behavior (switching to fiber) should not be executed if the current media type is copper and there is no signal detected on the fiber port. In this case we can safely wait until the AUTOSENSE_EN bit is cleared. Signed-off-by: Manfred Rudigier Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/igb/igb_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 71b235f935d940..9c7e75b3b6c7a6 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -1680,7 +1680,8 @@ static void igb_check_swap_media(struct igb_adapter *adapter) if ((hw->phy.media_type == e1000_media_type_copper) && (!(connsw & E1000_CONNSW_AUTOSENSE_EN))) { swap_now = true; - } else if (!(connsw & E1000_CONNSW_SERDESD)) { + } else if ((hw->phy.media_type != e1000_media_type_copper) && + !(connsw & E1000_CONNSW_SERDESD)) { /* copper signal takes time to appear */ if (adapter->copper_tries < 4) { adapter->copper_tries++; From f69f2d6119535c1f91ef37d0704d65fafee56ae0 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Mon, 12 Aug 2019 00:59:21 -0500 Subject: [PATCH 098/155] e1000: fix memory leaks [ Upstream commit 8472ba62154058b64ebb83d5f57259a352d28697 ] In e1000_set_ringparam(), 'tx_old' and 'rx_old' are not deallocated if e1000_up() fails, leading to memory leaks. Refactor the code to fix this issue. 
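The essence of the refactor is to record the bring-up error, fall through to the common path that always frees the old rings, and only then propagate the error. Below is a hedged user-space sketch of that control flow; every name in it is invented for illustration rather than taken from the driver:

#include <stdio.h>
#include <stdlib.h>

struct ring { int count; };

/* Pretend bring-up fails, as when e1000_up() returns an error. */
static int bring_up(void) { return -1; }

static int set_ringparam(struct ring **cur, int new_count, int running)
{
	struct ring *old = *cur;
	struct ring *new_ring = calloc(1, sizeof(*new_ring));
	int err = 0;

	if (new_ring == NULL)
		return -1;
	new_ring->count = new_count;
	*cur = new_ring;

	if (running)
		err = bring_up(); /* may fail; old ring must still be freed */

	free(old);  /* common exit: old resources are released either way */
	return err; /* bring-up failure is propagated, not swallowed */
}

int main(void)
{
	struct ring *r = calloc(1, sizeof(*r));

	printf("set_ringparam -> %d\n", set_ringparam(&r, 256, 1));
	free(r);
	return 0;
}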
Signed-off-by: Wenwen Wang Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/e1000/e1000_ethtool.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000/e1000_ethtool.c b/drivers/net/ethernet/intel/e1000/e1000_ethtool.c index 10df2d60c1814b..88b34f72233752 100644 --- a/drivers/net/ethernet/intel/e1000/e1000_ethtool.c +++ b/drivers/net/ethernet/intel/e1000/e1000_ethtool.c @@ -627,6 +627,7 @@ static int e1000_set_ringparam(struct net_device *netdev, for (i = 0; i < adapter->num_rx_queues; i++) rxdr[i].count = rxdr->count; + err = 0; if (netif_running(adapter->netdev)) { /* Try to get new resources before deleting old */ err = e1000_setup_all_rx_resources(adapter); @@ -647,14 +648,13 @@ static int e1000_set_ringparam(struct net_device *netdev, adapter->rx_ring = rxdr; adapter->tx_ring = txdr; err = e1000_up(adapter); - if (err) - goto err_setup; } kfree(tx_old); kfree(rx_old); clear_bit(__E1000_RESETTING, &adapter->flags); - return 0; + return err; + err_setup_tx: e1000_free_all_rx_resources(adapter); err_setup_rx: @@ -666,7 +666,6 @@ static int e1000_set_ringparam(struct net_device *netdev, err_alloc_tx: if (netif_running(adapter->netdev)) e1000_up(adapter); -err_setup: clear_bit(__E1000_RESETTING, &adapter->flags); return err; } From fe241fe90ac12bc7fa52d9bded4b253c4ee4a27d Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Thu, 1 Mar 2018 13:59:28 +0800 Subject: [PATCH 099/155] x86/apic: Move pending interrupt check code into it's own function [ Upstream commit 9b217f33017715903d0956dfc58f82d2a2d00e63 ] The pending interrupt check code is mixed with the local APIC setup code, that looks messy. Extract the related code, move it into a new function named apic_pending_intr_clear(). Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Reviewed-by: Andy Shevchenko Cc: bhe@redhat.com Cc: ebiederm@xmission.com Link: https://lkml.kernel.org/r/20180301055930.2396-2-douly.fnst@cn.fujitsu.com Signed-off-by: Sasha Levin --- arch/x86/kernel/apic/apic.c | 100 ++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 45 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ea2de324ab0216..98fecdbec64024 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1362,6 +1362,56 @@ static void lapic_setup_esr(void) oldvalue, value); } +static void apic_pending_intr_clear(void) +{ + long long max_loops = cpu_khz ? cpu_khz : 1000000; + unsigned long long tsc = 0, ntsc; + unsigned int value, queued; + int i, j, acked = 0; + + if (boot_cpu_has(X86_FEATURE_TSC)) + tsc = rdtsc(); + /* + * After a crash, we no longer service the interrupts and a pending + * interrupt from previous kernel might still have ISR bit set. + * + * Most probably by now CPU has serviced that pending interrupt and + * it might not have done the ack_APIC_irq() because it thought, + * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it + * does not clear the ISR bit and cpu thinks it has already serivced + * the interrupt. Hence a vector might get locked. It was noticed + * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. 
+	 */
+	do {
+		queued = 0;
+		for (i = APIC_ISR_NR - 1; i >= 0; i--)
+			queued |= apic_read(APIC_IRR + i*0x10);
+
+		for (i = APIC_ISR_NR - 1; i >= 0; i--) {
+			value = apic_read(APIC_ISR + i*0x10);
+			for (j = 31; j >= 0; j--) {
+				if (value & (1<<j)) {
+					ack_APIC_irq();
+					acked++;
+				}
+			}
+		}
+		if (acked > 256) {
+			printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n",
+			       acked);
+			break;
+		}
+		if (queued) {
+			if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) {
+				ntsc = rdtsc();
+				max_loops = (cpu_khz << 10) - (ntsc - tsc);
+			} else
+				max_loops--;
+		}
+	} while (queued && max_loops > 0);
+	WARN_ON(max_loops <= 0);
+}
+
 /**
  * setup_local_APIC - setup the local APIC
  *
@@ -1371,13 +1421,11 @@ static void lapic_setup_esr(void)
 void setup_local_APIC(void)
 {
 	int cpu = smp_processor_id();
-	unsigned int value, queued;
-	int i, j, acked = 0;
-	unsigned long long tsc = 0, ntsc;
-	long long max_loops = cpu_khz ? cpu_khz : 1000000;
+	unsigned int value;
+#ifdef CONFIG_X86_32
+	int i;
+#endif

-	if (boot_cpu_has(X86_FEATURE_TSC))
-		tsc = rdtsc();

 	if (disable_apic) {
 		disable_ioapic_support();
@@ -1437,45 +1485,7 @@ void setup_local_APIC(void)
 	value &= ~APIC_TPRI_MASK;
 	apic_write(APIC_TASKPRI, value);

-	/*
-	 * After a crash, we no longer service the interrupts and a pending
-	 * interrupt from previous kernel might still have ISR bit set.
-	 *
-	 * Most probably by now CPU has serviced that pending interrupt and
-	 * it might not have done the ack_APIC_irq() because it thought,
-	 * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it
-	 * does not clear the ISR bit and cpu thinks it has already serivced
-	 * the interrupt. Hence a vector might get locked. It was noticed
-	 * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
-	 */
-	do {
-		queued = 0;
-		for (i = APIC_ISR_NR - 1; i >= 0; i--)
-			queued |= apic_read(APIC_IRR + i*0x10);
-
-		for (i = APIC_ISR_NR - 1; i >= 0; i--) {
-			value = apic_read(APIC_ISR + i*0x10);
-			for (j = 31; j >= 0; j--) {
-				if (value & (1<<j)) {
-					ack_APIC_irq();
-					acked++;
-				}
-			}
-		}
-		if (acked > 256) {
-			printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n",
-			       acked);
-			break;
-		}
-		if (queued) {
-			if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) {
-				ntsc = rdtsc();
-				max_loops = (cpu_khz << 10) - (ntsc - tsc);
-			} else
-				max_loops--;
-		}
-	} while (queued && max_loops > 0);
-	WARN_ON(max_loops <= 0);
+	apic_pending_intr_clear();

 	/*
 	 * Now that we are all set up, enable the APIC

From 7f2af6c87ee70baa1d755c13f331bfcd546170f1 Mon Sep 17 00:00:00 2001
From: Dou Liyang
Date: Thu, 1 Mar 2018 13:59:30 +0800
Subject: [PATCH 100/155] x86/apic: Drop logical_smp_processor_id() inline

[ Upstream commit 8f1561680f42a5491b371b513f1ab8197f31fd62 ]

The logical_smp_processor_id() inline which is only called in
setup_local_APIC() on x86_32 systems has no real value.
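As an aside on PATCH 099 above: the drain loop that apic_pending_intr_clear() factors out can be modelled in user space. The sketch below is illustrative only; isr[] and eoi() stand in for APIC register reads and ack_APIC_irq(), and the 256-ack bound mirrors the kernel's sanity limit:

#include <stdio.h>

#define ISR_NR 8  /* eight 32-bit ISR banks, as on x86 */

/* Pretend two stale vectors were left set by a crashed kernel. */
static unsigned int isr[ISR_NR] = { 0, 0x80000002u, 0, 0, 0, 0, 0, 0 };

static void eoi(int bank, int bit)
{
	isr[bank] &= ~(1u << bit); /* simulates ack_APIC_irq() */
}

int main(void)
{
	int acked = 0;

	/* Walk the banks high to low, issuing one EOI per set bit. */
	for (int i = ISR_NR - 1; i >= 0; i--) {
		for (int j = 31; j >= 0; j--) {
			if (isr[i] & (1u << j)) {
				eoi(i, j);
				acked++;
			}
		}
	}
	if (acked > 256)
		fprintf(stderr, "pending interrupts after %d EOI\n", acked);
	printf("acked %d stale vectors\n", acked);
	return 0;
}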
Drop it and directly use GET_APIC_LOGICAL_ID() at the call site and use a more suitable variable name for readability Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Cc: andy.shevchenko@gmail.com Cc: bhe@redhat.com Cc: ebiederm@xmission.com Link: https://lkml.kernel.org/r/20180301055930.2396-4-douly.fnst@cn.fujitsu.com Signed-off-by: Sasha Levin --- arch/x86/include/asm/smp.h | 10 ---------- arch/x86/kernel/apic/apic.c | 10 +++++----- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index fe2ee61880a86a..a14730ca9d1e2e 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -175,16 +175,6 @@ static inline int wbinvd_on_all_cpus(void) extern unsigned disabled_cpus; #ifdef CONFIG_X86_LOCAL_APIC - -#ifndef CONFIG_X86_64 -static inline int logical_smp_processor_id(void) -{ - /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); -} - -#endif - extern int hard_smp_processor_id(void); #else /* CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 98fecdbec64024..97d1290d1f0d82 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1423,7 +1423,7 @@ void setup_local_APIC(void) int cpu = smp_processor_id(); unsigned int value; #ifdef CONFIG_X86_32 - int i; + int logical_apicid, ldr_apicid; #endif @@ -1470,11 +1470,11 @@ void setup_local_APIC(void) * initialized during get_smp_config(), make sure it matches the * actual value. */ - i = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - WARN_ON(i != BAD_APICID && i != logical_smp_processor_id()); + logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); + WARN_ON(logical_apicid != BAD_APICID && logical_apicid != ldr_apicid); /* always use the value from LDR */ - early_per_cpu(x86_cpu_to_logical_apicid, cpu) = - logical_smp_processor_id(); + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid; #endif /* From 5bbc8cba1d1729b9427656bb134943c3b868a3a9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 29 Oct 2019 10:34:19 +0100 Subject: [PATCH 101/155] x86/apic/32: Avoid bogus LDR warnings [ Upstream commit fe6f85ca121e9c74e7490fe66b0c5aae38e332c3 ] The removal of the LDR initialization in the bigsmp_32 APIC code unearthed a problem in setup_local_APIC(). The code checks unconditionally for a mismatch of the logical APIC id by comparing the early APIC id which was initialized in get_smp_config() with the actual LDR value in the APIC. Due to the removal of the bogus LDR initialization the check now can trigger on bigsmp_32 APIC systems emitting a warning for every booting CPU. This is of course a false positive because the APIC is not using logical destination mode. Restrict the check and the possibly resulting fixup to systems which are actually using the APIC in logical destination mode. 
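The guarded check can be sketched in isolation. In this hedged model, check_ldr() is an invented helper, not the kernel's API: the comparison and fixup only run when the APIC is in logical destination mode, so bigsmp-style physical-mode systems never see the false warning:

#include <stdio.h>
#include <stdbool.h>

#define BAD_APICID 0xFFFFu

static unsigned int check_ldr(bool dest_logical, unsigned int cached,
			      unsigned int ldr)
{
	if (!dest_logical)
		return cached; /* physical mode (bigsmp): nothing to check */

	if (cached != BAD_APICID && cached != ldr)
		fprintf(stderr, "warning: logical APIC id mismatch\n");
	return ldr; /* always trust the value read back from the LDR */
}

int main(void)
{
	/* bigsmp-style physical mode: no warning although the ids differ */
	printf("physical: 0x%x\n", check_ldr(false, 0x2, 0x4));
	/* logical mode: mismatch is reported and fixed up from the LDR */
	printf("logical:  0x%x\n", check_ldr(true, 0x2, 0x4));
	return 0;
}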
[ tglx: Massaged changelog and added Cc stable ]

Fixes: bae3a8d3308 ("x86/apic: Do not initialize LDR and DFR for bigsmp")
Signed-off-by: Jan Beulich
Signed-off-by: Thomas Gleixner
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/666d8f91-b5a8-1afd-7add-821e72a35f03@suse.com
Signed-off-by: Sasha Levin
---
 arch/x86/kernel/apic/apic.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 97d1290d1f0d82..6415b4aead5460 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1422,9 +1422,6 @@ void setup_local_APIC(void)
 {
 	int cpu = smp_processor_id();
 	unsigned int value;
-#ifdef CONFIG_X86_32
-	int logical_apicid, ldr_apicid;
-#endif

 	if (disable_apic) {
@@ -1465,16 +1462,21 @@ void setup_local_APIC(void)
 	apic->init_apic_ldr();

 #ifdef CONFIG_X86_32
-	/*
-	 * APIC LDR is initialized. If logical_apicid mapping was
-	 * initialized during get_smp_config(), make sure it matches the
-	 * actual value.
-	 */
-	logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
-	ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
-	WARN_ON(logical_apicid != BAD_APICID && logical_apicid != ldr_apicid);
-	/* always use the value from LDR */
-	early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid;
+	if (apic->dest_logical) {
+		int logical_apicid, ldr_apicid;
+
+		/*
+		 * APIC LDR is initialized. If logical_apicid mapping was
+		 * initialized during get_smp_config(), make sure it matches
+		 * the actual value.
+		 */
+		logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
+		ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
+		if (logical_apicid != BAD_APICID)
+			WARN_ON(logical_apicid != ldr_apicid);
+		/* Always use the value from LDR. */
+		early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid;
+	}
 #endif

 	/*

From 665b71ac70e2320634059118221f2e949f21609d Mon Sep 17 00:00:00 2001
From: Joakim Zhang
Date: Thu, 15 Aug 2019 08:00:26 +0000
Subject: [PATCH 102/155] can: flexcan: disable completely the ECC mechanism

[ Upstream commit 5e269324db5adb2f5f6ec9a93a9c7b0672932b47 ]

The ECC (memory error detection and correction) mechanism can be
activated or not, controlled by the ECCDIS bit in CAN_MECR. When
disabled, updates on indications and reporting registers are stopped.
So if we want to disable ECC completely, we had better assert the
ECCDIS bit, not just mask the related interrupts.

Fixes: cdce844865be ("can: flexcan: add vf610 support for FlexCAN")
Signed-off-by: Joakim Zhang
Cc: linux-stable
Signed-off-by: Marc Kleine-Budde
Signed-off-by: Sasha Levin
---
 drivers/net/can/flexcan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c
index 7280f3a8aa041a..84dd79041285a0 100644
--- a/drivers/net/can/flexcan.c
+++ b/drivers/net/can/flexcan.c
@@ -1018,6 +1018,7 @@ static int flexcan_chip_start(struct net_device *dev)
 		reg_mecr = flexcan_read(&regs->mecr);
 		reg_mecr &= ~FLEXCAN_MECR_ECRWRDIS;
 		flexcan_write(reg_mecr, &regs->mecr);
+		reg_mecr |= FLEXCAN_MECR_ECCDIS;
 		reg_mecr &= ~(FLEXCAN_MECR_NCEFAFRZ | FLEXCAN_MECR_HANCEI_MSK |
 			      FLEXCAN_MECR_FANCEI_MSK);
 		flexcan_write(reg_mecr, &regs->mecr);

From 42364a188ba5225cffdeee6c17c7f9af1a909cbd Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Mon, 23 Sep 2019 15:34:45 -0700
Subject: [PATCH 103/155] mm/filemap.c: don't initiate writeback if mapping has no dirty pages

commit c3aab9a0bd91b696a852169479b7db1ece6cbf8c upstream.
Functions like filemap_write_and_wait_range() should do nothing if inode has no dirty pages or pages currently under writeback. But they anyway construct struct writeback_control and this does some atomic operations if CONFIG_CGROUP_WRITEBACK=y - on fast path it locks inode->i_lock and updates state of writeback ownership, on slow path might be more work. Current this path is safely avoided only when inode mapping has no pages. For example generic_file_read_iter() calls filemap_write_and_wait_range() at each O_DIRECT read - pretty hot path. This patch skips starting new writeback if mapping has no dirty tags set. If writeback is already in progress filemap_write_and_wait_range() will wait for it. Link: http://lkml.kernel.org/r/156378816804.1087.8607636317907921438.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Jan Kara Cc: Tejun Heo Cc: Jens Axboe Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/filemap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 938365ad7e99aa..a30dbf93de9923 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -338,7 +338,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, .range_end = end, }; - if (!mapping_cap_writeback_dirty(mapping)) + if (!mapping_cap_writeback_dirty(mapping) || + !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; wbc_attach_fdatawrite_inode(&wbc, mapping->host); From d527ab46d14451ec6a6257f706dccac3a3089133 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 Nov 2019 12:18:29 -0800 Subject: [PATCH 104/155] cgroup,writeback: don't switch wbs immediately on dead wbs if the memcg is dead commit 65de03e251382306a4575b1779c57c87889eee49 upstream. cgroup writeback tries to refresh the associated wb immediately if the current wb is dead. This is to avoid keeping issuing IOs on the stale wb after memcg - blkcg association has changed (ie. when blkcg got disabled / enabled higher up in the hierarchy). Unfortunately, the logic gets triggered spuriously on inodes which are associated with dead cgroups. When the logic is triggered on dead cgroups, the attempt fails only after doing quite a bit of work allocating and initializing a new wb. While c3aab9a0bd91 ("mm/filemap.c: don't initiate writeback if mapping has no dirty pages") alleviated the issue significantly as it now only triggers when the inode has dirty pages. However, the condition can still be triggered before the inode is switched to a different cgroup and the logic simply doesn't make sense. Skip the immediate switching if the associated memcg is dying. This is a simplified version of the following two patches: * https://lore.kernel.org/linux-mm/20190513183053.GA73423@dennisz-mbp/ * http://lkml.kernel.org/r/156355839560.2063.5265687291430814589.stgit@buzz Cc: Konstantin Khlebnikov Fixes: e8a7abf5a5bd ("writeback: disassociate inodes from dying bdi_writebacks") Acked-by: Dennis Zhou Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/fs-writeback.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 9e8fde348d6147..6398bd8a066eb1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -582,10 +582,13 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, spin_unlock(&inode->i_lock); /* - * A dying wb indicates that the memcg-blkcg mapping has changed - * and a new wb is already serving the memcg. 
Switch immediately. + * A dying wb indicates that either the blkcg associated with the + * memcg changed or the associated memcg is dying. In the first + * case, a replacement wb should already be available and we should + * refresh the wb immediately. In the second case, trying to + * refresh will keep failing. */ - if (unlikely(wb_dying(wbc->wb))) + if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css))) inode_switch_wbs(inode, wbc->wb_id); } From 852a503230d9acc386a4c7b4e4b5c252f62c90c9 Mon Sep 17 00:00:00 2001 From: Suwan Kim Date: Tue, 22 Oct 2019 18:30:17 +0900 Subject: [PATCH 105/155] usbip: Fix free of unallocated memory in vhci tx [ Upstream commit d4d8257754c3300ea2a465dadf8d2b02c713c920 ] iso_buffer should be set to NULL after use and free in the while loop. In the case of isochronous URB in the while loop, iso_buffer is allocated and after sending it to server, buffer is deallocated. And then, if the next URB in the while loop is not a isochronous pipe, iso_buffer still holds the previously deallocated buffer address and kfree tries to free wrong buffer address. Fixes: ea44d190764b ("usbip: Implement SG support to vhci-hcd and stub driver") Reported-by: kbuild test robot Reported-by: Julia Lawall Signed-off-by: Suwan Kim Reviewed-by: Julia Lawall Acked-by: Shuah Khan Link: https://lore.kernel.org/r/20191022093017.8027-1-suwan.kim027@gmail.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/usbip/vhci_tx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/usbip/vhci_tx.c b/drivers/usb/usbip/vhci_tx.c index 93c139d884f342..682127d258fdd5 100644 --- a/drivers/usb/usbip/vhci_tx.c +++ b/drivers/usb/usbip/vhci_tx.c @@ -161,7 +161,10 @@ static int vhci_send_cmd_submit(struct vhci_device *vdev) } kfree(iov); + /* This is only for isochronous case */ kfree(iso_buffer); + iso_buffer = NULL; + usbip_dbg_vhci_tx("send txdata\n"); total_size += txsize; From 47598ed4ad3cbf2102fe685e516dba1d61ad252d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Nov 2019 21:38:43 -0800 Subject: [PATCH 106/155] net: prevent load/store tearing on sk->sk_stamp [ Upstream commit f75359f3ac855940c5718af10ba089b8977bf339 ] Add a couple of READ_ONCE() and WRITE_ONCE() to prevent load-tearing and store-tearing in sock_read_timestamp() and sock_write_timestamp() This might prevent another KCSAN report. Fixes: 3a0ed3e96197 ("sock: Make sock->sk_stamp thread-safe") Signed-off-by: Eric Dumazet Cc: Deepa Dinamani Acked-by: Deepa Dinamani Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/net/sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 7ec4d0bd8d12fb..780c6c0a86f04d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2229,7 +2229,7 @@ static inline ktime_t sock_read_timestamp(struct sock *sk) return kt; #else - return sk->sk_stamp; + return READ_ONCE(sk->sk_stamp); #endif } @@ -2240,7 +2240,7 @@ static inline void sock_write_timestamp(struct sock *sk, ktime_t kt) sk->sk_stamp = kt; write_sequnlock(&sk->sk_stamp_seq); #else - sk->sk_stamp = kt; + WRITE_ONCE(sk->sk_stamp, kt); #endif } From b8fa42e6cfbf8220b90d054af7ebb1de250ea9ea Mon Sep 17 00:00:00 2001 From: Jon Bloomfield Date: Thu, 12 Jul 2018 19:53:10 +0100 Subject: [PATCH 107/155] drm/i915/gtt: Add read only pages to gen8_pte_encode commit 25dda4dabeeb12af5209b0183c788ef2a88dabbe upstream. 
We can set a bit inside the ppGTT PTE to indicate a page is read-only; writes from the GPU will be discarded. We can use this to protect pages and in particular support read-only userptr mappings (necessary for importing PROT_READ vma). Signed-off-by: Jon Bloomfield Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: Matthew Auld Reviewed-by: Joonas Lahtinen Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20180712185315.3288-1-chris@chris-wilson.co.uk Signed-off-by: Rodrigo Vivi Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_gem_gtt.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index ad524cb0f6fc5a..d8d6632abe77ef 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -223,10 +223,13 @@ static void ppgtt_unbind_vma(struct i915_vma *vma) } static gen8_pte_t gen8_pte_encode(dma_addr_t addr, - enum i915_cache_level level) + enum i915_cache_level level, + u32 flags) { - gen8_pte_t pte = _PAGE_PRESENT | _PAGE_RW; - pte |= addr; + gen8_pte_t pte = addr | _PAGE_PRESENT | _PAGE_RW; + + if (unlikely(flags & PTE_READ_ONLY)) + pte &= ~_PAGE_RW; switch (level) { case I915_CACHE_NONE: @@ -487,7 +490,7 @@ static void gen8_initialize_pt(struct i915_address_space *vm, struct i915_page_table *pt) { fill_px(vm, pt, - gen8_pte_encode(vm->scratch_page.daddr, I915_CACHE_LLC)); + gen8_pte_encode(vm->scratch_page.daddr, I915_CACHE_LLC, 0)); } static void gen6_initialize_pt(struct i915_address_space *vm, @@ -691,7 +694,7 @@ static bool gen8_ppgtt_clear_pt(struct i915_address_space *vm, unsigned int pte = gen8_pte_index(start); unsigned int pte_end = pte + num_entries; const gen8_pte_t scratch_pte = - gen8_pte_encode(vm->scratch_page.daddr, I915_CACHE_LLC); + gen8_pte_encode(vm->scratch_page.daddr, I915_CACHE_LLC, 0); gen8_pte_t *vaddr; GEM_BUG_ON(num_entries > pt->used_ptes); @@ -866,7 +869,7 @@ gen8_ppgtt_insert_pte_entries(struct i915_hw_ppgtt *ppgtt, enum i915_cache_level cache_level) { struct i915_page_directory *pd; - const gen8_pte_t pte_encode = gen8_pte_encode(0, cache_level); + const gen8_pte_t pte_encode = gen8_pte_encode(0, cache_level, 0); gen8_pte_t *vaddr; bool ret; @@ -1264,7 +1267,7 @@ static void gen8_dump_ppgtt(struct i915_hw_ppgtt *ppgtt, struct seq_file *m) { struct i915_address_space *vm = &ppgtt->base; const gen8_pte_t scratch_pte = - gen8_pte_encode(vm->scratch_page.daddr, I915_CACHE_LLC); + gen8_pte_encode(vm->scratch_page.daddr, I915_CACHE_LLC, 0); u64 start = 0, length = ppgtt->base.total; if (use_4lvl(vm)) { @@ -2078,7 +2081,7 @@ static void gen8_ggtt_insert_page(struct i915_address_space *vm, gen8_pte_t __iomem *pte = (gen8_pte_t __iomem *)ggtt->gsm + (offset >> PAGE_SHIFT); - gen8_set_pte(pte, gen8_pte_encode(addr, level)); + gen8_set_pte(pte, gen8_pte_encode(addr, level, 0)); ggtt->invalidate(vm->i915); } @@ -2091,7 +2094,7 @@ static void gen8_ggtt_insert_entries(struct i915_address_space *vm, struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm); struct sgt_iter sgt_iter; gen8_pte_t __iomem *gtt_entries; - const gen8_pte_t pte_encode = gen8_pte_encode(0, level); + const gen8_pte_t pte_encode = gen8_pte_encode(0, level, 0); dma_addr_t addr; gtt_entries = (gen8_pte_t __iomem *)ggtt->gsm; @@ -2162,7 +2165,7 @@ static void gen8_ggtt_clear_range(struct i915_address_space *vm, unsigned first_entry = start >> PAGE_SHIFT; unsigned num_entries = length >> PAGE_SHIFT; const gen8_pte_t scratch_pte 
= - gen8_pte_encode(vm->scratch_page.daddr, I915_CACHE_LLC); + gen8_pte_encode(vm->scratch_page.daddr, I915_CACHE_LLC, 0); gen8_pte_t __iomem *gtt_base = (gen8_pte_t __iomem *)ggtt->gsm + first_entry; const int max_entries = ggtt_total_entries(ggtt) - first_entry; From 0a41eb61a26b19fa7172c41410126ec1058769e1 Mon Sep 17 00:00:00 2001 From: "Vivi, Rodrigo" Date: Mon, 6 Aug 2018 14:10:48 -0700 Subject: [PATCH 108/155] drm/i915/gtt: Read-only pages for insert_entries on bdw+ commit 250f8c8140ac0a5e5acb91891d6813f12778b224 upstream. Hook up the flags to allow read-only ppGTT mappings for gen8+ v2: Include a selftest to check that writes to a readonly PTE are dropped v3: Don't duplicate cpu_check() as we can just reuse it, and even worse don't wholesale copy the theory-of-operation comment from igt_ctx_exec without changing it to explain the intention behind the new test! v4: Joonas really likes magic mystery values Signed-off-by: Jon Bloomfield Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: Matthew Auld Reviewed-by: Joonas Lahtinen Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20180712185315.3288-2-chris@chris-wilson.co.uk Signed-off-by: Rodrigo Vivi Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_gem_gtt.c | 37 ++++++++++++++++--------- drivers/gpu/drm/i915/i915_gem_gtt.h | 7 ++++- drivers/gpu/drm/i915/intel_ringbuffer.c | 11 ++++++-- 3 files changed, 38 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index d8d6632abe77ef..bc79d952b81a4c 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -207,7 +207,7 @@ static int ppgtt_bind_vma(struct i915_vma *vma, vma->pages = vma->obj->mm.pages; - /* Currently applicable only to VLV */ + /* Applicable to VLV, and gen8+ */ pte_flags = 0; if (vma->obj->gt_ro) pte_flags |= PTE_READ_ONLY; @@ -866,10 +866,11 @@ gen8_ppgtt_insert_pte_entries(struct i915_hw_ppgtt *ppgtt, struct i915_page_directory_pointer *pdp, struct sgt_dma *iter, struct gen8_insert_pte *idx, - enum i915_cache_level cache_level) + enum i915_cache_level cache_level, + u32 flags) { struct i915_page_directory *pd; - const gen8_pte_t pte_encode = gen8_pte_encode(0, cache_level, 0); + const gen8_pte_t pte_encode = gen8_pte_encode(0, cache_level, flags); gen8_pte_t *vaddr; bool ret; @@ -920,20 +921,20 @@ gen8_ppgtt_insert_pte_entries(struct i915_hw_ppgtt *ppgtt, static void gen8_ppgtt_insert_3lvl(struct i915_address_space *vm, struct i915_vma *vma, enum i915_cache_level cache_level, - u32 unused) + u32 flags) { struct i915_hw_ppgtt *ppgtt = i915_vm_to_ppgtt(vm); struct sgt_dma iter = sgt_dma(vma); struct gen8_insert_pte idx = gen8_insert_pte(vma->node.start); gen8_ppgtt_insert_pte_entries(ppgtt, &ppgtt->pdp, &iter, &idx, - cache_level); + cache_level, flags); } static void gen8_ppgtt_insert_4lvl(struct i915_address_space *vm, struct i915_vma *vma, enum i915_cache_level cache_level, - u32 unused) + u32 flags) { struct i915_hw_ppgtt *ppgtt = i915_vm_to_ppgtt(vm); struct sgt_dma iter = sgt_dma(vma); @@ -941,7 +942,7 @@ static void gen8_ppgtt_insert_4lvl(struct i915_address_space *vm, struct gen8_insert_pte idx = gen8_insert_pte(vma->node.start); while (gen8_ppgtt_insert_pte_entries(ppgtt, pdps[idx.pml4e++], &iter, - &idx, cache_level)) + &idx, cache_level, flags)) GEM_BUG_ON(idx.pml4e >= GEN8_PML4ES_PER_PML4); } @@ -1342,6 +1343,9 @@ static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt) return ret; } + /* From bdw, there is support for 
read-only pages in the PPGTT */ + ppgtt->base.has_read_only = true; + /* There are only few exceptions for gen >=6. chv and bxt. * And we are not sure about the latter so play safe for now. */ @@ -2089,7 +2093,7 @@ static void gen8_ggtt_insert_page(struct i915_address_space *vm, static void gen8_ggtt_insert_entries(struct i915_address_space *vm, struct i915_vma *vma, enum i915_cache_level level, - u32 unused) + u32 flags) { struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm); struct sgt_iter sgt_iter; @@ -2097,6 +2101,9 @@ static void gen8_ggtt_insert_entries(struct i915_address_space *vm, const gen8_pte_t pte_encode = gen8_pte_encode(0, level, 0); dma_addr_t addr; + /* The GTT does not support read-only mappings */ + GEM_BUG_ON(flags & PTE_READ_ONLY); + gtt_entries = (gen8_pte_t __iomem *)ggtt->gsm; gtt_entries += vma->node.start >> PAGE_SHIFT; for_each_sgt_dma(addr, sgt_iter, vma->pages) @@ -2226,13 +2233,14 @@ struct insert_entries { struct i915_address_space *vm; struct i915_vma *vma; enum i915_cache_level level; + u32 flags; }; static int bxt_vtd_ggtt_insert_entries__cb(void *_arg) { struct insert_entries *arg = _arg; - gen8_ggtt_insert_entries(arg->vm, arg->vma, arg->level, 0); + gen8_ggtt_insert_entries(arg->vm, arg->vma, arg->level, arg->flags); bxt_vtd_ggtt_wa(arg->vm); return 0; @@ -2241,9 +2249,9 @@ static int bxt_vtd_ggtt_insert_entries__cb(void *_arg) static void bxt_vtd_ggtt_insert_entries__BKL(struct i915_address_space *vm, struct i915_vma *vma, enum i915_cache_level level, - u32 unused) + u32 flags) { - struct insert_entries arg = { vm, vma, level }; + struct insert_entries arg = { vm, vma, level, flags }; stop_machine(bxt_vtd_ggtt_insert_entries__cb, &arg, NULL); } @@ -2340,7 +2348,7 @@ static int ggtt_bind_vma(struct i915_vma *vma, return ret; } - /* Currently applicable only to VLV */ + /* Applicable to VLV (gen8+ do not support RO in the GGTT) */ pte_flags = 0; if (obj->gt_ro) pte_flags |= PTE_READ_ONLY; @@ -3066,6 +3074,10 @@ int i915_ggtt_init_hw(struct drm_i915_private *dev_priv) */ mutex_lock(&dev_priv->drm.struct_mutex); i915_address_space_init(&ggtt->base, dev_priv, "[global]"); + + /* Only VLV supports read-only GGTT mappings */ + ggtt->base.has_read_only = IS_VALLEYVIEW(dev_priv); + if (!HAS_LLC(dev_priv) && !USES_PPGTT(dev_priv)) ggtt->base.mm.color_adjust = i915_gtt_color_adjust; mutex_unlock(&dev_priv->drm.struct_mutex); @@ -3098,7 +3110,6 @@ int i915_ggtt_enable_hw(struct drm_i915_private *dev_priv) { if (INTEL_GEN(dev_priv) < 6 && !intel_enable_gtt()) return -EIO; - return 0; } diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h index 0dbbe840f5f01c..4b63d6cbd81e29 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.h +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h @@ -295,7 +295,12 @@ struct i915_address_space { struct list_head unbound_list; struct pagevec free_pages; - bool pt_kmap_wc; + + /* Some systems require uncached updates of the page directories */ + bool pt_kmap_wc:1; + + /* Some systems support read-only mappings for GGTT and/or PPGTT */ + bool has_read_only:1; /* FIXME: Need a more generic return type */ gen6_pte_t (*pte_encode)(dma_addr_t addr, diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index cdf084ef5aaeaf..19fe2489065a41 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1358,6 +1358,7 @@ void intel_ring_unpin(struct intel_ring *ring) static struct i915_vma * intel_ring_create_vma(struct drm_i915_private *dev_priv, int 
size) { + struct i915_address_space *vm = &dev_priv->ggtt.base; struct drm_i915_gem_object *obj; struct i915_vma *vma; @@ -1367,10 +1368,14 @@ intel_ring_create_vma(struct drm_i915_private *dev_priv, int size) if (IS_ERR(obj)) return ERR_CAST(obj); - /* mark ring buffers as read-only from GPU side by default */ - obj->gt_ro = 1; + /* + * Mark ring buffers as read-only from GPU side (so no stray overwrites) + * if supported by the platform's GGTT. + */ + if (vm->has_read_only) + obj->gt_ro = 1; - vma = i915_vma_instance(obj, &dev_priv->ggtt.base, NULL); + vma = i915_vma_instance(obj, vm, NULL); if (IS_ERR(vma)) goto err; From 9d41e1d06a46c20ea0ddf25a366192a9f7810f8e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 12 Jul 2018 19:53:12 +0100 Subject: [PATCH 109/155] drm/i915/gtt: Disable read-only support under GVT commit c9e666880de5a1fed04dc412b046916d542b72dd upstream. GVT is not propagating the PTE bits, and is always setting the read-write bit, thus breaking read-only support. Signed-off-by: Chris Wilson Cc: Zhenyu Wang Cc: Jon Bloomfield Cc: Joonas Lahtinen Cc: Matthew Auld Reviewed-by: Jon Bloomfield Link: https://patchwork.freedesktop.org/patch/msgid/20180712185315.3288-3-chris@chris-wilson.co.uk Signed-off-by: Rodrigo Vivi Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_gem_gtt.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index bc79d952b81a4c..920d064e9b53ee 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -1343,8 +1343,12 @@ static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt) return ret; } - /* From bdw, there is support for read-only pages in the PPGTT */ - ppgtt->base.has_read_only = true; + /* + * From bdw, there is support for read-only pages in the PPGTT. + * + * XXX GVT is not honouring the lack of RW in the PTE bits. + */ + ppgtt->base.has_read_only = !intel_vgpu_active(dev_priv); /* There are only few exceptions for gen >=6. chv and bxt. * And we are not sure about the latter so play safe for now. From d24b5d0d6d5f0763431350bf7b2b1a256baec692 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 12 Jul 2018 19:53:13 +0100 Subject: [PATCH 110/155] drm/i915: Prevent writing into a read-only object via a GGTT mmap commit 3e977ac6179b39faa3c0eda5fce4f00663ae298d upstream. If the user has created a read-only object, they should not be allowed to circumvent the write protection by using a GGTT mmapping. Deny it. Also most machines do not support read-only GGTT PTEs, so again we have to reject attempted writes. Fortunately, this is known a priori, so we can at least reject in the call to create the mmap (with a sanity check in the fault handler). v2: Check the vma->vm_flags during mmap() to allow readonly access. 
v3: Remove VM_MAYWRITE to curtail mprotect() Testcase: igt/gem_userptr_blits/readonly_mmap* Signed-off-by: Chris Wilson Cc: Jon Bloomfield Cc: Joonas Lahtinen Cc: Matthew Auld Cc: David Herrmann Reviewed-by: Matthew Auld #v1 Reviewed-by: Jon Bloomfield Reviewed-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20180712185315.3288-4-chris@chris-wilson.co.uk Signed-off-by: Rodrigo Vivi Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/drm_gem.c | 9 +++++++++ drivers/gpu/drm/i915/i915_gem.c | 4 ++++ drivers/gpu/drm/i915/i915_gem_gtt.c | 12 +++++++----- drivers/gpu/drm/i915/i915_gem_object.h | 13 ++++++++++++- drivers/gpu/drm/i915/intel_ringbuffer.c | 2 +- include/drm/drm_vma_manager.h | 1 + 6 files changed, 34 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index c55f338e380b9d..d2c042af36b8d1 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -1035,6 +1035,15 @@ int drm_gem_mmap(struct file *filp, struct vm_area_struct *vma) return -EACCES; } + if (node->readonly) { + if (vma->vm_flags & VM_WRITE) { + drm_gem_object_put_unlocked(obj); + return -EINVAL; + } + + vma->vm_flags &= ~VM_MAYWRITE; + } + ret = drm_gem_mmap_obj(obj, drm_vma_node_size(node) << PAGE_SHIFT, vma); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 727018a16cca24..f95e2c8ac239a7 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1834,6 +1834,10 @@ int i915_gem_fault(struct vm_fault *vmf) unsigned int flags; int ret; + /* Sanity check that we allow writing into this object */ + if (i915_gem_object_is_readonly(obj) && write) + return VM_FAULT_SIGBUS; + /* We don't use vmf->pgoff since that has the fake offset */ page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT; diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index 920d064e9b53ee..6f5e1e18e530af 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -209,7 +209,7 @@ static int ppgtt_bind_vma(struct i915_vma *vma, /* Applicable to VLV, and gen8+ */ pte_flags = 0; - if (vma->obj->gt_ro) + if (i915_gem_object_is_readonly(vma->obj)) pte_flags |= PTE_READ_ONLY; vma->vm->insert_entries(vma->vm, vma, cache_level, pte_flags); @@ -2105,8 +2105,10 @@ static void gen8_ggtt_insert_entries(struct i915_address_space *vm, const gen8_pte_t pte_encode = gen8_pte_encode(0, level, 0); dma_addr_t addr; - /* The GTT does not support read-only mappings */ - GEM_BUG_ON(flags & PTE_READ_ONLY); + /* + * Note that we ignore PTE_READ_ONLY here. The caller must be careful + * not to allow the user to override access to a read only page. 
+ */ gtt_entries = (gen8_pte_t __iomem *)ggtt->gsm; gtt_entries += vma->node.start >> PAGE_SHIFT; @@ -2354,7 +2356,7 @@ static int ggtt_bind_vma(struct i915_vma *vma, /* Applicable to VLV (gen8+ do not support RO in the GGTT) */ pte_flags = 0; - if (obj->gt_ro) + if (i915_gem_object_is_readonly(obj)) pte_flags |= PTE_READ_ONLY; intel_runtime_pm_get(i915); @@ -2396,7 +2398,7 @@ static int aliasing_gtt_bind_vma(struct i915_vma *vma, /* Currently applicable only to VLV */ pte_flags = 0; - if (vma->obj->gt_ro) + if (i915_gem_object_is_readonly(vma->obj)) pte_flags |= PTE_READ_ONLY; if (flags & I915_VMA_LOCAL_BIND) { diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h index c30d8f80818587..39cfe04dcdb8bb 100644 --- a/drivers/gpu/drm/i915/i915_gem_object.h +++ b/drivers/gpu/drm/i915/i915_gem_object.h @@ -140,7 +140,6 @@ struct drm_i915_gem_object { * Is the object to be mapped as read-only to the GPU * Only honoured if hardware has relevant pte bit */ - unsigned long gt_ro:1; unsigned int cache_level:3; unsigned int cache_coherent:2; #define I915_BO_CACHE_COHERENT_FOR_READ BIT(0) @@ -313,6 +312,18 @@ static inline void i915_gem_object_unlock(struct drm_i915_gem_object *obj) reservation_object_unlock(obj->resv); } +static inline void +i915_gem_object_set_readonly(struct drm_i915_gem_object *obj) +{ + obj->base.vma_node.readonly = true; +} + +static inline bool +i915_gem_object_is_readonly(const struct drm_i915_gem_object *obj) +{ + return obj->base.vma_node.readonly; +} + static inline bool i915_gem_object_has_struct_page(const struct drm_i915_gem_object *obj) { diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 19fe2489065a41..63667a5c2c871b 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1373,7 +1373,7 @@ intel_ring_create_vma(struct drm_i915_private *dev_priv, int size) * if supported by the platform's GGTT. */ if (vm->has_read_only) - obj->gt_ro = 1; + i915_gem_object_set_readonly(obj); vma = i915_vma_instance(obj, vm, NULL); if (IS_ERR(vma)) diff --git a/include/drm/drm_vma_manager.h b/include/drm/drm_vma_manager.h index d84d52f6d2b119..b54c98f05460b9 100644 --- a/include/drm/drm_vma_manager.h +++ b/include/drm/drm_vma_manager.h @@ -41,6 +41,7 @@ struct drm_vma_offset_node { rwlock_t vm_lock; struct drm_mm_node vm_node; struct rb_root vm_files; + bool readonly:1; }; struct drm_vma_offset_manager { From f4a8cc1331d00ba40ca3160d2f6e580511d5e7e4 Mon Sep 17 00:00:00 2001 From: Michal Srb Date: Mon, 5 Feb 2018 16:04:37 +0000 Subject: [PATCH 111/155] drm/i915/cmdparser: Check reg_table_count before derefencing. commit b18224e95cb13ef7517aa26e6b47c85117327f11 upstream. The find_reg function was assuming that there is always at least one table in reg_tables. It is not always true. In case of VCS or VECS, the reg_tables is NULL and reg_table_count is 0, implying that no register-accessing commands are allowed. However, the command tables include commands such as MI_STORE_REGISTER_MEM. When trying to check such command, the find_reg would dereference NULL pointer. Now it will just return NULL meaning that the register was not found and the command will be rejected. 
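The do/while versus for distinction is easy to reproduce outside the driver. In the hedged stand-alone sketch below (simplified types, no master-table handling), a do/while would dereference table[0] even for count == 0, while the for loop safely returns NULL for an engine with no register tables:

#include <stdio.h>
#include <stddef.h>

struct reg_table {
	const unsigned int *regs;
	int nregs;
};

static const unsigned int *find_reg(const struct reg_table *table,
				    int count, unsigned int addr)
{
	for (; count > 0; ++table, --count) { /* safe when count == 0 */
		for (int i = 0; i < table->nregs; i++) {
			if (table->regs[i] == addr)
				return &table->regs[i];
		}
	}
	return NULL; /* not whitelisted: the command gets rejected */
}

int main(void)
{
	/* An engine with no tables at all (VCS/VECS) must not crash. */
	if (find_reg(NULL, 0, 0x2358) == NULL)
		printf("register not found, command rejected\n");
	return 0;
}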
Fixes: 76ff480ec963 ("drm/i915/cmdparser: Use binary search for faster register lookup")
Signed-off-by: Michal Srb
Link: https://patchwork.freedesktop.org/patch/msgid/20180205142916.27092-2-msrb@suse.com
Cc: Chris Wilson
Cc: Matthew Auld
Reviewed-by: Chris Wilson
Signed-off-by: Chris Wilson
Link: https://patchwork.freedesktop.org/patch/msgid/20180205160438.3267-1-chris@chris-wilson.co.uk
Signed-off-by: Rodrigo Vivi
Signed-off-by: Greg Kroah-Hartman
---
 drivers/gpu/drm/i915/i915_cmd_parser.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 8ba932b22f7c80..de7ec59433d11a 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -1038,7 +1038,7 @@ find_reg(const struct intel_engine_cs *engine, bool is_master, u32 addr)
 	const struct drm_i915_reg_table *table = engine->reg_tables;
 	int count = engine->reg_table_count;

-	do {
+	for (; count > 0; ++table, --count) {
 		if (!table->master || is_master) {
 			const struct drm_i915_reg_descriptor *reg;

@@ -1046,7 +1046,7 @@ find_reg(const struct intel_engine_cs *engine, bool is_master, u32 addr)
 			if (reg != NULL)
 				return reg;
 		}
-	} while (table++, --count);
+	}

 	return NULL;
 }

From 4ca946ea4e12c27284290dce6a222e91bca67aee Mon Sep 17 00:00:00 2001
From: Michal Srb
Date: Mon, 5 Feb 2018 16:04:38 +0000
Subject: [PATCH 112/155] drm/i915/cmdparser: Do not check past the cmd length.

commit b3ad99ed45917f42884fee731fa3cf9b8229a26c upstream.

The command MEDIA_VFE_STATE checks bits at offset +2 dwords. However, it
is possible to have MEDIA_VFE_STATE command with length = 0 + LENGTH_BIAS
= 2. In that case check_cmd will read bits from the following command, or
even past the end of the buffer.

If the offset ends up outside of the command length, reject the command.

Fixes: 351e3db2b363 ("drm/i915: Implement command buffer parsing logic")
Signed-off-by: Michal Srb
Link: https://patchwork.freedesktop.org/patch/msgid/20180205151745.29292-1-msrb@suse.com
Reviewed-by: Chris Wilson
Signed-off-by: Chris Wilson
Link: https://patchwork.freedesktop.org/patch/msgid/20180205160438.3267-2-chris@chris-wilson.co.uk
Signed-off-by: Rodrigo Vivi
Signed-off-by: Greg Kroah-Hartman
---
 drivers/gpu/drm/i915/i915_cmd_parser.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index de7ec59433d11a..ef7ad016d67c27 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -1218,6 +1218,12 @@ static bool check_cmd(const struct intel_engine_cs *engine,
 				continue;
 			}

+			if (desc->bits[i].offset >= length) {
+				DRM_DEBUG_DRIVER("CMD: Rejected command 0x%08X, too short to check bitmask (%s)\n",
+						 *cmd, engine->name);
+				return false;
+			}
+
 			dword = cmd[desc->bits[i].offset] & desc->bits[i].mask;

From 738878ada16538c664a3ae032474fdd845765249 Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Tue, 7 Nov 2017 15:40:55 +0000
Subject: [PATCH 113/155] drm/i915: Silence smatch for cmdparser

commit 0ffba1fc98e8ec35caae8d50b657296ebb9a9a51 upstream.
drivers/gpu/drm/i915/i915_cmd_parser.c:808:23: error: not an lvalue drivers/gpu/drm/i915/i915_cmd_parser.c:811:23: error: not an lvalue drivers/gpu/drm/i915/i915_cmd_parser.c:814:23: error: not an lvalue drivers/gpu/drm/i915/i915_cmd_parser.c:808:23: error: not an lvalue drivers/gpu/drm/i915/i915_cmd_parser.c:811:23: error: not an lvalue drivers/gpu/drm/i915/i915_cmd_parser.c:814:23: error: not an lvalue drivers/gpu/drm/i915/i915_cmd_parser.c:808:23: error: not an lvalue drivers/gpu/drm/i915/i915_cmd_parser.c:811:23: error: not an lvalue drivers/gpu/drm/i915/i915_cmd_parser.c:814:23: error: not an lvalue drivers/gpu/drm/i915/i915_cmd_parser.c:808:23: error: not an lvalue drivers/gpu/drm/i915/i915_cmd_parser.c:811:23: error: not an lvalue drivers/gpu/drm/i915/i915_cmd_parser.c:814:23: error: not an lvalue If we move the shift into each case not only do we kill the warning from smatch, but we shrink the code slightly: text data bss dec hex filename 1267906 20587 3168 1291661 13b58d before 1267890 20587 3168 1291645 13b57d after Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: Mika Kuoppala Cc: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20171107154055.19460-1-chris@chris-wilson.co.uk Reviewed-by: Matthew Auld Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_cmd_parser.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index ef7ad016d67c27..d0a9f8d416b25c 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -798,22 +798,15 @@ struct cmd_node { */ static inline u32 cmd_header_key(u32 x) { - u32 shift; - switch (x >> INSTR_CLIENT_SHIFT) { default: case INSTR_MI_CLIENT: - shift = STD_MI_OPCODE_SHIFT; - break; + return x >> STD_MI_OPCODE_SHIFT; case INSTR_RC_CLIENT: - shift = STD_3D_OPCODE_SHIFT; - break; + return x >> STD_3D_OPCODE_SHIFT; case INSTR_BC_CLIENT: - shift = STD_2D_OPCODE_SHIFT; - break; + return x >> STD_2D_OPCODE_SHIFT; } - - return x >> shift; } static int init_hash_table(struct intel_engine_cs *engine, From 26f26f603809180cbf3f9f83e20e4e734d180f42 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sat, 26 Aug 2017 14:56:20 +0100 Subject: [PATCH 114/155] drm/i915: Don't use GPU relocations prior to cmdparser stalls commit 3dbf26ed7b9b40a8cb008ab9ad25703363af815d upstream. If we are using the cmdparser, we will have to copy the batch and so stall for the relocations. Rather than prolong that stall by adding more relocation requests, just use CPU relocations and do the stall upfront. 
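The early-out itself fits in a few lines. A hedged sketch with simplified stand-in types; the point is the predicate helper plus the -EWOULDBLOCK bail-out that pushes callers onto the synchronous CPU relocation path:

#include <stdio.h>
#include <errno.h>
#include <stdbool.h>

struct execbuf {
	bool needs_cmd_parser;
	int batch_len;
};

static bool use_cmdparser(const struct execbuf *eb)
{
	return eb->needs_cmd_parser && eb->batch_len;
}

static int reloc_gpu(const struct execbuf *eb)
{
	/* If the cmdparser will copy the batch (a stall) anyway, do not
	 * queue an asynchronous GPU relocation on top of it. */
	if (use_cmdparser(eb))
		return -EWOULDBLOCK; /* caller falls back to CPU relocs */
	return 0; /* otherwise queue the GPU relocation as before */
}

int main(void)
{
	struct execbuf eb = { .needs_cmd_parser = true, .batch_len = 4096 };

	printf("reloc_gpu -> %d\n", reloc_gpu(&eb));
	return 0;
}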
Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20170826135620.25949-1-chris@chris-wilson.co.uk Reviewed-by: Joonas Lahtinen Signed-off-by: Rodrigo Vivi Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 4cc9ce4b5b1648..d3b005b139915e 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -268,6 +268,11 @@ static inline u64 gen8_noncanonical_addr(u64 address) return address & GENMASK_ULL(GEN8_HIGH_ADDRESS_BIT, 0); } +static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb) +{ + return eb->engine->needs_cmd_parser && eb->batch_len; +} + static int eb_create(struct i915_execbuffer *eb) { if (!(eb->args->flags & I915_EXEC_HANDLE_LUT)) { @@ -1165,6 +1170,10 @@ static u32 *reloc_gpu(struct i915_execbuffer *eb, if (unlikely(!cache->rq)) { int err; + /* If we need to copy for the cmdparser, we will stall anyway */ + if (eb_use_cmdparser(eb)) + return ERR_PTR(-EWOULDBLOCK); + err = __reloc_gpu_alloc(eb, vma, len); if (unlikely(err)) return ERR_PTR(err); @@ -2305,7 +2314,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, goto err_vma; } - if (eb.engine->needs_cmd_parser && eb.batch_len) { + if (eb_use_cmdparser(&eb)) { struct i915_vma *vma; vma = eb_parse(&eb, drm_is_current_master(file)); From 8d2541e95f5b320cc401a59ef6af6786f61f8cb1 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Wed, 29 Nov 2017 08:24:09 +0000 Subject: [PATCH 115/155] drm/i915: Move engine->needs_cmd_parser to engine->flags commit 439e2ee4ca520e72870e4fa44aa0076060ad6857 upstream. Will be adding a new per-engine flags shortly so it makes sense to consolidate. v2: Keep the original code flow in intel_engine_cleanup_cmd_parser. 
(Joonas Lahtinen) Signed-off-by: Tvrtko Ursulin Suggested-by: Chris Wilson Reviewed-by: Chris Wilson Reviewed-by: Sagar Arun Kamble Reviewed-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20171129082409.18189-1-tvrtko.ursulin@linux.intel.com Signed-off-by: Rodrigo Vivi Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_cmd_parser.c | 7 ++++--- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 2 +- drivers/gpu/drm/i915/intel_ringbuffer.h | 8 +++++++- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index d0a9f8d416b25c..95478db9998b51 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -26,6 +26,7 @@ */ #include "i915_drv.h" +#include "intel_ringbuffer.h" /** * DOC: batch buffer command parser @@ -940,7 +941,7 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine) return; } - engine->needs_cmd_parser = true; + engine->flags |= I915_ENGINE_NEEDS_CMD_PARSER; } /** @@ -952,7 +953,7 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine) */ void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine) { - if (!engine->needs_cmd_parser) + if (!intel_engine_needs_cmd_parser(engine)) return; fini_hash_table(engine); @@ -1356,7 +1357,7 @@ int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv) /* If the command parser is not enabled, report 0 - unsupported */ for_each_engine(engine, dev_priv, id) { - if (engine->needs_cmd_parser) { + if (intel_engine_needs_cmd_parser(engine)) { active = true; break; } diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index d3b005b139915e..b5dd25b12011d4 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -270,7 +270,7 @@ static inline u64 gen8_noncanonical_addr(u64 address) static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb) { - return eb->engine->needs_cmd_parser && eb->batch_len; + return intel_engine_needs_cmd_parser(eb->engine) && eb->batch_len; } static int eb_create(struct i915_execbuffer *eb) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 6b2067f1082402..1a00a4e2732b02 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -417,7 +417,8 @@ struct intel_engine_cs { struct intel_engine_hangcheck hangcheck; - bool needs_cmd_parser; +#define I915_ENGINE_NEEDS_CMD_PARSER BIT(0) + unsigned int flags; /* * Table of commands the command parser needs to know about @@ -444,6 +445,11 @@ struct intel_engine_cs { u32 (*get_cmd_length_mask)(u32 cmd_header); }; +static inline bool intel_engine_needs_cmd_parser(struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_NEEDS_CMD_PARSER; +} + static inline unsigned int intel_engine_flag(const struct intel_engine_cs *engine) { From 82e0caec38b8eb2ec4fa90b614ba02fc297faeb7 Mon Sep 17 00:00:00 2001 From: Jon Bloomfield Date: Fri, 20 Apr 2018 14:26:01 -0700 Subject: [PATCH 116/155] drm/i915: Rename gen7 cmdparser tables commit 0a2f661b6c21815a7fa60e30babe975fee8e73c6 upstream. We're about to introduce some new tables for later gens, and the current naming for the gen7 tables will no longer make sense. 
Signed-off-by: Jon Bloomfield Cc: Tony Luck Cc: Dave Airlie Cc: Takashi Iwai Cc: Tyler Hicks Signed-off-by: Mika Kuoppala Reviewed-by: Chris Wilson Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_cmd_parser.c | 70 +++++++++++++------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 95478db9998b51..452c45ad89ce5e 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -211,7 +211,7 @@ struct drm_i915_cmd_table { /* Command Mask Fixed Len Action ---------------------------------------------------------- */ -static const struct drm_i915_cmd_descriptor common_cmds[] = { +static const struct drm_i915_cmd_descriptor gen7_common_cmds[] = { CMD( MI_NOOP, SMI, F, 1, S ), CMD( MI_USER_INTERRUPT, SMI, F, 1, R ), CMD( MI_WAIT_FOR_EVENT, SMI, F, 1, M ), @@ -244,7 +244,7 @@ static const struct drm_i915_cmd_descriptor common_cmds[] = { CMD( MI_BATCH_BUFFER_START, SMI, !F, 0xFF, S ), }; -static const struct drm_i915_cmd_descriptor render_cmds[] = { +static const struct drm_i915_cmd_descriptor gen7_render_cmds[] = { CMD( MI_FLUSH, SMI, F, 1, S ), CMD( MI_ARB_ON_OFF, SMI, F, 1, R ), CMD( MI_PREDICATE, SMI, F, 1, S ), @@ -328,7 +328,7 @@ static const struct drm_i915_cmd_descriptor hsw_render_cmds[] = { CMD( GFX_OP_3DSTATE_BINDING_TABLE_EDIT_PS, S3D, !F, 0x1FF, S ), }; -static const struct drm_i915_cmd_descriptor video_cmds[] = { +static const struct drm_i915_cmd_descriptor gen7_video_cmds[] = { CMD( MI_ARB_ON_OFF, SMI, F, 1, R ), CMD( MI_SET_APPID, SMI, F, 1, S ), CMD( MI_STORE_DWORD_IMM, SMI, !F, 0xFF, B, @@ -372,7 +372,7 @@ static const struct drm_i915_cmd_descriptor video_cmds[] = { CMD( MFX_WAIT, SMFX, F, 1, S ), }; -static const struct drm_i915_cmd_descriptor vecs_cmds[] = { +static const struct drm_i915_cmd_descriptor gen7_vecs_cmds[] = { CMD( MI_ARB_ON_OFF, SMI, F, 1, R ), CMD( MI_SET_APPID, SMI, F, 1, S ), CMD( MI_STORE_DWORD_IMM, SMI, !F, 0xFF, B, @@ -410,7 +410,7 @@ static const struct drm_i915_cmd_descriptor vecs_cmds[] = { }}, ), }; -static const struct drm_i915_cmd_descriptor blt_cmds[] = { +static const struct drm_i915_cmd_descriptor gen7_blt_cmds[] = { CMD( MI_DISPLAY_FLIP, SMI, !F, 0xFF, R ), CMD( MI_STORE_DWORD_IMM, SMI, !F, 0x3FF, B, .bits = {{ @@ -463,35 +463,35 @@ static const struct drm_i915_cmd_descriptor noop_desc = #undef B #undef M -static const struct drm_i915_cmd_table gen7_render_cmds[] = { - { common_cmds, ARRAY_SIZE(common_cmds) }, - { render_cmds, ARRAY_SIZE(render_cmds) }, +static const struct drm_i915_cmd_table gen7_render_cmd_table[] = { + { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) }, + { gen7_render_cmds, ARRAY_SIZE(gen7_render_cmds) }, }; -static const struct drm_i915_cmd_table hsw_render_ring_cmds[] = { - { common_cmds, ARRAY_SIZE(common_cmds) }, - { render_cmds, ARRAY_SIZE(render_cmds) }, +static const struct drm_i915_cmd_table hsw_render_ring_cmd_table[] = { + { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) }, + { gen7_render_cmds, ARRAY_SIZE(gen7_render_cmds) }, { hsw_render_cmds, ARRAY_SIZE(hsw_render_cmds) }, }; -static const struct drm_i915_cmd_table gen7_video_cmds[] = { - { common_cmds, ARRAY_SIZE(common_cmds) }, - { video_cmds, ARRAY_SIZE(video_cmds) }, +static const struct drm_i915_cmd_table gen7_video_cmd_table[] = { + { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) }, + { gen7_video_cmds, ARRAY_SIZE(gen7_video_cmds) }, }; -static const struct drm_i915_cmd_table hsw_vebox_cmds[] = { - { 
common_cmds, ARRAY_SIZE(common_cmds) }, - { vecs_cmds, ARRAY_SIZE(vecs_cmds) }, +static const struct drm_i915_cmd_table hsw_vebox_cmd_table[] = { + { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) }, + { gen7_vecs_cmds, ARRAY_SIZE(gen7_vecs_cmds) }, }; -static const struct drm_i915_cmd_table gen7_blt_cmds[] = { - { common_cmds, ARRAY_SIZE(common_cmds) }, - { blt_cmds, ARRAY_SIZE(blt_cmds) }, +static const struct drm_i915_cmd_table gen7_blt_cmd_table[] = { + { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) }, + { gen7_blt_cmds, ARRAY_SIZE(gen7_blt_cmds) }, }; -static const struct drm_i915_cmd_table hsw_blt_ring_cmds[] = { - { common_cmds, ARRAY_SIZE(common_cmds) }, - { blt_cmds, ARRAY_SIZE(blt_cmds) }, +static const struct drm_i915_cmd_table hsw_blt_ring_cmd_table[] = { + { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) }, + { gen7_blt_cmds, ARRAY_SIZE(gen7_blt_cmds) }, { hsw_blt_cmds, ARRAY_SIZE(hsw_blt_cmds) }, }; @@ -871,12 +871,12 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine) switch (engine->id) { case RCS: if (IS_HASWELL(engine->i915)) { - cmd_tables = hsw_render_ring_cmds; + cmd_tables = hsw_render_ring_cmd_table; cmd_table_count = - ARRAY_SIZE(hsw_render_ring_cmds); + ARRAY_SIZE(hsw_render_ring_cmd_table); } else { - cmd_tables = gen7_render_cmds; - cmd_table_count = ARRAY_SIZE(gen7_render_cmds); + cmd_tables = gen7_render_cmd_table; + cmd_table_count = ARRAY_SIZE(gen7_render_cmd_table); } if (IS_HASWELL(engine->i915)) { @@ -890,17 +890,17 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine) engine->get_cmd_length_mask = gen7_render_get_cmd_length_mask; break; case VCS: - cmd_tables = gen7_video_cmds; - cmd_table_count = ARRAY_SIZE(gen7_video_cmds); + cmd_tables = gen7_video_cmd_table; + cmd_table_count = ARRAY_SIZE(gen7_video_cmd_table); engine->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask; break; case BCS: if (IS_HASWELL(engine->i915)) { - cmd_tables = hsw_blt_ring_cmds; - cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmds); + cmd_tables = hsw_blt_ring_cmd_table; + cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmd_table); } else { - cmd_tables = gen7_blt_cmds; - cmd_table_count = ARRAY_SIZE(gen7_blt_cmds); + cmd_tables = gen7_blt_cmd_table; + cmd_table_count = ARRAY_SIZE(gen7_blt_cmd_table); } if (IS_HASWELL(engine->i915)) { @@ -914,8 +914,8 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine) engine->get_cmd_length_mask = gen7_blt_get_cmd_length_mask; break; case VECS: - cmd_tables = hsw_vebox_cmds; - cmd_table_count = ARRAY_SIZE(hsw_vebox_cmds); + cmd_tables = hsw_vebox_cmd_table; + cmd_table_count = ARRAY_SIZE(hsw_vebox_cmd_table); /* VECS can use the same length_mask function as VCS */ engine->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask; break; From 771ca35572723782de8a4654d45397b1486d5053 Mon Sep 17 00:00:00 2001 From: Jon Bloomfield Date: Fri, 8 Jun 2018 08:53:46 -0700 Subject: [PATCH 117/155] drm/i915: Disable Secure Batches for gen6+ commit 44157641d448cbc0c4b73c5231d2b911f0cb0427 upstream. Retroactively stop reporting support for secure batches through the api for gen6+ so that older binaries trigger the fallback path instead. Older binaries use secure batches pre gen6 to access resources that are not available to normal usermode processes. However, all known userspace explicitly checks for HAS_SECURE_BATCHES before relying on the secure batch feature. Since there are no known binaries relying on this for newer gens we can kill secure batches from gen6, via I915_PARAM_HAS_SECURE_BATCHES. 
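
On the userspace side, a well-behaved binary probes the parameter before ever setting I915_EXEC_SECURE. A minimal sketch (the helper name is illustrative; the ioctl and parameter are the existing i915 uAPI, assuming the kernel uAPI headers are available):

	#include <sys/ioctl.h>
	#include <drm/i915_drm.h>

	/* Non-zero only while the kernel still advertises secure batches;
	 * after this patch that means pre-gen6 hardware and a caller with
	 * CAP_SYS_ADMIN. */
	static int probe_secure_batches(int drm_fd)
	{
		int value = 0;
		struct drm_i915_getparam gp = {
			.param = I915_PARAM_HAS_SECURE_BATCHES,
			.value = &value,
		};

		if (ioctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp) < 0)
			return 0;

		return value;
	}

Binaries that probe first fall back cleanly; anything that sets I915_EXEC_SECURE without probing now gets -EPERM from execbuf on gen6+ (or -ENODEV on gen11+), per the hunk below.
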
v2: rebase (Mika) v3: rebase (Mika) Signed-off-by: Jon Bloomfield Cc: Tony Luck Cc: Dave Airlie Cc: Takashi Iwai Cc: Tyler Hicks Signed-off-by: Mika Kuoppala Reviewed-by: Chris Wilson Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_drv.c | 2 +- drivers/gpu/drm/i915/i915_drv.h | 2 ++ drivers/gpu/drm/i915/i915_gem_execbuffer.c | 12 ++++++++++-- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index c75f4ccbcdefe6..9b642dd040cfd7 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -323,7 +323,7 @@ static int i915_getparam(struct drm_device *dev, void *data, value = i915.semaphores; break; case I915_PARAM_HAS_SECURE_BATCHES: - value = capable(CAP_SYS_ADMIN); + value = HAS_SECURE_BATCHES(dev_priv) && capable(CAP_SYS_ADMIN); break; case I915_PARAM_CMD_PARSER_VERSION: value = i915_cmd_parser_get_version(dev_priv); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 41f51509c9e422..e888bf5ee31d29 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2996,6 +2996,8 @@ intel_info(const struct drm_i915_private *dev_priv) #define HAS_BLT(dev_priv) HAS_ENGINE(dev_priv, BCS) #define HAS_VEBOX(dev_priv) HAS_ENGINE(dev_priv, VECS) +#define HAS_SECURE_BATCHES(dev_priv) (INTEL_GEN(dev_priv) < 6) + #define HAS_LLC(dev_priv) ((dev_priv)->info.has_llc) #define HAS_SNOOP(dev_priv) ((dev_priv)->info.has_snoop) #define HAS_EDRAM(dev_priv) (!!((dev_priv)->edram_cap & EDRAM_ENABLED)) diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index b5dd25b12011d4..fb6913fb2af0c8 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -2195,6 +2195,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, struct drm_i915_gem_exec_object2 *exec, struct drm_syncobj **fences) { + struct drm_i915_private *dev_priv = to_i915(dev); struct i915_execbuffer eb; struct dma_fence *in_fence = NULL; struct sync_file *out_fence = NULL; @@ -2204,7 +2205,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, BUILD_BUG_ON(__EXEC_OBJECT_INTERNAL_FLAGS & ~__EXEC_OBJECT_UNKNOWN_FLAGS); - eb.i915 = to_i915(dev); + eb.i915 = dev_priv; eb.file = file; eb.args = args; if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC)) @@ -2226,8 +2227,15 @@ i915_gem_do_execbuffer(struct drm_device *dev, eb.batch_flags = 0; if (args->flags & I915_EXEC_SECURE) { + if (INTEL_GEN(dev_priv) >= 11) + return -ENODEV; + + /* Return -EPERM to trigger fallback code on old binaries. */ + if (!HAS_SECURE_BATCHES(dev_priv)) + return -EPERM; + if (!drm_is_current_master(file) || !capable(CAP_SYS_ADMIN)) - return -EPERM; + return -EPERM; eb.batch_flags |= I915_DISPATCH_SECURE; } From 0b19d8fdfc160a6e700e12dcd49dbdee113a5f8d Mon Sep 17 00:00:00 2001 From: Jon Bloomfield Date: Fri, 8 Jun 2018 10:05:26 -0700 Subject: [PATCH 118/155] drm/i915: Remove Master tables from cmdparser commit 66d8aba1cd6db34af10de465c0d52af679288cb6 upstream. The previous patch has killed support for secure batches on gen6+, and hence the cmdparsers master tables are now dead code. Remove them. 
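
To recap what the hunks below delete: each register table carried a master flag, and find_reg() consulted the master-only tables (FORCEWAKE_MT, DERRMR, and on IVB the pipe DE load scanline registers) only for drm-master submissions, roughly:

	/* Old lookup, simplified: master-only tables are skipped for
	 * non-master submitters. */
	for (; count > 0; ++table, --count) {
		if (!table->master || is_master) {
			reg = __find_reg(table->regs, table->num_regs, addr);
			if (reg)
				return reg;
		}
	}

With secure batches gone on gen6+, is_master can no longer unlock anything, so the flag, the master-only register tables, and the is_master plumbing through eb_parse() and intel_engine_cmd_parser() all go, and the CMD_DESC_MASTER ('M') command flag is removed with its remaining users downgraded to reject ('R').
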
Signed-off-by: Jon Bloomfield Cc: Tony Luck Cc: Dave Airlie Cc: Takashi Iwai Cc: Tyler Hicks Reviewed-by: Chris Wilson Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_cmd_parser.c | 84 ++++++---------------- drivers/gpu/drm/i915/i915_drv.h | 3 +- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 7 +- 3 files changed, 26 insertions(+), 68 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 452c45ad89ce5e..59d1432ecbbfc1 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -51,13 +51,11 @@ * granting userspace undue privileges. There are three categories of privilege. * * First, commands which are explicitly defined as privileged or which should - * only be used by the kernel driver. The parser generally rejects such - * commands, though it may allow some from the drm master process. + * only be used by the kernel driver. The parser rejects such commands * * Second, commands which access registers. To support correct/enhanced * userspace functionality, particularly certain OpenGL extensions, the parser - * provides a whitelist of registers which userspace may safely access (for both - * normal and drm master processes). + * provides a whitelist of registers which userspace may safely access * * Third, commands which access privileged memory (i.e. GGTT, HWS page, etc). * The parser always rejects such commands. @@ -82,9 +80,9 @@ * in the per-engine command tables. * * Other command table entries map fairly directly to high level categories - * mentioned above: rejected, master-only, register whitelist. The parser - * implements a number of checks, including the privileged memory checks, via a - * general bitmasking mechanism. + * mentioned above: rejected, register whitelist. The parser implements a number + * of checks, including the privileged memory checks, via a general bitmasking + * mechanism. */ /* @@ -102,8 +100,6 @@ struct drm_i915_cmd_descriptor { * CMD_DESC_REJECT: The command is never allowed * CMD_DESC_REGISTER: The command should be checked against the * register whitelist for the appropriate ring - * CMD_DESC_MASTER: The command is allowed if the submitting process - * is the DRM master */ u32 flags; #define CMD_DESC_FIXED (1<<0) @@ -111,7 +107,6 @@ struct drm_i915_cmd_descriptor { #define CMD_DESC_REJECT (1<<2) #define CMD_DESC_REGISTER (1<<3) #define CMD_DESC_BITMASK (1<<4) -#define CMD_DESC_MASTER (1<<5) /* * The command's unique identification bits and the bitmask to get them. 
@@ -207,14 +202,13 @@ struct drm_i915_cmd_table { #define R CMD_DESC_REJECT #define W CMD_DESC_REGISTER #define B CMD_DESC_BITMASK -#define M CMD_DESC_MASTER /* Command Mask Fixed Len Action ---------------------------------------------------------- */ static const struct drm_i915_cmd_descriptor gen7_common_cmds[] = { CMD( MI_NOOP, SMI, F, 1, S ), CMD( MI_USER_INTERRUPT, SMI, F, 1, R ), - CMD( MI_WAIT_FOR_EVENT, SMI, F, 1, M ), + CMD( MI_WAIT_FOR_EVENT, SMI, F, 1, R ), CMD( MI_ARB_CHECK, SMI, F, 1, S ), CMD( MI_REPORT_HEAD, SMI, F, 1, S ), CMD( MI_SUSPEND_FLUSH, SMI, F, 1, S ), @@ -311,7 +305,7 @@ static const struct drm_i915_cmd_descriptor hsw_render_cmds[] = { CMD( MI_URB_ATOMIC_ALLOC, SMI, F, 1, S ), CMD( MI_SET_APPID, SMI, F, 1, S ), CMD( MI_RS_CONTEXT, SMI, F, 1, S ), - CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, M ), + CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, R ), CMD( MI_LOAD_SCAN_LINES_EXCL, SMI, !F, 0x3F, R ), CMD( MI_LOAD_REGISTER_REG, SMI, !F, 0xFF, W, .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 1 } ), @@ -444,7 +438,7 @@ static const struct drm_i915_cmd_descriptor gen7_blt_cmds[] = { }; static const struct drm_i915_cmd_descriptor hsw_blt_cmds[] = { - CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, M ), + CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, R ), CMD( MI_LOAD_SCAN_LINES_EXCL, SMI, !F, 0x3F, R ), }; @@ -461,7 +455,6 @@ static const struct drm_i915_cmd_descriptor noop_desc = #undef R #undef W #undef B -#undef M static const struct drm_i915_cmd_table gen7_render_cmd_table[] = { { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) }, @@ -610,47 +603,29 @@ static const struct drm_i915_reg_descriptor gen7_blt_regs[] = { REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE), }; -static const struct drm_i915_reg_descriptor ivb_master_regs[] = { - REG32(FORCEWAKE_MT), - REG32(DERRMR), - REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_A)), - REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_B)), - REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_C)), -}; - -static const struct drm_i915_reg_descriptor hsw_master_regs[] = { - REG32(FORCEWAKE_MT), - REG32(DERRMR), -}; - #undef REG64 #undef REG32 struct drm_i915_reg_table { const struct drm_i915_reg_descriptor *regs; int num_regs; - bool master; }; static const struct drm_i915_reg_table ivb_render_reg_tables[] = { - { gen7_render_regs, ARRAY_SIZE(gen7_render_regs), false }, - { ivb_master_regs, ARRAY_SIZE(ivb_master_regs), true }, + { gen7_render_regs, ARRAY_SIZE(gen7_render_regs) }, }; static const struct drm_i915_reg_table ivb_blt_reg_tables[] = { - { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs), false }, - { ivb_master_regs, ARRAY_SIZE(ivb_master_regs), true }, + { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs) }, }; static const struct drm_i915_reg_table hsw_render_reg_tables[] = { - { gen7_render_regs, ARRAY_SIZE(gen7_render_regs), false }, - { hsw_render_regs, ARRAY_SIZE(hsw_render_regs), false }, - { hsw_master_regs, ARRAY_SIZE(hsw_master_regs), true }, + { gen7_render_regs, ARRAY_SIZE(gen7_render_regs) }, + { hsw_render_regs, ARRAY_SIZE(hsw_render_regs) }, }; static const struct drm_i915_reg_table hsw_blt_reg_tables[] = { - { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs), false }, - { hsw_master_regs, ARRAY_SIZE(hsw_master_regs), true }, + { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs) }, }; static u32 gen7_render_get_cmd_length_mask(u32 cmd_header) @@ -1027,22 +1002,16 @@ __find_reg(const struct drm_i915_reg_descriptor *table, int count, u32 addr) } static const struct drm_i915_reg_descriptor * -find_reg(const struct intel_engine_cs *engine, bool is_master, u32 addr) +find_reg(const struct 
intel_engine_cs *engine, u32 addr) { const struct drm_i915_reg_table *table = engine->reg_tables; + const struct drm_i915_reg_descriptor *reg = NULL; int count = engine->reg_table_count; - for (; count > 0; ++table, --count) { - if (!table->master || is_master) { - const struct drm_i915_reg_descriptor *reg; + for (; !reg && (count > 0); ++table, --count) + reg = __find_reg(table->regs, table->num_regs, addr); - reg = __find_reg(table->regs, table->num_regs, addr); - if (reg != NULL) - return reg; - } - } - - return NULL; + return reg; } /* Returns a vmap'd pointer to dst_obj, which the caller must unmap */ @@ -1127,8 +1096,7 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj, static bool check_cmd(const struct intel_engine_cs *engine, const struct drm_i915_cmd_descriptor *desc, - const u32 *cmd, u32 length, - const bool is_master) + const u32 *cmd, u32 length) { if (desc->flags & CMD_DESC_SKIP) return true; @@ -1138,12 +1106,6 @@ static bool check_cmd(const struct intel_engine_cs *engine, return false; } - if ((desc->flags & CMD_DESC_MASTER) && !is_master) { - DRM_DEBUG_DRIVER("CMD: Rejected master-only command: 0x%08X\n", - *cmd); - return false; - } - if (desc->flags & CMD_DESC_REGISTER) { /* * Get the distance between individual register offset @@ -1157,7 +1119,7 @@ static bool check_cmd(const struct intel_engine_cs *engine, offset += step) { const u32 reg_addr = cmd[offset] & desc->reg.mask; const struct drm_i915_reg_descriptor *reg = - find_reg(engine, is_master, reg_addr); + find_reg(engine, reg_addr); if (!reg) { DRM_DEBUG_DRIVER("CMD: Rejected register 0x%08X in command: 0x%08X (%s)\n", @@ -1244,7 +1206,6 @@ static bool check_cmd(const struct intel_engine_cs *engine, * @shadow_batch_obj: copy of the batch buffer in question * @batch_start_offset: byte offset in the batch at which execution starts * @batch_len: length of the commands in batch_obj - * @is_master: is the submitting process the drm master? * * Parses the specified batch buffer looking for privilege violations as * described in the overview. 
@@ -1256,8 +1217,7 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, struct drm_i915_gem_object *batch_obj, struct drm_i915_gem_object *shadow_batch_obj, u32 batch_start_offset, - u32 batch_len, - bool is_master) + u32 batch_len) { u32 *cmd, *batch_end; struct drm_i915_cmd_descriptor default_desc = noop_desc; @@ -1323,7 +1283,7 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, break; } - if (!check_cmd(engine, desc, cmd, length, is_master)) { + if (!check_cmd(engine, desc, cmd, length)) { ret = -EACCES; break; } diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e888bf5ee31d29..93d22f249b5b63 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3847,8 +3847,7 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, struct drm_i915_gem_object *batch_obj, struct drm_i915_gem_object *shadow_batch_obj, u32 batch_start_offset, - u32 batch_len, - bool is_master); + u32 batch_len); /* i915_perf.c */ extern void i915_perf_init(struct drm_i915_private *dev_priv); diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index fb6913fb2af0c8..a4487af0365978 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -1911,7 +1911,7 @@ static int i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req) return 0; } -static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master) +static struct i915_vma *eb_parse(struct i915_execbuffer *eb) { struct drm_i915_gem_object *shadow_batch_obj; struct i915_vma *vma; @@ -1926,8 +1926,7 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master) eb->batch->obj, shadow_batch_obj, eb->batch_start_offset, - eb->batch_len, - is_master); + eb->batch_len); if (err) { if (err == -EACCES) /* unhandled chained batch */ vma = NULL; @@ -2325,7 +2324,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, if (eb_use_cmdparser(&eb)) { struct i915_vma *vma; - vma = eb_parse(&eb, drm_is_current_master(file)); + vma = eb_parse(&eb); if (IS_ERR(vma)) { err = PTR_ERR(vma); goto err_vma; From 397944fc408dcd47ed5224d4cb1b0a7088299117 Mon Sep 17 00:00:00 2001 From: Jon Bloomfield Date: Wed, 1 Aug 2018 09:33:59 -0700 Subject: [PATCH 119/155] drm/i915: Add support for mandatory cmdparsing commit 311a50e76a33d1e029563c24b2ff6db0c02b5afe upstream. The existing cmdparser for gen7 can be bypassed by specifying batch_len=0 in the execbuf call. This is safe because bypassing simply reduces the cmd-set available. In a later patch we will introduce cmdparsing for gen9, as a security measure, which must be strictly enforced since without it we are vulnerable to DoS attacks. Introduce the concept of 'required' cmd parsing that cannot be bypassed by submitting zero-length bb's. 
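
The net effect, mirrored from the execbuffer hunk below, is a pair of engine flags with matching predicates and a submission check of the form:

	/* Mandatory parsing can never be skipped; optional (gen7-style)
	 * parsing still treats batch_len == 0 as a bypass. */
	static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb)
	{
		return intel_engine_requires_cmd_parser(eb->engine) ||
		       (intel_engine_using_cmd_parser(eb->engine) &&
			eb->batch_len);
	}

Nothing sets I915_ENGINE_REQUIRES_CMD_PARSER yet; the gen9 BCS patch later in this series is its first user.
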
v2: rebase (Mika) v2: rebase (Mika) v3: fix conflict on engine flags (Mika) Signed-off-by: Jon Bloomfield Cc: Joonas Lahtinen Cc: Rodrigo Vivi Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_cmd_parser.c | 6 +++--- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 3 ++- drivers/gpu/drm/i915/intel_ringbuffer.h | 14 +++++++++++--- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 59d1432ecbbfc1..5fa6a68e03cdc8 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -916,7 +916,7 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine) return; } - engine->flags |= I915_ENGINE_NEEDS_CMD_PARSER; + engine->flags |= I915_ENGINE_USING_CMD_PARSER; } /** @@ -928,7 +928,7 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine) */ void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine) { - if (!intel_engine_needs_cmd_parser(engine)) + if (!intel_engine_using_cmd_parser(engine)) return; fini_hash_table(engine); @@ -1317,7 +1317,7 @@ int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv) /* If the command parser is not enabled, report 0 - unsupported */ for_each_engine(engine, dev_priv, id) { - if (intel_engine_needs_cmd_parser(engine)) { + if (intel_engine_using_cmd_parser(engine)) { active = true; break; } diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index a4487af0365978..e72dc30e56c7b8 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -270,7 +270,8 @@ static inline u64 gen8_noncanonical_addr(u64 address) static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb) { - return intel_engine_needs_cmd_parser(eb->engine) && eb->batch_len; + return intel_engine_requires_cmd_parser(eb->engine) || + (intel_engine_using_cmd_parser(eb->engine) && eb->batch_len); } static int eb_create(struct i915_execbuffer *eb) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 1a00a4e2732b02..774e3772d0eda8 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -417,7 +417,8 @@ struct intel_engine_cs { struct intel_engine_hangcheck hangcheck; -#define I915_ENGINE_NEEDS_CMD_PARSER BIT(0) +#define I915_ENGINE_USING_CMD_PARSER BIT(0) +#define I915_ENGINE_REQUIRES_CMD_PARSER BIT(3) unsigned int flags; /* @@ -445,9 +446,16 @@ struct intel_engine_cs { u32 (*get_cmd_length_mask)(u32 cmd_header); }; -static inline bool intel_engine_needs_cmd_parser(struct intel_engine_cs *engine) +static inline bool +intel_engine_using_cmd_parser(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_USING_CMD_PARSER; +} + +static inline bool +intel_engine_requires_cmd_parser(const struct intel_engine_cs *engine) { - return engine->flags & I915_ENGINE_NEEDS_CMD_PARSER; + return engine->flags & I915_ENGINE_REQUIRES_CMD_PARSER; } static inline unsigned int From 78eff893fa30e8fa1ca0cb938b0c8bdf0544b973 Mon Sep 17 00:00:00 2001 From: Jon Bloomfield Date: Tue, 22 May 2018 13:59:06 -0700 Subject: [PATCH 120/155] drm/i915: Support ro ppgtt mapped cmdparser shadow buffers commit 4f7af1948abcb18b4772fe1bcd84d7d27d96258c upstream. For Gen7, the original cmdparser motive was to permit limited use of register read/write instructions in unprivileged BB's. 
This worked by copying the user supplied bb to a kmd owned bb, and running it in secure mode, from the ggtt, only if the scanner finds no unsafe commands or registers. For Gen8+ we can't use this same technique because running bb's from the ggtt also disables access to ppgtt space. But we also do not actually require 'secure' execution since we are only trying to reduce the available command/register set. Instead we will copy the user buffer to a kmd owned read-only bb in ppgtt, and run in the usual non-secure mode. Note that ro pages are only supported by ppgtt (not ggtt), but luckily that's exactly what we need. Add the required paths to map the shadow buffer to ppgtt ro for Gen8+ v2: IS_GEN7/IS_GEN (Mika) v3: rebase v4: rebase v5: rebase Signed-off-by: Jon Bloomfield Cc: Tony Luck Cc: Dave Airlie Cc: Takashi Iwai Cc: Tyler Hicks Signed-off-by: Mika Kuoppala Reviewed-by: Chris Wilson Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_drv.h | 14 ++++++ drivers/gpu/drm/i915/i915_gem.c | 16 ++++++- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 56 +++++++++++++++------- 3 files changed, 68 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 93d22f249b5b63..e50030c606656f 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2980,6 +2980,12 @@ intel_info(const struct drm_i915_private *dev_priv) #define IS_GEN9_LP(dev_priv) (IS_GEN9(dev_priv) && IS_LP(dev_priv)) #define IS_GEN9_BC(dev_priv) (IS_GEN9(dev_priv) && !IS_LP(dev_priv)) +/* + * The Gen7 cmdparser copies the scanned buffer to the ggtt for execution + * All later gens can run the final buffer from the ppgtt + */ +#define CMDPARSER_USES_GGTT(dev_priv) IS_GEN7(dev_priv) + #define ENGINE_MASK(id) BIT(id) #define RENDER_RING ENGINE_MASK(RCS) #define BSD_RING ENGINE_MASK(VCS) @@ -3393,6 +3399,14 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj, u64 alignment, u64 flags); +struct i915_vma * __must_check +i915_gem_object_pin(struct drm_i915_gem_object *obj, + struct i915_address_space *vm, + const struct i915_ggtt_view *view, + u64 size, + u64 alignment, + u64 flags); + int i915_gem_object_unbind(struct drm_i915_gem_object *obj); void i915_gem_release_mmap(struct drm_i915_gem_object *obj); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index f95e2c8ac239a7..785d192b27c804 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -4000,6 +4000,20 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj, { struct drm_i915_private *dev_priv = to_i915(obj->base.dev); struct i915_address_space *vm = &dev_priv->ggtt.base; + + return i915_gem_object_pin(obj, vm, view, size, alignment, + flags | PIN_GLOBAL); +} + +struct i915_vma * +i915_gem_object_pin(struct drm_i915_gem_object *obj, + struct i915_address_space *vm, + const struct i915_ggtt_view *view, + u64 size, + u64 alignment, + u64 flags) +{ + struct drm_i915_private *dev_priv = to_i915(obj->base.dev); struct i915_vma *vma; int ret; @@ -4057,7 +4071,7 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj, return ERR_PTR(ret); } - ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL); + ret = i915_vma_pin(vma, size, alignment, flags); if (ret) return ERR_PTR(ret); diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index e72dc30e56c7b8..2cd38ec7914141 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c 
@@ -1912,6 +1912,33 @@ static int i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req) return 0; } +static struct i915_vma * +shadow_batch_pin(struct i915_execbuffer *eb, struct drm_i915_gem_object *obj) +{ + struct drm_i915_private *dev_priv = eb->i915; + struct i915_address_space *vm; + u64 flags; + + /* + * PPGTT backed shadow buffers must be mapped RO, to prevent + * post-scan tampering + */ + if (CMDPARSER_USES_GGTT(dev_priv)) { + flags = PIN_GLOBAL; + vm = &dev_priv->ggtt.base; + eb->batch_flags |= I915_DISPATCH_SECURE; + } else if (eb->vm->has_read_only) { + flags = PIN_USER; + vm = eb->vm; + i915_gem_object_set_readonly(obj); + } else { + DRM_DEBUG("Cannot prevent post-scan tampering without RO capable vm\n"); + return ERR_PTR(-EINVAL); + } + + return i915_gem_object_pin(obj, vm, NULL, 0, 0, flags); +} + static struct i915_vma *eb_parse(struct i915_execbuffer *eb) { struct drm_i915_gem_object *shadow_batch_obj; @@ -1929,14 +1956,21 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb) eb->batch_start_offset, eb->batch_len); if (err) { - if (err == -EACCES) /* unhandled chained batch */ + /* + * Unsafe GGTT-backed buffers can still be submitted safely + * as non-secure. + * For PPGTT backing however, we have no choice but to forcibly + * reject unsafe buffers + */ + if (CMDPARSER_USES_GGTT(eb->i915) && (err == -EACCES)) + /* Execute original buffer non-secure */ vma = NULL; else vma = ERR_PTR(err); goto out; } - vma = i915_gem_object_ggtt_pin(shadow_batch_obj, NULL, 0, 0, 0); + vma = shadow_batch_pin(eb, shadow_batch_obj); if (IS_ERR(vma)) goto out; @@ -1945,6 +1979,9 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb) __EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_REF; vma->exec_flags = &eb->flags[eb->buffer_count]; eb->buffer_count++; + eb->batch_start_offset = 0; + eb->batch = vma; + /* eb->batch_len unchanged */ out: i915_gem_object_unpin_pages(shadow_batch_obj); @@ -2330,21 +2367,6 @@ i915_gem_do_execbuffer(struct drm_device *dev, err = PTR_ERR(vma); goto err_vma; } - - if (vma) { - /* - * Batch parsed and accepted: - * - * Set the DISPATCH_SECURE bit to remove the NON_SECURE - * bit from MI_BATCH_BUFFER_START commands issued in - * the dispatch_execbuffer implementations. We - * specifically don't want that set on batches the - * command parser has accepted. - */ - eb.batch_flags |= I915_DISPATCH_SECURE; - eb.batch_start_offset = 0; - eb.batch = vma; - } } if (eb.batch_len == 0) From d8573c4fb7af86c888faeba98f1ac482b1872ce2 Mon Sep 17 00:00:00 2001 From: Jon Bloomfield Date: Wed, 1 Aug 2018 09:45:50 -0700 Subject: [PATCH 121/155] drm/i915: Allow parsing of unsized batches commit 435e8fc059dbe0eec823a75c22da2972390ba9e0 upstream. In "drm/i915: Add support for mandatory cmdparsing" we introduced the concept of mandatory parsing. This allows the cmdparser to be invoked even when user passes batch_len=0 to the execbuf ioctl's. However, the cmdparser needs to know the extents of the buffer being scanned. Refactor the code to ensure the cmdparser uses the actual object size, instead of the incoming length, if user passes 0. 
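
Condensed, the reordered flow in i915_gem_do_execbuffer() becomes (a sketch mirroring the hunks below):

	/* Resolve the real extents first: batch_len == 0 now means
	 * "scan to the end of the backing object". */
	if (eb.batch_len == 0)
		eb.batch_len = eb.batch->size - eb.batch_start_offset;

	if (eb_use_cmdparser(&eb)) {
		struct i915_vma *vma;

		/* Parse with a known, non-zero length. */
		vma = eb_parse(&eb);
		if (IS_ERR(vma)) {
			err = PTR_ERR(vma);
			goto err_vma;
		}
	}

Note that eb_use_cmdparser() now keys off the user-supplied eb->args->batch_len, so a zero-length submission still bypasses purely optional parsing but can no longer starve a requires-parsing engine of its scan range.
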
Signed-off-by: Jon Bloomfield Cc: Tony Luck Cc: Dave Airlie Cc: Takashi Iwai Cc: Tyler Hicks Reviewed-by: Chris Wilson Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 2cd38ec7914141..12d44ea36d24a7 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -271,7 +271,8 @@ static inline u64 gen8_noncanonical_addr(u64 address) static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb) { return intel_engine_requires_cmd_parser(eb->engine) || - (intel_engine_using_cmd_parser(eb->engine) && eb->batch_len); + (intel_engine_using_cmd_parser(eb->engine) && + eb->args->batch_len); } static int eb_create(struct i915_execbuffer *eb) @@ -2359,6 +2360,9 @@ i915_gem_do_execbuffer(struct drm_device *dev, goto err_vma; } + if (eb.batch_len == 0) + eb.batch_len = eb.batch->size - eb.batch_start_offset; + if (eb_use_cmdparser(&eb)) { struct i915_vma *vma; @@ -2369,9 +2373,6 @@ i915_gem_do_execbuffer(struct drm_device *dev, } } - if (eb.batch_len == 0) - eb.batch_len = eb.batch->size - eb.batch_start_offset; - /* * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure * batch" bit. Hence we need to pin secure batches into the global gtt. From b125c2230ceae25b9d51cc1e7d263b97f59e64cb Mon Sep 17 00:00:00 2001 From: Jon Bloomfield Date: Mon, 23 Apr 2018 11:12:15 -0700 Subject: [PATCH 122/155] drm/i915: Add gen9 BCS cmdparsing commit 0f2f39758341df70202ae1c42d5a1e4ee392b6d3 upstream. For gen9 we enable cmdparsing on the BCS ring, specifically to catch inadvertent accesses to sensitive registers Unlike gen7/hsw, we use the parser only to block certain registers. We can rely on h/w to block restricted commands, so the command tables only provide enough info to allow the parser to delineate each command, and identify commands that access registers. Note: This patch deliberately ignores checkpatch issues in favour of matching the style of the surrounding code. We'll correct the entire file in one go in a later patch. v3: rebase (Mika) v4: Add RING_TIMESTAMP registers to whitelist (Jon) Signed-off-by: Jon Bloomfield Cc: Tony Luck Cc: Dave Airlie Cc: Takashi Iwai Cc: Tyler Hicks Signed-off-by: Mika Kuoppala Reviewed-by: Chris Wilson Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_cmd_parser.c | 116 ++++++++++++++++++++++--- drivers/gpu/drm/i915/i915_gem_gtt.c | 3 +- drivers/gpu/drm/i915/i915_reg.h | 4 + 3 files changed, 112 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 5fa6a68e03cdc8..09f1672488bbe3 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -442,6 +442,47 @@ static const struct drm_i915_cmd_descriptor hsw_blt_cmds[] = { CMD( MI_LOAD_SCAN_LINES_EXCL, SMI, !F, 0x3F, R ), }; +/* + * For Gen9 we can still rely on the h/w to enforce cmd security, and only + * need to re-enforce the register access checks. We therefore only need to + * teach the cmdparser how to find the end of each command, and identify + * register accesses. 
The table doesn't need to reject any commands, and so + * the only commands listed here are: + * 1) Those that touch registers + * 2) Those that do not have the default 8-bit length + * + * Note that the default MI length mask chosen for this table is 0xFF, not + * the 0x3F used on older devices. This is because the vast majority of MI + * cmds on Gen9 use a standard 8-bit Length field. + * All the Gen9 blitter instructions are standard 0xFF length mask, and + * none allow access to non-general registers, so in fact no BLT cmds are + * included in the table at all. + * + */ +static const struct drm_i915_cmd_descriptor gen9_blt_cmds[] = { + CMD( MI_NOOP, SMI, F, 1, S ), + CMD( MI_USER_INTERRUPT, SMI, F, 1, S ), + CMD( MI_WAIT_FOR_EVENT, SMI, F, 1, S ), + CMD( MI_FLUSH, SMI, F, 1, S ), + CMD( MI_ARB_CHECK, SMI, F, 1, S ), + CMD( MI_REPORT_HEAD, SMI, F, 1, S ), + CMD( MI_ARB_ON_OFF, SMI, F, 1, S ), + CMD( MI_SUSPEND_FLUSH, SMI, F, 1, S ), + CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, S ), + CMD( MI_LOAD_SCAN_LINES_EXCL, SMI, !F, 0x3F, S ), + CMD( MI_STORE_DWORD_IMM, SMI, !F, 0x3FF, S ), + CMD( MI_LOAD_REGISTER_IMM(1), SMI, !F, 0xFF, W, + .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 2 } ), + CMD( MI_UPDATE_GTT, SMI, !F, 0x3FF, S ), + CMD( MI_STORE_REGISTER_MEM_GEN8, SMI, F, 4, W, + .reg = { .offset = 1, .mask = 0x007FFFFC } ), + CMD( MI_FLUSH_DW, SMI, !F, 0x3F, S ), + CMD( MI_LOAD_REGISTER_MEM_GEN8, SMI, F, 4, W, + .reg = { .offset = 1, .mask = 0x007FFFFC } ), + CMD( MI_LOAD_REGISTER_REG, SMI, !F, 0xFF, W, + .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 1 } ), +}; + static const struct drm_i915_cmd_descriptor noop_desc = CMD(MI_NOOP, SMI, F, 1, S); @@ -488,6 +529,11 @@ static const struct drm_i915_cmd_table hsw_blt_ring_cmd_table[] = { { hsw_blt_cmds, ARRAY_SIZE(hsw_blt_cmds) }, }; +static const struct drm_i915_cmd_table gen9_blt_cmd_table[] = { + { gen9_blt_cmds, ARRAY_SIZE(gen9_blt_cmds) }, +}; + + /* * Register whitelists, sorted by increasing register offset. */ @@ -603,6 +649,29 @@ static const struct drm_i915_reg_descriptor gen7_blt_regs[] = { REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE), }; +static const struct drm_i915_reg_descriptor gen9_blt_regs[] = { + REG64_IDX(RING_TIMESTAMP, RENDER_RING_BASE), + REG64_IDX(RING_TIMESTAMP, BSD_RING_BASE), + REG32(BCS_SWCTRL), + REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE), + REG64_IDX(BCS_GPR, 0), + REG64_IDX(BCS_GPR, 1), + REG64_IDX(BCS_GPR, 2), + REG64_IDX(BCS_GPR, 3), + REG64_IDX(BCS_GPR, 4), + REG64_IDX(BCS_GPR, 5), + REG64_IDX(BCS_GPR, 6), + REG64_IDX(BCS_GPR, 7), + REG64_IDX(BCS_GPR, 8), + REG64_IDX(BCS_GPR, 9), + REG64_IDX(BCS_GPR, 10), + REG64_IDX(BCS_GPR, 11), + REG64_IDX(BCS_GPR, 12), + REG64_IDX(BCS_GPR, 13), + REG64_IDX(BCS_GPR, 14), + REG64_IDX(BCS_GPR, 15), +}; + #undef REG64 #undef REG32 @@ -628,6 +697,10 @@ static const struct drm_i915_reg_table hsw_blt_reg_tables[] = { { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs) }, }; +static const struct drm_i915_reg_table gen9_blt_reg_tables[] = { + { gen9_blt_regs, ARRAY_SIZE(gen9_blt_regs) }, +}; + static u32 gen7_render_get_cmd_length_mask(u32 cmd_header) { u32 client = cmd_header >> INSTR_CLIENT_SHIFT; @@ -683,6 +756,17 @@ static u32 gen7_blt_get_cmd_length_mask(u32 cmd_header) return 0; } +static u32 gen9_blt_get_cmd_length_mask(u32 cmd_header) +{ + u32 client = cmd_header >> INSTR_CLIENT_SHIFT; + + if (client == INSTR_MI_CLIENT || client == INSTR_BC_CLIENT) + return 0xFF; + + DRM_DEBUG_DRIVER("CMD: Abnormal blt cmd length! 
0x%08X\n", cmd_header); + return 0; +} + static bool validate_cmds_sorted(const struct intel_engine_cs *engine, const struct drm_i915_cmd_table *cmd_tables, int cmd_table_count) @@ -840,7 +924,8 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine) int cmd_table_count; int ret; - if (!IS_GEN7(engine->i915)) + if (!IS_GEN7(engine->i915) && !(IS_GEN9(engine->i915) && + engine->id == BCS)) return; switch (engine->id) { @@ -861,7 +946,6 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine) engine->reg_tables = ivb_render_reg_tables; engine->reg_table_count = ARRAY_SIZE(ivb_render_reg_tables); } - engine->get_cmd_length_mask = gen7_render_get_cmd_length_mask; break; case VCS: @@ -870,7 +954,16 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine) engine->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask; break; case BCS: - if (IS_HASWELL(engine->i915)) { + engine->get_cmd_length_mask = gen7_blt_get_cmd_length_mask; + if (IS_GEN9(engine->i915)) { + cmd_tables = gen9_blt_cmd_table; + cmd_table_count = ARRAY_SIZE(gen9_blt_cmd_table); + engine->get_cmd_length_mask = + gen9_blt_get_cmd_length_mask; + + /* BCS Engine unsafe without parser */ + engine->flags |= I915_ENGINE_REQUIRES_CMD_PARSER; + } else if (IS_HASWELL(engine->i915)) { cmd_tables = hsw_blt_ring_cmd_table; cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmd_table); } else { @@ -878,15 +971,17 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine) cmd_table_count = ARRAY_SIZE(gen7_blt_cmd_table); } - if (IS_HASWELL(engine->i915)) { + if (IS_GEN9(engine->i915)) { + engine->reg_tables = gen9_blt_reg_tables; + engine->reg_table_count = + ARRAY_SIZE(gen9_blt_reg_tables); + } else if (IS_HASWELL(engine->i915)) { engine->reg_tables = hsw_blt_reg_tables; engine->reg_table_count = ARRAY_SIZE(hsw_blt_reg_tables); } else { engine->reg_tables = ivb_blt_reg_tables; engine->reg_table_count = ARRAY_SIZE(ivb_blt_reg_tables); } - - engine->get_cmd_length_mask = gen7_blt_get_cmd_length_mask; break; case VECS: cmd_tables = hsw_vebox_cmd_table; @@ -1260,9 +1355,9 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, } /* - * If the batch buffer contains a chained batch, return an - * error that tells the caller to abort and dispatch the - * workload as a non-secure batch. + * We don't try to handle BATCH_BUFFER_START because it adds + * non-trivial complexity. Instead we abort the scan and return + * and error to indicate that the batch is unsafe. */ if (desc->cmd.value == MI_BATCH_BUFFER_START) { ret = -EACCES; @@ -1342,6 +1437,7 @@ int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv) * the parser enabled. * 9. Don't whitelist or handle oacontrol specially, as ownership * for oacontrol state is moving to i915-perf. + * 10. 
Support for Gen9 BCS Parsing */ - return 9; + return 10; } diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index 6f5e1e18e530af..47d178817a29dd 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -159,7 +159,8 @@ int intel_sanitize_enable_ppgtt(struct drm_i915_private *dev_priv, if (enable_ppgtt == 0 && INTEL_GEN(dev_priv) < 9) return 0; - if (enable_ppgtt == 1) + /* Full PPGTT is required by the Gen9 cmdparser */ + if (enable_ppgtt == 1 && INTEL_GEN(dev_priv) != 9) return 1; if (enable_ppgtt == 2 && has_full_ppgtt) diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 2e706f1abe64f0..ca2845609e7e1a 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -703,6 +703,10 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) */ #define BCS_SWCTRL _MMIO(0x22200) +/* There are 16 GPR registers */ +#define BCS_GPR(n) _MMIO(0x22600 + (n) * 8) +#define BCS_GPR_UDW(n) _MMIO(0x22600 + (n) * 8 + 4) + #define GPGPU_THREADS_DISPATCHED _MMIO(0x2290) #define GPGPU_THREADS_DISPATCHED_UDW _MMIO(0x2290 + 4) #define HS_INVOCATION_COUNT _MMIO(0x2300) From 54ffc33792e48a128cd7a9105562d0078f2f467c Mon Sep 17 00:00:00 2001 From: Jon Bloomfield Date: Thu, 27 Sep 2018 10:23:17 -0700 Subject: [PATCH 123/155] drm/i915/cmdparser: Use explicit goto for error paths commit 0546a29cd884fb8184731c79ab008927ca8859d0 upstream. In the next patch we will be adding a second valid termination condition which will require a small amount of refactoring to share logic with the BB_END case. Refactor all error conditions to jump to a dedicated exit path, with 'break' reserved only for a successful parse. Signed-off-by: Jon Bloomfield Cc: Tony Luck Cc: Dave Airlie Cc: Takashi Iwai Cc: Tyler Hicks Signed-off-by: Mika Kuoppala Reviewed-by: Chris Wilson Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_cmd_parser.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 09f1672488bbe3..1ff3c8fd228fb1 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -1337,21 +1337,15 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, do { u32 length; - if (*cmd == MI_BATCH_BUFFER_END) { - if (needs_clflush_after) { - void *ptr = page_mask_bits(shadow_batch_obj->mm.mapping); - drm_clflush_virt_range(ptr, - (void *)(cmd + 1) - ptr); - } + if (*cmd == MI_BATCH_BUFFER_END) break; - } desc = find_cmd(engine, *cmd, desc, &default_desc); if (!desc) { DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n", *cmd); ret = -EINVAL; - break; + goto err; } /* @@ -1361,7 +1355,7 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, */ if (desc->cmd.value == MI_BATCH_BUFFER_START) { ret = -EACCES; - break; + goto err; } if (desc->flags & CMD_DESC_FIXED) @@ -1375,22 +1369,29 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, length, batch_end - cmd); ret = -EINVAL; - break; + goto err; } if (!check_cmd(engine, desc, cmd, length)) { ret = -EACCES; - break; + goto err; } cmd += length; if (cmd >= batch_end) { DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n"); ret = -EINVAL; - break; + goto err; } } while (1); + if (needs_clflush_after) { + void *ptr = page_mask_bits(shadow_batch_obj->mm.mapping); + + drm_clflush_virt_range(ptr, (void *)(cmd + 1) - ptr); + } + +err: 
i915_gem_object_unpin_map(shadow_batch_obj); return ret; } From 6c410451e9dfa163687b51f8d98d3f55f83d7bac Mon Sep 17 00:00:00 2001 From: Jon Bloomfield Date: Thu, 20 Sep 2018 09:58:36 -0700 Subject: [PATCH 124/155] drm/i915/cmdparser: Add support for backward jumps commit f8c08d8faee5567803c8c533865296ca30286bbf upstream. To keep things manageable, the pre-gen9 cmdparser does not attempt to track any form of nested BB_START's. This did not prevent usermode from using nested starts, or even chained batches because the cmdparser is not strictly enforced pre gen9. Instead, the existence of a nested BB_START would cause the batch to be emitted in insecure mode, and any privileged capabilities would not be available. For Gen9, the cmdparser becomes mandatory (for BCS at least), and so not providing any form of nested BB_START support becomes overly restrictive. Any such batch will simply not run. We make heavy use of backward jumps in igt, and it is much easier to add support for this restricted subset of nested jumps, than to rewrite the whole of our test suite to avoid them. Add the required logic to support limited backward jumps, to instructions that have already been validated by the parser. Note that it's not sufficient to simply approve any BB_START that jumps backwards in the buffer because this would allow an attacker to embed a rogue instruction sequence within the operand words of a harmless instruction (say LRI) and jump to that. We introduce a bit array to track every instr offset successfully validated, and test the target of BB_START against this. If the target offset hits, it is re-written to the same offset in the shadow buffer and the BB_START cmd is allowed. Note: This patch deliberately ignores checkpatch issues in the cmdtables, in order to match the style of the surrounding code. We'll correct the entire file in one go in a later patch. v2: set dispatch secure late (Mika) v3: rebase (Mika) v4: Clear whitelist on each parse Minor review updates (Chris) v5: Correct backward jump batching v6: fix compilation error due to struct eb shuffle (Mika) Signed-off-by: Jon Bloomfield Cc: Tony Luck Cc: Dave Airlie Cc: Takashi Iwai Cc: Tyler Hicks Signed-off-by: Mika Kuoppala Reviewed-by: Chris Wilson Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_cmd_parser.c | 151 +++++++++++++++++++-- drivers/gpu/drm/i915/i915_drv.h | 9 +- drivers/gpu/drm/i915/i915_gem_context.c | 5 + drivers/gpu/drm/i915/i915_gem_context.h | 6 + drivers/gpu/drm/i915/i915_gem_execbuffer.c | 32 +++-- 5 files changed, 177 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 1ff3c8fd228fb1..23d220fbca5fe4 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -481,6 +481,19 @@ static const struct drm_i915_cmd_descriptor gen9_blt_cmds[] = { .reg = { .offset = 1, .mask = 0x007FFFFC } ), CMD( MI_LOAD_REGISTER_REG, SMI, !F, 0xFF, W, .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 1 } ), + + /* + * We allow BB_START but apply further checks. We just sanitize the + * basic fields here. 
+ */ +#define MI_BB_START_OPERAND_MASK GENMASK(SMI-1, 0) +#define MI_BB_START_OPERAND_EXPECT (MI_BATCH_PPGTT_HSW | 1) + CMD( MI_BATCH_BUFFER_START_GEN8, SMI, !F, 0xFF, B, + .bits = {{ + .offset = 0, + .mask = MI_BB_START_OPERAND_MASK, + .expected = MI_BB_START_OPERAND_EXPECT, + }}, ), }; static const struct drm_i915_cmd_descriptor noop_desc = @@ -1292,15 +1305,113 @@ static bool check_cmd(const struct intel_engine_cs *engine, return true; } +static int check_bbstart(const struct i915_gem_context *ctx, + u32 *cmd, u32 offset, u32 length, + u32 batch_len, + u64 batch_start, + u64 shadow_batch_start) +{ + u64 jump_offset, jump_target; + u32 target_cmd_offset, target_cmd_index; + + /* For igt compatibility on older platforms */ + if (CMDPARSER_USES_GGTT(ctx->i915)) { + DRM_DEBUG("CMD: Rejecting BB_START for ggtt based submission\n"); + return -EACCES; + } + + if (length != 3) { + DRM_DEBUG("CMD: Recursive BB_START with bad length(%u)\n", + length); + return -EINVAL; + } + + jump_target = *(u64*)(cmd+1); + jump_offset = jump_target - batch_start; + + /* + * Any underflow of jump_target is guaranteed to be outside the range + * of a u32, so >= test catches both too large and too small + */ + if (jump_offset >= batch_len) { + DRM_DEBUG("CMD: BB_START to 0x%llx jumps out of BB\n", + jump_target); + return -EINVAL; + } + + /* + * This cannot overflow a u32 because we already checked jump_offset + * is within the BB, and the batch_len is a u32 + */ + target_cmd_offset = lower_32_bits(jump_offset); + target_cmd_index = target_cmd_offset / sizeof(u32); + + *(u64*)(cmd + 1) = shadow_batch_start + target_cmd_offset; + + if (target_cmd_index == offset) + return 0; + + if (ctx->jump_whitelist_cmds <= target_cmd_index) { + DRM_DEBUG("CMD: Rejecting BB_START - truncated whitelist array\n"); + return -EINVAL; + } else if (!test_bit(target_cmd_index, ctx->jump_whitelist)) { + DRM_DEBUG("CMD: BB_START to 0x%llx not a previously executed cmd\n", + jump_target); + return -EINVAL; + } + + return 0; +} + +static void init_whitelist(struct i915_gem_context *ctx, u32 batch_len) +{ + const u32 batch_cmds = DIV_ROUND_UP(batch_len, sizeof(u32)); + const u32 exact_size = BITS_TO_LONGS(batch_cmds); + u32 next_size = BITS_TO_LONGS(roundup_pow_of_two(batch_cmds)); + unsigned long *next_whitelist; + + if (CMDPARSER_USES_GGTT(ctx->i915)) + return; + + if (batch_cmds <= ctx->jump_whitelist_cmds) { + memset(ctx->jump_whitelist, 0, exact_size * sizeof(u32)); + return; + } + +again: + next_whitelist = kcalloc(next_size, sizeof(long), GFP_KERNEL); + if (next_whitelist) { + kfree(ctx->jump_whitelist); + ctx->jump_whitelist = next_whitelist; + ctx->jump_whitelist_cmds = + next_size * BITS_PER_BYTE * sizeof(long); + return; + } + + if (next_size > exact_size) { + next_size = exact_size; + goto again; + } + + DRM_DEBUG("CMD: Failed to extend whitelist. 
BB_START may be disallowed\n"); + memset(ctx->jump_whitelist, 0, + BITS_TO_LONGS(ctx->jump_whitelist_cmds) * sizeof(u32)); + + return; +} + #define LENGTH_BIAS 2 /** * i915_parse_cmds() - parse a submitted batch buffer for privilege violations + * @ctx: the context in which the batch is to execute * @engine: the engine on which the batch is to execute * @batch_obj: the batch buffer in question - * @shadow_batch_obj: copy of the batch buffer in question + * @batch_start: Canonical base address of batch * @batch_start_offset: byte offset in the batch at which execution starts * @batch_len: length of the commands in batch_obj + * @shadow_batch_obj: copy of the batch buffer in question + * @shadow_batch_start: Canonical base address of shadow_batch_obj * * Parses the specified batch buffer looking for privilege violations as * described in the overview. @@ -1308,13 +1419,17 @@ static bool check_cmd(const struct intel_engine_cs *engine, * Return: non-zero if the parser finds violations or otherwise fails; -EACCES * if the batch appears legal but should use hardware parsing */ -int intel_engine_cmd_parser(struct intel_engine_cs *engine, + +int intel_engine_cmd_parser(struct i915_gem_context *ctx, + struct intel_engine_cs *engine, struct drm_i915_gem_object *batch_obj, - struct drm_i915_gem_object *shadow_batch_obj, + u64 batch_start, u32 batch_start_offset, - u32 batch_len) + u32 batch_len, + struct drm_i915_gem_object *shadow_batch_obj, + u64 shadow_batch_start) { - u32 *cmd, *batch_end; + u32 *cmd, *batch_end, offset = 0; struct drm_i915_cmd_descriptor default_desc = noop_desc; const struct drm_i915_cmd_descriptor *desc = &default_desc; bool needs_clflush_after = false; @@ -1328,6 +1443,8 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, return PTR_ERR(cmd); } + init_whitelist(ctx, batch_len); + /* * We use the batch length as size because the shadow object is as * large or larger and copy_batch() will write MI_NOPs to the extra @@ -1348,16 +1465,6 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, goto err; } - /* - * We don't try to handle BATCH_BUFFER_START because it adds - * non-trivial complexity. Instead we abort the scan and return - * and error to indicate that the batch is unsafe. 
- */ - if (desc->cmd.value == MI_BATCH_BUFFER_START) { - ret = -EACCES; - goto err; - } - if (desc->flags & CMD_DESC_FIXED) length = desc->length.fixed; else @@ -1377,7 +1484,21 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, goto err; } + if (desc->cmd.value == MI_BATCH_BUFFER_START) { + ret = check_bbstart(ctx, cmd, offset, length, + batch_len, batch_start, + shadow_batch_start); + + if (ret) + goto err; + break; + } + + if (ctx->jump_whitelist_cmds > offset) + set_bit(offset, ctx->jump_whitelist); + cmd += length; + offset += length; if (cmd >= batch_end) { DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n"); ret = -EINVAL; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e50030c606656f..e7e5f94e97f75b 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3857,11 +3857,14 @@ const char *i915_cache_level_str(struct drm_i915_private *i915, int type); int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv); void intel_engine_init_cmd_parser(struct intel_engine_cs *engine); void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine); -int intel_engine_cmd_parser(struct intel_engine_cs *engine, +int intel_engine_cmd_parser(struct i915_gem_context *cxt, + struct intel_engine_cs *engine, struct drm_i915_gem_object *batch_obj, - struct drm_i915_gem_object *shadow_batch_obj, + u64 user_batch_start, u32 batch_start_offset, - u32 batch_len); + u32 batch_len, + struct drm_i915_gem_object *shadow_batch_obj, + u64 shadow_batch_start); /* i915_perf.c */ extern void i915_perf_init(struct drm_i915_private *dev_priv); diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 8afd2ce59b8d50..3925a63c1661fc 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -141,6 +141,8 @@ static void i915_gem_context_free(struct i915_gem_context *ctx) __i915_gem_object_release_unless_active(ce->state->obj); } + kfree(ctx->jump_whitelist); + kfree(ctx->name); put_pid(ctx->pid); @@ -321,6 +323,9 @@ __create_hw_context(struct drm_i915_private *dev_priv, else ctx->ggtt_offset_bias = I915_GTT_PAGE_SIZE; + ctx->jump_whitelist = NULL; + ctx->jump_whitelist_cmds = 0; + return ctx; err_pid: diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h index 44688e22a5c227..b651c5f427b94f 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.h +++ b/drivers/gpu/drm/i915/i915_gem_context.h @@ -181,6 +181,12 @@ struct i915_gem_context { /** remap_slice: Bitmask of cache lines that need remapping */ u8 remap_slice; + /** jump_whitelist: Bit array for tracking cmds during cmdparsing */ + unsigned long *jump_whitelist; + + /** jump_whitelist_cmds: No of cmd slots available */ + u32 jump_whitelist_cmds; + /** handles_vma: rbtree to look up our context specific obj/vma for * the user handle. 
(user handles are per fd, but the binding is * per vm, which may be one per context or shared with the global GTT) diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 12d44ea36d24a7..d99d05a91032e6 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -1927,7 +1927,6 @@ shadow_batch_pin(struct i915_execbuffer *eb, struct drm_i915_gem_object *obj) if (CMDPARSER_USES_GGTT(dev_priv)) { flags = PIN_GLOBAL; vm = &dev_priv->ggtt.base; - eb->batch_flags |= I915_DISPATCH_SECURE; } else if (eb->vm->has_read_only) { flags = PIN_USER; vm = eb->vm; @@ -1944,6 +1943,8 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb) { struct drm_i915_gem_object *shadow_batch_obj; struct i915_vma *vma; + u64 batch_start; + u64 shadow_batch_start; int err; shadow_batch_obj = i915_gem_batch_pool_get(&eb->engine->batch_pool, @@ -1951,12 +1952,27 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb) if (IS_ERR(shadow_batch_obj)) return ERR_CAST(shadow_batch_obj); - err = intel_engine_cmd_parser(eb->engine, + vma = shadow_batch_pin(eb, shadow_batch_obj); + if (IS_ERR(vma)) + goto out; + + batch_start = gen8_canonical_addr(eb->batch->node.start) + + eb->batch_start_offset; + + shadow_batch_start = gen8_canonical_addr(vma->node.start); + + err = intel_engine_cmd_parser(eb->ctx, + eb->engine, eb->batch->obj, - shadow_batch_obj, + batch_start, eb->batch_start_offset, - eb->batch_len); + eb->batch_len, + shadow_batch_obj, + shadow_batch_start); + if (err) { + i915_vma_unpin(vma); + /* * Unsafe GGTT-backed buffers can still be submitted safely * as non-secure. @@ -1968,12 +1984,9 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb) vma = NULL; else vma = ERR_PTR(err); - goto out; - } - vma = shadow_batch_pin(eb, shadow_batch_obj); - if (IS_ERR(vma)) goto out; + } eb->vma[eb->buffer_count] = i915_vma_get(vma); eb->flags[eb->buffer_count] = @@ -1984,6 +1997,9 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb) eb->batch = vma; /* eb->batch_len unchanged */ + if (CMDPARSER_USES_GGTT(eb->i915)) + eb->batch_flags |= I915_DISPATCH_SECURE; + out: i915_gem_object_unpin_pages(shadow_batch_obj); return vma; From b08903f89ee6e7b8116070bcee13eda9fd357aca Mon Sep 17 00:00:00 2001 From: Jon Bloomfield Date: Thu, 20 Sep 2018 09:45:10 -0700 Subject: [PATCH 125/155] drm/i915/cmdparser: Ignore Length operands during command matching commit 926abff21a8f29ef159a3ac893b05c6e50e043c3 upstream. Some of the gen instruction macros (e.g. MI_DISPLAY_FLIP) have the length directly encoded in them. Since these are used directly in the tables, the Length becomes part of the comparison used for matching during parsing. Thus, if the cmd being parsed has a different length to that in the table, it is not matched and the cmd is accepted via the default variable length path. Fix by masking out everything except the Opcode in the cmd tables Cc: Tony Luck Cc: Dave Airlie Cc: Takashi Iwai Cc: Tyler Hicks Signed-off-by: Jon Bloomfield Reviewed-by: Chris Wilson Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_cmd_parser.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 23d220fbca5fe4..5c2ae816ac3217 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -187,7 +187,7 @@ struct drm_i915_cmd_table { #define CMD(op, opm, f, lm, fl, ...) 
\ { \ .flags = (fl) | ((f) ? CMD_DESC_FIXED : 0), \ - .cmd = { (op), ~0u << (opm) }, \ + .cmd = { (op & ~0u << (opm)), ~0u << (opm) }, \ .length = { (lm) }, \ __VA_ARGS__ \ } From d302d64a90b0b390ead3f70832aac2662e34323b Mon Sep 17 00:00:00 2001 From: Uma Shankar Date: Tue, 7 Aug 2018 21:15:35 +0530 Subject: [PATCH 126/155] drm/i915: Lower RM timeout to avoid DSI hard hangs commit 1d85a299c4db57c55e0229615132c964d17aa765 upstream. In BXT/APL, device 2 MMIO reads from the MIPI controller require its PLL to be turned ON. When the MIPI PLL is turned off (MIPI display is not active or connected), and someone (host or GT engine) tries to read MIPI registers, it causes a hard hang. This is a hardware restriction or limitation. The driver by itself doesn't read MIPI registers when the MIPI display is off. But any userspace application can submit an unprivileged batch buffer for execution, and that batch buffer can contain MMIO reads. These reads are allowed even for unprivileged applications. If such register reads target the MIPI DSI controller while the MIPI display is not active, the MMIO read operation causes a system hard hang and the only way to recover is a hard reboot. A genuine process/application won't submit a batch buffer like this and doesn't cause any issue. But on a compromised system, a malicious userspace process/app can generate such a batch buffer and trigger a system hard hang (denial of service attack). The fix is to lower the internal MMIO timeout value to an optimum value of 950us as recommended by the hardware team. If the timeout is beyond 1ms (which will hit for any value we choose if an MMIO READ of a DSI specific register is performed without the PLL ON), it causes the system to hang. But if the timeout value is lower than 1ms, it stays below the threshold (even if a timeout happens) and the system will not get into a hung state. This avoids a system hang without losing any programming or GT interrupts, taking the worst case of lowest CDCLK frequency and early DC5 abort into account. Signed-off-by: Uma Shankar Reviewed-by: Jon Bloomfield Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_reg.h | 4 ++++ drivers/gpu/drm/i915/intel_pm.c | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index ca2845609e7e1a..e95547aad31247 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -6726,6 +6726,10 @@ enum { #define SKL_CSR_DC5_DC6_COUNT _MMIO(0x8002C) #define BXT_CSR_DC3_DC5_COUNT _MMIO(0x80038) +/* Display Internal Timeout Register */ +#define RM_TIMEOUT _MMIO(0x42060) +#define MMIO_TIMEOUT_US(us) ((us) << 0) + /* interrupts */ #define DE_MASTER_IRQ_CONTROL (1 << 31) #define DE_SPRITEB_FLIP_DONE (1 << 29) diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index cb377b003321a8..391c248a57553b 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -121,6 +121,14 @@ static void bxt_init_clock_gating(struct drm_i915_private *dev_priv) */ I915_WRITE(GEN9_CLKGATE_DIS_0, I915_READ(GEN9_CLKGATE_DIS_0) | PWM1_GATING_DIS | PWM2_GATING_DIS); + + /* + * Lower the display internal timeout. + * This is needed to avoid any hard hangs when DSI port PLL + * is off and a MMIO access is attempted by any privilege + * application, using batch buffers or any other means.
+ */ + I915_WRITE(RM_TIMEOUT, MMIO_TIMEOUT_US(950)); } static void glk_init_clock_gating(struct drm_i915_private *dev_priv) From 1a5a64e0bde89f5c768353fce5307b9816c00af7 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Mon, 9 Jul 2018 18:24:27 +0300 Subject: [PATCH 127/155] drm/i915/gen8+: Add RC6 CTX corruption WA commit 7e34f4e4aad3fd34c02b294a3cf2321adf5b4438 upstream. In some circumstances the RC6 context can get corrupted. We can detect this and take the required action, that is disable RC6 and runtime PM. The HW recovers from the corrupted state after a system suspend/resume cycle, so detect the recovery and re-enable RC6 and runtime PM. v2: rebase (Mika) v3: - Move intel_suspend_gt_powersave() to the end of the GEM suspend sequence. - Add commit message. v4: - Rebased on intel_uncore_forcewake_put(i915->uncore, ...) API change. v5: - Rebased on latest upstream gt_pm refactoring. Signed-off-by: Imre Deak Signed-off-by: Mika Kuoppala Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_drv.c | 2 + drivers/gpu/drm/i915/i915_drv.h | 6 +- drivers/gpu/drm/i915/i915_gem.c | 6 + drivers/gpu/drm/i915/i915_gem_request.c | 4 + drivers/gpu/drm/i915/i915_reg.h | 2 + drivers/gpu/drm/i915/intel_drv.h | 3 + drivers/gpu/drm/i915/intel_pm.c | 153 +++++++++++++++++++++--- 7 files changed, 158 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 9b642dd040cfd7..02a2af7c8166fb 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -1564,6 +1564,7 @@ static int i915_drm_suspend_late(struct drm_device *dev, bool hibernation) disable_rpm_wakeref_asserts(dev_priv); intel_display_set_init_power(dev_priv, false); + i915_rc6_ctx_wa_suspend(dev_priv); fw_csr = !IS_GEN9_LP(dev_priv) && suspend_to_idle(dev_priv) && dev_priv->csr.dmc_payload; @@ -1800,6 +1801,7 @@ static int i915_drm_resume_early(struct drm_device *dev) intel_display_set_init_power(dev_priv, true); i915_gem_sanitize(dev_priv); + i915_rc6_ctx_wa_resume(dev_priv); enable_rpm_wakeref_asserts(dev_priv); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e7e5f94e97f75b..a5fb7404b29ea1 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1320,6 +1320,7 @@ struct intel_gen6_power_mgmt { enum { LOW_POWER, BETWEEN, HIGH_POWER } power; bool enabled; + bool ctx_corrupted; struct delayed_work autoenable_work; atomic_t num_waiters; atomic_t boosts; @@ -3025,9 +3026,12 @@ intel_info(const struct drm_i915_private *dev_priv) /* Early gen2 have a totally busted CS tlb and require pinned batches. 
*/ #define HAS_BROKEN_CS_TLB(dev_priv) (IS_I830(dev_priv) || IS_I845G(dev_priv)) +#define NEEDS_RC6_CTX_CORRUPTION_WA(dev_priv) \ + (IS_BROADWELL(dev_priv) || INTEL_GEN(dev_priv) == 9) + /* WaRsDisableCoarsePowerGating:skl,bxt */ #define NEEDS_WaRsDisableCoarsePowerGating(dev_priv) \ - (IS_SKL_GT3(dev_priv) || IS_SKL_GT4(dev_priv)) + (INTEL_GEN(dev_priv) == 9) /* * dp aux and gmbus irq on gen4 seems to be able to generate legacy interrupts diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 785d192b27c804..9263b65720bc61 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3243,6 +3243,12 @@ i915_gem_idle_work_handler(struct work_struct *work) if (INTEL_GEN(dev_priv) >= 6) gen6_rps_idle(dev_priv); + + if (NEEDS_RC6_CTX_CORRUPTION_WA(dev_priv)) { + i915_rc6_ctx_wa_check(dev_priv); + intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); + } + intel_runtime_pm_put(dev_priv); out_unlock: mutex_unlock(&dev->struct_mutex); diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index 813a3b546d6e2d..1d556dcbd65626 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -252,6 +252,10 @@ static void mark_busy(struct drm_i915_private *i915) GEM_BUG_ON(!i915->gt.active_requests); intel_runtime_pm_get_noresume(i915); + + if (NEEDS_RC6_CTX_CORRUPTION_WA(i915)) + intel_uncore_forcewake_get(i915, FORCEWAKE_ALL); + i915->gt.awake = true; intel_enable_gt_powersave(i915); diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index e95547aad31247..1db70350af0bf1 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -358,6 +358,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define GEN8_CONFIG0 _MMIO(0xD00) #define GEN9_DEFAULT_FIXES (1 << 3 | 1 << 2 | 1 << 1) +#define GEN8_RC6_CTX_INFO _MMIO(0x8504) + #define GAC_ECO_BITS _MMIO(0x14090) #define ECOBITS_SNB_BIT (1<<13) #define ECOBITS_PPGTT_CACHE64B (3<<8) diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index 3adb9c3b412e7a..cba10cdab2a962 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -1838,6 +1838,9 @@ void intel_enable_gt_powersave(struct drm_i915_private *dev_priv); void intel_autoenable_gt_powersave(struct drm_i915_private *dev_priv); void intel_disable_gt_powersave(struct drm_i915_private *dev_priv); void intel_suspend_gt_powersave(struct drm_i915_private *dev_priv); +bool i915_rc6_ctx_wa_check(struct drm_i915_private *i915); +void i915_rc6_ctx_wa_suspend(struct drm_i915_private *i915); +void i915_rc6_ctx_wa_resume(struct drm_i915_private *i915); void gen6_rps_busy(struct drm_i915_private *dev_priv); void gen6_rps_reset_ei(struct drm_i915_private *dev_priv); void gen6_rps_idle(struct drm_i915_private *dev_priv); diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 391c248a57553b..674410682ccc2c 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -6282,19 +6282,23 @@ static void gen9_disable_rps(struct drm_i915_private *dev_priv) I915_WRITE(GEN6_RP_CONTROL, 0); } -static void gen6_disable_rps(struct drm_i915_private *dev_priv) +static void gen6_disable_rc6(struct drm_i915_private *dev_priv) { I915_WRITE(GEN6_RC_CONTROL, 0); +} + +static void gen6_disable_rps(struct drm_i915_private *dev_priv) +{ I915_WRITE(GEN6_RPNSWREQ, 1 << 31); I915_WRITE(GEN6_RP_CONTROL, 0); } -static void 
cherryview_disable_rps(struct drm_i915_private *dev_priv) +static void cherryview_disable_rc6(struct drm_i915_private *dev_priv) { I915_WRITE(GEN6_RC_CONTROL, 0); } -static void valleyview_disable_rps(struct drm_i915_private *dev_priv) +static void valleyview_disable_rc6(struct drm_i915_private *dev_priv) { /* we're doing forcewake before Disabling RC6, * This what the BIOS expects when going into suspend */ @@ -6545,7 +6549,8 @@ static void gen9_enable_rc6(struct drm_i915_private *dev_priv) I915_WRITE(GEN9_RENDER_PG_IDLE_HYSTERESIS, 25); /* 3a: Enable RC6 */ - if (intel_enable_rc6() & INTEL_RC6_ENABLE) + if (!dev_priv->rps.ctx_corrupted && + intel_enable_rc6() & INTEL_RC6_ENABLE) rc6_mask = GEN6_RC_CTL_RC6_ENABLE; DRM_INFO("RC6 %s\n", onoff(rc6_mask & GEN6_RC_CTL_RC6_ENABLE)); I915_WRITE(GEN6_RC6_THRESHOLD, 37500); /* 37.5/125ms per EI */ @@ -6594,7 +6599,8 @@ static void gen8_enable_rps(struct drm_i915_private *dev_priv) I915_WRITE(GEN6_RC6_THRESHOLD, 50000); /* 50/125ms per EI */ /* 3: Enable RC6 */ - if (intel_enable_rc6() & INTEL_RC6_ENABLE) + if (!dev_priv->rps.ctx_corrupted && + intel_enable_rc6() & INTEL_RC6_ENABLE) rc6_mask = GEN6_RC_CTL_RC6_ENABLE; intel_print_rc6_info(dev_priv, rc6_mask); if (IS_BROADWELL(dev_priv)) @@ -7775,6 +7781,95 @@ static void intel_init_emon(struct drm_i915_private *dev_priv) dev_priv->ips.corr = (lcfuse & LCFUSE_HIV_MASK); } +static bool i915_rc6_ctx_corrupted(struct drm_i915_private *dev_priv) +{ + return !I915_READ(GEN8_RC6_CTX_INFO); +} + +static void i915_rc6_ctx_wa_init(struct drm_i915_private *i915) +{ + if (!NEEDS_RC6_CTX_CORRUPTION_WA(i915)) + return; + + if (i915_rc6_ctx_corrupted(i915)) { + DRM_INFO("RC6 context corrupted, disabling runtime power management\n"); + i915->rps.ctx_corrupted = true; + intel_runtime_pm_get(i915); + } +} + +static void i915_rc6_ctx_wa_cleanup(struct drm_i915_private *i915) +{ + if (i915->rps.ctx_corrupted) { + intel_runtime_pm_put(i915); + i915->rps.ctx_corrupted = false; + } +} + +/** + * i915_rc6_ctx_wa_suspend - system suspend sequence for the RC6 CTX WA + * @i915: i915 device + * + * Perform any steps needed to clean up the RC6 CTX WA before system suspend. + */ +void i915_rc6_ctx_wa_suspend(struct drm_i915_private *i915) +{ + if (i915->rps.ctx_corrupted) + intel_runtime_pm_put(i915); +} + +/** + * i915_rc6_ctx_wa_resume - system resume sequence for the RC6 CTX WA + * @i915: i915 device + * + * Perform any steps needed to re-init the RC6 CTX WA after system resume. + */ +void i915_rc6_ctx_wa_resume(struct drm_i915_private *i915) +{ + if (!i915->rps.ctx_corrupted) + return; + + if (i915_rc6_ctx_corrupted(i915)) { + intel_runtime_pm_get(i915); + return; + } + + DRM_INFO("RC6 context restored, re-enabling runtime power management\n"); + i915->rps.ctx_corrupted = false; +} + +static void intel_disable_rc6(struct drm_i915_private *dev_priv); + +/** + * i915_rc6_ctx_wa_check - check for a new RC6 CTX corruption + * @i915: i915 device + * + * Check if an RC6 CTX corruption has happened since the last check and if so + * disable RC6 and runtime power management. + * + * Return false if no context corruption has happened since the last call of + * this function, true otherwise. 
+*/ +bool i915_rc6_ctx_wa_check(struct drm_i915_private *i915) +{ + if (!NEEDS_RC6_CTX_CORRUPTION_WA(i915)) + return false; + + if (i915->rps.ctx_corrupted) + return false; + + if (!i915_rc6_ctx_corrupted(i915)) + return false; + + DRM_NOTE("RC6 context corruption, disabling runtime power management\n"); + + intel_disable_rc6(i915); + i915->rps.ctx_corrupted = true; + intel_runtime_pm_get_noresume(i915); + + return true; +} + void intel_init_gt_powersave(struct drm_i915_private *dev_priv) { /* @@ -7789,6 +7884,8 @@ void intel_init_gt_powersave(struct drm_i915_private *dev_priv) mutex_lock(&dev_priv->drm.struct_mutex); mutex_lock(&dev_priv->rps.hw_lock); + i915_rc6_ctx_wa_init(dev_priv); + /* Initialize RPS limits (for userspace) */ if (IS_CHERRYVIEW(dev_priv)) cherryview_init_gt_powersave(dev_priv); @@ -7838,6 +7935,8 @@ void intel_cleanup_gt_powersave(struct drm_i915_private *dev_priv) if (IS_VALLEYVIEW(dev_priv)) valleyview_cleanup_gt_powersave(dev_priv); + i915_rc6_ctx_wa_cleanup(dev_priv); + if (!i915.enable_rc6) intel_runtime_pm_put(dev_priv); } @@ -7869,27 +7968,47 @@ void intel_sanitize_gt_powersave(struct drm_i915_private *dev_priv) gen6_reset_rps_interrupts(dev_priv); } -void intel_disable_gt_powersave(struct drm_i915_private *dev_priv) +static void __intel_disable_rc6(struct drm_i915_private *dev_priv) { - if (!READ_ONCE(dev_priv->rps.enabled)) - return; + if (INTEL_GEN(dev_priv) >= 9) + gen9_disable_rc6(dev_priv); + else if (IS_CHERRYVIEW(dev_priv)) + cherryview_disable_rc6(dev_priv); + else if (IS_VALLEYVIEW(dev_priv)) + valleyview_disable_rc6(dev_priv); + else if (INTEL_GEN(dev_priv) >= 6) + gen6_disable_rc6(dev_priv); +} +static void intel_disable_rc6(struct drm_i915_private *dev_priv) +{ mutex_lock(&dev_priv->rps.hw_lock); + __intel_disable_rc6(dev_priv); + mutex_unlock(&dev_priv->rps.hw_lock); +} - if (INTEL_GEN(dev_priv) >= 9) { - gen9_disable_rc6(dev_priv); +static void intel_disable_rps(struct drm_i915_private *dev_priv) +{ + if (INTEL_GEN(dev_priv) >= 9) gen9_disable_rps(dev_priv); - } else if (IS_CHERRYVIEW(dev_priv)) { - cherryview_disable_rps(dev_priv); - } else if (IS_VALLEYVIEW(dev_priv)) { - valleyview_disable_rps(dev_priv); - } else if (INTEL_GEN(dev_priv) >= 6) { + else if (INTEL_GEN(dev_priv) >= 6) gen6_disable_rps(dev_priv); - } else if (IS_IRONLAKE_M(dev_priv)) { + else if (IS_IRONLAKE_M(dev_priv)) ironlake_disable_drps(dev_priv); - } +} + +void intel_disable_gt_powersave(struct drm_i915_private *dev_priv) +{ + if (!READ_ONCE(dev_priv->rps.enabled)) + return; + + mutex_lock(&dev_priv->rps.hw_lock); + + __intel_disable_rc6(dev_priv); + intel_disable_rps(dev_priv); dev_priv->rps.enabled = false; + mutex_unlock(&dev_priv->rps.hw_lock); } From e78af200c633837b0ad98dd9d8612e0e5bbe83c7 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 11 Nov 2019 08:13:24 -0800 Subject: [PATCH 128/155] drm/i915/cmdparser: Fix jump whitelist clearing commit ea0b163b13ffc52818c079adb00d55e227a6da6f upstream. When a jump_whitelist bitmap is reused, it needs to be cleared. Currently this is done with memset() and the size calculation assumes bitmaps are made of 32-bit words, not longs. So on 64-bit architectures, only the first half of the bitmap is cleared. If some whitelist bits are carried over between successive batches submitted on the same context, this will presumably allow embedding the rogue instructions that we're trying to reject. Use bitmap_zero() instead, which gets the calculation right. 
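A quick way to see the arithmetic behind this fix (editor's illustration, not part of the patch): on a 64-bit build, sizing the memset() in units of u32 clears only half the storage that BITS_TO_LONGS() actually allocates. A minimal userspace sketch with stand-ins for the kernel macros:

#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG (sizeof(long) * CHAR_BIT)
#define BITS_TO_LONGS(nbits) (((nbits) + BITS_PER_LONG - 1) / BITS_PER_LONG)

int main(void)
{
	unsigned long nbits = 256;	/* e.g. one whitelist bit per batch command */

	/* old size calculation: counted in longs, but bytes taken from u32 */
	size_t buggy = BITS_TO_LONGS(nbits) * sizeof(unsigned int);
	/* what bitmap_zero() effectively clears */
	size_t right = BITS_TO_LONGS(nbits) * sizeof(unsigned long);

	printf("%lu-bit map: old memset cleared %zu bytes, bitmap needs %zu\n",
	       nbits, buggy, right);
	return 0;
}

With 8-byte longs this prints 16 versus 32 bytes, i.e. the upper half of the whitelist keeps stale bits, which is exactly the carried-over state the commit message describes.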
Fixes: f8c08d8faee5 ("drm/i915/cmdparser: Add support for backward jumps") Signed-off-by: Ben Hutchings Signed-off-by: Jon Bloomfield Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_cmd_parser.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 5c2ae816ac3217..e4b9eb1f6b6021 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -1374,7 +1374,7 @@ static void init_whitelist(struct i915_gem_context *ctx, u32 batch_len) return; if (batch_cmds <= ctx->jump_whitelist_cmds) { - memset(ctx->jump_whitelist, 0, exact_size * sizeof(u32)); + bitmap_zero(ctx->jump_whitelist, batch_cmds); return; } @@ -1394,8 +1394,7 @@ static void init_whitelist(struct i915_gem_context *ctx, u32 batch_len) } DRM_DEBUG("CMD: Failed to extend whitelist. BB_START may be disallowed\n"); - memset(ctx->jump_whitelist, 0, - BITS_TO_LONGS(ctx->jump_whitelist_cmds) * sizeof(u32)); + bitmap_zero(ctx->jump_whitelist, ctx->jump_whitelist_cmds); return; } From f570b657264ba55b336f65ab689eea096c0fef39 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Aug 2019 17:24:07 +0200 Subject: [PATCH 129/155] KVM: x86: use Intel speculation bugs and features as derived in generic x86 code commit 0c54914d0c52a15db9954a76ce80fee32cf318f4 upstream. Similar to AMD bits, set the Intel bits from the vendor-independent feature and bug flags, because KVM_GET_SUPPORTED_CPUID does not care about the vendor and they should be set on AMD processors as well. Suggested-by: Jim Mattson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/cpuid.c | 8 ++++++++ arch/x86/kvm/x86.c | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 5c82b4bc4a68a0..33f87b69648732 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -481,8 +481,16 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* PKU is not yet implemented for shadow paging. */ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) entry->ecx &= ~F(PKU); + entry->edx &= kvm_cpuid_7_0_edx_x86_features; cpuid_mask(&entry->edx, CPUID_7_EDX); + if (boot_cpu_has(X86_FEATURE_IBPB) && + boot_cpu_has(X86_FEATURE_IBRS)) + entry->edx |= F(SPEC_CTRL); + if (boot_cpu_has(X86_FEATURE_STIBP)) + entry->edx |= F(INTEL_STIBP); + if (boot_cpu_has(X86_FEATURE_SSBD)) + entry->edx |= F(SPEC_CTRL_SSBD); /* * We emulate ARCH_CAPABILITIES in software even * if the host doesn't support it. diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4927d0f5be131f..b1e0969a4543c6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1081,8 +1081,16 @@ u64 kvm_get_arch_capabilities(void) if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + data |= ARCH_CAP_RDCL_NO; + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + data |= ARCH_CAP_SSB_NO; + if (!boot_cpu_has_bug(X86_BUG_MDS)) + data |= ARCH_CAP_MDS_NO; + return data; } + EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities); static int kvm_get_msr_feature(struct kvm_msr_entry *msr) From 74bccd0ec712fdff716cec80ba1553d41bc887b8 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Wed, 23 Oct 2019 10:45:50 +0200 Subject: [PATCH 130/155] x86/msr: Add the IA32_TSX_CTRL MSR commit c2955f270a84762343000f103e0640d29c7a96f3 upstream. 
Transactional Synchronization Extensions (TSX) may be used on certain processors as part of a speculative side channel attack. A microcode update for existing processors that are vulnerable to this attack will add a new MSR - IA32_TSX_CTRL to allow the system administrator the option to disable TSX as one of the possible mitigations. The CPUs which get this new MSR after a microcode upgrade are the ones which do not set MSR_IA32_ARCH_CAPABILITIES.MDS_NO (bit 5) because those CPUs have CPUID.MD_CLEAR, i.e., the VERW implementation which clears all CPU buffers takes care of the TAA case as well. [ Note that future processors that are not vulnerable will also support the IA32_TSX_CTRL MSR. ] Add defines for the new IA32_TSX_CTRL MSR and its bits. TSX has two sub-features: 1. Restricted Transactional Memory (RTM) is an explicitly-used feature where new instructions begin and end TSX transactions. 2. Hardware Lock Elision (HLE) is implicitly used when certain kinds of "old" style locks are used by software. Bit 7 of the IA32_ARCH_CAPABILITIES indicates the presence of the IA32_TSX_CTRL MSR. There are two control bits in IA32_TSX_CTRL MSR: Bit 0: When set, it disables the Restricted Transactional Memory (RTM) sub-feature of TSX (will force all transactions to abort on the XBEGIN instruction). Bit 1: When set, it disables the enumeration of the RTM and HLE feature (i.e. it will make CPUID(EAX=7).EBX{bit4} and CPUID(EAX=7).EBX{bit11} read as 0). The other TSX sub-feature, Hardware Lock Elision (HLE), is unconditionally disabled by the new microcode but still enumerated as present by CPUID(EAX=7).EBX{bit4}, unless disabled by IA32_TSX_CTRL_MSR[1] - TSX_CTRL_CPUID_CLEAR. Signed-off-by: Pawan Gupta Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Tested-by: Neelima Krishnan Reviewed-by: Mark Gross Reviewed-by: Tony Luck Reviewed-by: Josh Poimboeuf Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/msr-index.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index fda3bf75de6c38..127fc249def73d 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -84,6 +84,7 @@ * Microarchitectural Data * Sampling (MDS) vulnerabilities. */ +#define ARCH_CAP_TSX_CTRL_MSR BIT(7) /* MSR for TSX control is available. */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* @@ -94,6 +95,10 @@ #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e +#define MSR_IA32_TSX_CTRL 0x00000122 +#define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */ +#define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */ + #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 #define MSR_IA32_SYSENTER_EIP 0x00000176 From 52bd862adee5c3ab7125bb7309f23bccf0ffd691 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Wed, 23 Oct 2019 10:52:35 +0200 Subject: [PATCH 131/155] x86/cpu: Add a helper function x86_read_arch_cap_msr() commit 286836a70433fb64131d2590f4bf512097c255e1 upstream. Add a helper function to read the IA32_ARCH_CAPABILITIES MSR. 
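For context (editor's sketch, not from the patch): the helper returns 0 when the MSR is not enumerated, so callers can test ARCH_CAP_* bits unconditionally. A rough userspace analogue via the msr driver, assuming /dev/cpu/0/msr exists and the reader has the required privileges:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSR_IA32_ARCH_CAPABILITIES 0x10a

static uint64_t read_arch_cap(void)
{
	uint64_t val = 0;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0)
		return 0;	/* mirror the helper: no MSR access -> 0 */
	if (pread(fd, &val, sizeof(val), MSR_IA32_ARCH_CAPABILITIES) != sizeof(val))
		val = 0;
	close(fd);
	return val;
}

int main(void)
{
	uint64_t cap = read_arch_cap();

	printf("IA32_ARCH_CAPABILITIES = 0x%016llx (TSX_CTRL bit 7 %s)\n",
	       (unsigned long long)cap,
	       (cap & (1ULL << 7)) ? "set" : "clear");
	return 0;
}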
Signed-off-by: Pawan Gupta Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Tested-by: Neelima Krishnan Reviewed-by: Mark Gross Reviewed-by: Tony Luck Reviewed-by: Josh Poimboeuf Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/common.c | 15 +++++++++++---- arch/x86/kernel/cpu/cpu.h | 2 ++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 551c6bed7c8ce4..e6edbe5d2f8174 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -968,19 +968,26 @@ static bool __init cpu_matches(unsigned long which) return m && !!(m->driver_data & which); } -static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) +u64 x86_read_arch_cap_msr(void) { u64 ia32_cap = 0; + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + + return ia32_cap; +} + +static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) +{ + u64 ia32_cap = x86_read_arch_cap_msr(); + if (cpu_matches(NO_SPECULATION)) return; setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); - if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); - if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index cca588407dca80..f28c6cd056f9a6 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -54,4 +54,6 @@ unsigned int aperfmperf_get_khz(int cpu); extern void x86_spec_ctrl_setup_ap(void); +extern u64 x86_read_arch_cap_msr(void); + #endif /* ARCH_X86_CPU_H */ From 4b708ea4e5e772747b89619489ab96e9d1a1a44d Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Wed, 23 Oct 2019 11:01:53 +0200 Subject: [PATCH 132/155] x86/cpu: Add a "tsx=" cmdline option with TSX disabled by default commit 95c5824f75f3ba4c9e8e5a4b1a623c95390ac266 upstream. Add a kernel cmdline parameter "tsx" to control the Transactional Synchronization Extensions (TSX) feature. On CPUs that support TSX control, use "tsx=on|off" to enable or disable TSX. Not specifying this option is equivalent to "tsx=off". This is because on certain processors TSX may be used as a part of a speculative side channel attack. Carve out the TSX controlling functionality into a separate compilation unit because TSX is a CPU feature while the TSX async abort control machinery will go to cpu/bugs.c. [ bp: - Massage, shorten and clear the arg buffer. - Clarifications of the tsx= possible options - Josh. - Expand on TSX_CTRL availability - Pawan. ] Signed-off-by: Pawan Gupta Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Signed-off-by: Greg Kroah-Hartman --- .../admin-guide/kernel-parameters.txt | 26 ++++ arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/cpu/cpu.h | 16 +++ arch/x86/kernel/cpu/intel.c | 5 + arch/x86/kernel/cpu/tsx.c | 125 ++++++++++++++++++ 6 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/cpu/tsx.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 671f518b09eeb0..01463ac89a0024 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4505,6 +4505,32 @@ platforms where RDTSC is slow and this accounting can add overhead. 
+ tsx= [X86] Control Transactional Synchronization + Extensions (TSX) feature in Intel processors that + support TSX control. + + This parameter controls the TSX feature. The options are: + + on - Enable TSX on the system. Although there are + mitigations for all known security vulnerabilities, + TSX has been known to be an accelerator for + several previous speculation-related CVEs, and + so there may be unknown security risks associated + with leaving it enabled. + + off - Disable TSX on the system. (Note that this + option takes effect only on newer CPUs which are + not vulnerable to MDS, i.e., have + MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1 and which get + the new IA32_TSX_CTRL MSR through a microcode + update. This new MSR allows for the reliable + deactivation of the TSX functionality.) + + Not specifying this option is equivalent to tsx=off. + + See Documentation/admin-guide/hw-vuln/tsx_async_abort.rst + for more details. + turbografx.map[2|3]= [HW,JOY] TurboGraFX parallel port interface Format: diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 570e8bb1f386d9..e13ddd19a76c50 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -28,7 +28,7 @@ obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel.o tsx.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e6edbe5d2f8174..255ab4d8a3c613 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1414,6 +1414,7 @@ void __init identify_boot_cpu(void) enable_sep_cpu(); #endif cpu_detect_tlb(&boot_cpu_data); + tsx_init(); } void identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index f28c6cd056f9a6..db10a63687d346 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -45,6 +45,22 @@ struct _tlb_table { extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; +#ifdef CONFIG_CPU_SUP_INTEL +enum tsx_ctrl_states { + TSX_CTRL_ENABLE, + TSX_CTRL_DISABLE, + TSX_CTRL_NOT_SUPPORTED, +}; + +extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; + +extern void __init tsx_init(void); +extern void tsx_enable(void); +extern void tsx_disable(void); +#else +static inline void tsx_init(void) { } +#endif /* CONFIG_CPU_SUP_INTEL */ + extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); extern int detect_extended_topology_early(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 574dcdc092abdc..3a5ea741701b0c 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -695,6 +695,11 @@ static void init_intel(struct cpuinfo_x86 *c) init_intel_energy_perf(c); init_intel_misc_features(c); + + if (tsx_ctrl_state == TSX_CTRL_ENABLE) + tsx_enable(); + if (tsx_ctrl_state == TSX_CTRL_DISABLE) + tsx_disable(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c new file mode 100644 index 00000000000000..04471c4378d815 --- /dev/null +++ b/arch/x86/kernel/cpu/tsx.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Transactional Synchronization Extensions (TSX) control. 
+ * + * Copyright (C) 2019 Intel Corporation + * + * Author: + * Pawan Gupta + */ + +#include <linux/cpufeature.h> + +#include <asm/cmdline.h> + +#include "cpu.h" + +enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED; + +void tsx_disable(void) +{ + u64 tsx; + + rdmsrl(MSR_IA32_TSX_CTRL, tsx); + + /* Force all transactions to immediately abort */ + tsx |= TSX_CTRL_RTM_DISABLE; + + /* + * Ensure TSX support is not enumerated in CPUID. + * This is visible to userspace and will ensure they + * do not waste resources trying TSX transactions that + * will always abort. + */ + tsx |= TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_IA32_TSX_CTRL, tsx); +} + +void tsx_enable(void) +{ + u64 tsx; + + rdmsrl(MSR_IA32_TSX_CTRL, tsx); + + /* Enable the RTM feature in the cpu */ + tsx &= ~TSX_CTRL_RTM_DISABLE; + + /* + * Ensure TSX support is enumerated in CPUID. + * This is visible to userspace and will ensure they + * can enumerate and use the TSX feature. + */ + tsx &= ~TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_IA32_TSX_CTRL, tsx); +} + +static bool __init tsx_ctrl_is_supported(void) +{ + u64 ia32_cap = x86_read_arch_cap_msr(); + + /* + * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this + * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. + * + * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a + * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES + * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get + * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, + * tsx= cmdline requests will do nothing on CPUs without + * MSR_IA32_TSX_CTRL support. + */ + return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR); +} + +void __init tsx_init(void) +{ + char arg[4] = {}; + int ret; + + if (!tsx_ctrl_is_supported()) + return; + + ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); + if (ret >= 0) { + if (!strcmp(arg, "on")) { + tsx_ctrl_state = TSX_CTRL_ENABLE; + } else if (!strcmp(arg, "off")) { + tsx_ctrl_state = TSX_CTRL_DISABLE; + } else { + tsx_ctrl_state = TSX_CTRL_DISABLE; + pr_err("tsx: invalid option, defaulting to off\n"); + } + } else { + /* tsx= not provided, defaulting to off */ + tsx_ctrl_state = TSX_CTRL_DISABLE; + } + + if (tsx_ctrl_state == TSX_CTRL_DISABLE) { + tsx_disable(); + + /* + * tsx_disable() will change the state of the + * RTM CPUID bit. Clear it here since it is now + * expected to be not set. + */ + setup_clear_cpu_cap(X86_FEATURE_RTM); + } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) { + + /* + * HW defaults TSX to be enabled at bootup. + * We may still need the TSX enable support + * during init for special cases like + * kexec after TSX is disabled. + */ + tsx_enable(); + + /* + * tsx_enable() will change the state of the + * RTM CPUID bit. Force it here since it is now + * expected to be set. + */ + setup_force_cpu_cap(X86_FEATURE_RTM); + } +} From 9a5757771b81203d6ff9c170b2a24ae9e447da08 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Wed, 23 Oct 2019 11:30:45 +0200 Subject: [PATCH 133/155] x86/speculation/taa: Add mitigation for TSX Async Abort commit 1b42f017415b46c317e71d41c34ec088417a1883 upstream. TSX Async Abort (TAA) is a side channel vulnerability to the internal buffers in some Intel processors similar to Microarchitectural Data Sampling (MDS). In this case, certain loads may speculatively pass invalid data to dependent operations when an asynchronous abort condition is pending in a TSX transaction. This includes loads with no fault or assist condition.
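Editor's aside for readers who have not used TSX: the transactional regions the message refers to look like this from userspace. A hedged, minimal RTM sketch (build with gcc -mrtm on an RTM-capable CPU); it only demonstrates the begin/commit/abort flow and does not reproduce the attack:

#include <cpuid.h>
#include <immintrin.h>
#include <stdio.h>

static int counter;

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* RTM support is enumerated in CPUID.(EAX=7,ECX=0):EBX bit 11 */
	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) ||
	    !(ebx & (1u << 11))) {
		puts("RTM not enumerated (TSX disabled or unsupported)");
		return 0;
	}

	unsigned int status = _xbegin();

	if (status == _XBEGIN_STARTED) {
		counter++;		/* speculative until _xend() commits */
		_xend();
		puts("transaction committed");
	} else {
		counter++;		/* fallback path after a (possibly asynchronous) abort */
		printf("transaction aborted, status 0x%x\n", status);
	}
	return 0;
}

An asynchronous abort lands in the else branch above; TAA is about what speculatively executed loads can observe in the window around such an abort.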
Such loads may speculatively expose stale data from the uarch data structures as in MDS. Scope of exposure is within the same-thread and cross-thread. This issue affects all current processors that support TSX, but do not have ARCH_CAP_TAA_NO (bit 8) set in MSR_IA32_ARCH_CAPABILITIES. On CPUs which have their IA32_ARCH_CAPABILITIES MSR bit MDS_NO=0, CPUID.MD_CLEAR=1 and the MDS mitigation is clearing the CPU buffers using VERW or L1D_FLUSH, there is no additional mitigation needed for TAA. On affected CPUs with MDS_NO=1 this issue can be mitigated by disabling the Transactional Synchronization Extensions (TSX) feature. A new MSR IA32_TSX_CTRL in future and current processors after a microcode update can be used to control the TSX feature. There are two bits in that MSR: * TSX_CTRL_RTM_DISABLE disables the TSX sub-feature Restricted Transactional Memory (RTM). * TSX_CTRL_CPUID_CLEAR clears the RTM enumeration in CPUID. The other TSX sub-feature, Hardware Lock Elision (HLE), is unconditionally disabled with updated microcode but still enumerated as present by CPUID(EAX=7).EBX{bit4}. The second mitigation approach is similar to MDS which is clearing the affected CPU buffers on return to user space and when entering a guest. Relevant microcode update is required for the mitigation to work. More details on this approach can be found here: https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html The TSX feature can be controlled by the "tsx" command line parameter. If it is force-enabled then "Clear CPU buffers" (MDS mitigation) is deployed. The effective mitigation state can be read from sysfs. [ bp: - massage + comments cleanup - s/TAA_MITIGATION_TSX_DISABLE/TAA_MITIGATION_TSX_DISABLED/g - Josh. - remove partial TAA mitigation in update_mds_branch_idle() - Josh. - s/tsx_async_abort_cmdline/tsx_async_abort_parse_cmdline/g ] Signed-off-by: Pawan Gupta Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 4 + arch/x86/include/asm/nospec-branch.h | 4 +- arch/x86/include/asm/processor.h | 7 ++ arch/x86/kernel/cpu/bugs.c | 110 +++++++++++++++++++++++++++ arch/x86/kernel/cpu/common.c | 15 ++++ 6 files changed, 139 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 14357354cd289a..46a4d5f4a77c4f 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -388,5 +388,6 @@ #define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ #define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */ #define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ +#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 127fc249def73d..2ff54c96576dcc 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -85,6 +85,10 @@ * Sampling (MDS) vulnerabilities. */ #define ARCH_CAP_TSX_CTRL_MSR BIT(7) /* MSR for TSX control is available. */ +#define ARCH_CAP_TAA_NO BIT(8) /* + * Not susceptible to + * TSX Async Abort (TAA) vulnerabilities. 
+ */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index f6b496d110972c..b73a16a56e4f21 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -323,7 +323,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear); #include /** - * mds_clear_cpu_buffers - Mitigation for MDS vulnerability + * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability * * This uses the otherwise unused and obsolete VERW instruction in * combination with microcode which triggers a CPU buffer flush when the @@ -346,7 +346,7 @@ static inline void mds_clear_cpu_buffers(void) } /** - * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability + * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability * * Clear CPU buffers if the corresponding static key is enabled */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d55a0adbcf270f..6a87eda9691e4f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -994,4 +994,11 @@ enum mds_mitigations { MDS_MITIGATION_VMWERV, }; +enum taa_mitigations { + TAA_MITIGATION_OFF, + TAA_MITIGATION_UCODE_NEEDED, + TAA_MITIGATION_VERW, + TAA_MITIGATION_TSX_DISABLED, +}; + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 8bf21bc7a19039..8b219d30c3810d 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -32,11 +32,14 @@ #include #include +#include "cpu.h" + static void __init spectre_v1_select_mitigation(void); static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); +static void __init taa_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ u64 x86_spec_ctrl_base; @@ -103,6 +106,7 @@ void __init check_bugs(void) ssb_select_mitigation(); l1tf_select_mitigation(); mds_select_mitigation(); + taa_select_mitigation(); arch_smt_update(); @@ -266,6 +270,100 @@ static int __init mds_cmdline(char *str) } early_param("mds", mds_cmdline); +#undef pr_fmt +#define pr_fmt(fmt) "TAA: " fmt + +/* Default mitigation for TAA-affected CPUs */ +static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW; +static bool taa_nosmt __ro_after_init; + +static const char * const taa_strings[] = { + [TAA_MITIGATION_OFF] = "Vulnerable", + [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", + [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", + [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", +}; + +static void __init taa_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_TAA)) { + taa_mitigation = TAA_MITIGATION_OFF; + return; + } + + /* TSX previously disabled by tsx=off */ + if (!boot_cpu_has(X86_FEATURE_RTM)) { + taa_mitigation = TAA_MITIGATION_TSX_DISABLED; + goto out; + } + + if (cpu_mitigations_off()) { + taa_mitigation = TAA_MITIGATION_OFF; + return; + } + + /* TAA mitigation is turned off on the cmdline (tsx_async_abort=off) */ + if (taa_mitigation == TAA_MITIGATION_OFF) + goto out; + + if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) + taa_mitigation = TAA_MITIGATION_VERW; + else + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + + /* + * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. 
+ * A microcode update fixes this behavior to clear CPU buffers. It also + * adds support for MSR_IA32_TSX_CTRL which is enumerated by the + * ARCH_CAP_TSX_CTRL_MSR bit. + * + * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode + * update is required. + */ + ia32_cap = x86_read_arch_cap_msr(); + if ( (ia32_cap & ARCH_CAP_MDS_NO) && + !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + + /* + * TSX is enabled, select alternate mitigation for TAA which is + * the same as MDS. Enable MDS static branch to clear CPU buffers. + * + * For guests that can't determine whether the correct microcode is + * present on host, enable the mitigation for UCODE_NEEDED as well. + */ + static_branch_enable(&mds_user_clear); + + if (taa_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); + +out: + pr_info("%s\n", taa_strings[taa_mitigation]); +} + +static int __init tsx_async_abort_parse_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_TAA)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + taa_mitigation = TAA_MITIGATION_OFF; + } else if (!strcmp(str, "full")) { + taa_mitigation = TAA_MITIGATION_VERW; + } else if (!strcmp(str, "full,nosmt")) { + taa_mitigation = TAA_MITIGATION_VERW; + taa_nosmt = true; + } + + return 0; +} +early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); + #undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt @@ -772,6 +870,7 @@ static void update_mds_branch_idle(void) } #define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" +#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" void arch_smt_update(void) { @@ -804,6 +903,17 @@ void arch_smt_update(void) break; } + switch (taa_mitigation) { + case TAA_MITIGATION_VERW: + case TAA_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(TAA_MSG_SMT); + break; + case TAA_MITIGATION_TSX_DISABLED: + case TAA_MITIGATION_OFF: + break; + } + mutex_unlock(&spec_ctrl_mutex); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 255ab4d8a3c613..8f5059c7b2a46d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1004,6 +1004,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (!cpu_matches(NO_SWAPGS)) setup_force_cpu_bug(X86_BUG_SWAPGS); + /* + * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when: + * - TSX is supported or + * - TSX_CTRL is present + * + * TSX_CTRL check is needed for cases when TSX could be disabled before + * the kernel boot e.g. kexec. + * TSX_CTRL check alone is not sufficient for cases when the microcode + * update is not present or running as guest that don't get TSX_CTRL. + */ + if (!(ia32_cap & ARCH_CAP_TAA_NO) && + (cpu_has(c, X86_FEATURE_RTM) || + (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) + setup_force_cpu_bug(X86_BUG_TAA); + if (cpu_matches(NO_MELTDOWN)) return; From 79373f485f7be07cb097a49aa200f80914407ae9 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Wed, 23 Oct 2019 12:19:51 +0200 Subject: [PATCH 134/155] x86/speculation/taa: Add sysfs reporting for TSX Async Abort commit 6608b45ac5ecb56f9e171252229c39580cc85f0f upstream. Add the sysfs reporting file for TSX Async Abort. It exposes the vulnerability and the mitigation state similar to the existing files for the other hardware vulnerabilities. 
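The file is trivially consumable from userspace; a minimal sketch (editor's example; the path is spelled out just below, and kernels without this patch simply won't have the file):

#include <stdio.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/tsx_async_abort", "r");

	if (!f) {
		perror("tsx_async_abort");	/* kernel without this patch */
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		printf("TAA status: %s", line);	/* one of taa_strings[] plus an SMT suffix */
	fclose(f);
	return 0;
}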
Sysfs file path is: /sys/devices/system/cpu/vulnerabilities/tsx_async_abort Signed-off-by: Pawan Gupta Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Tested-by: Neelima Krishnan Reviewed-by: Mark Gross Reviewed-by: Tony Luck Reviewed-by: Greg Kroah-Hartman Reviewed-by: Josh Poimboeuf Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 23 +++++++++++++++++++++++ drivers/base/cpu.c | 9 +++++++++ include/linux/cpu.h | 3 +++ 3 files changed, 35 insertions(+) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 8b219d30c3810d..507aa25176c15a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1408,6 +1408,21 @@ static ssize_t mds_show_state(char *buf) sched_smt_active() ? "vulnerable" : "disabled"); } +static ssize_t tsx_async_abort_show_state(char *buf) +{ + if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) || + (taa_mitigation == TAA_MITIGATION_OFF)) + return sprintf(buf, "%s\n", taa_strings[taa_mitigation]); + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + return sprintf(buf, "%s; SMT Host state unknown\n", + taa_strings[taa_mitigation]); + } + + return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + static char *stibp_state(void) { if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) @@ -1473,6 +1488,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_MDS: return mds_show_state(buf); + case X86_BUG_TAA: + return tsx_async_abort_show_state(buf); + default: break; } @@ -1509,4 +1527,9 @@ ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *bu { return cpu_show_common(dev, attr, buf, X86_BUG_MDS); } + +ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_TAA); +} #endif diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 32b52e6bd13b73..0cd78375cd5e98 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -539,12 +539,20 @@ ssize_t __weak cpu_show_mds(struct device *dev, return sprintf(buf, "Not affected\n"); } +ssize_t __weak cpu_show_tsx_async_abort(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "Not affected\n"); +} + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL); static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL); static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL); +static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -553,6 +561,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_spec_store_bypass.attr, &dev_attr_l1tf.attr, &dev_attr_mds.attr, + &dev_attr_tsx_async_abort.attr, NULL }; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index efc48efb0ec6ae..851208e7aa131e 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -59,6 +59,9 @@ extern ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_tsx_async_abort(struct device *dev, + struct device_attribute *attr, + char *buf); extern __printf(4, 5) struct device 
*cpu_device_create(struct device *parent, void *drvdata, From d2f9854c2c34552e2b8f5a01eb1ac66796de89b6 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Wed, 23 Oct 2019 12:23:33 +0200 Subject: [PATCH 135/155] kvm/x86: Export MDS_NO=0 to guests when TSX is enabled commit e1d38b63acd843cfdd4222bf19a26700fd5c699e upstream. Export the IA32_ARCH_CAPABILITIES MSR bit MDS_NO=0 to guests on TSX Async Abort(TAA) affected hosts that have TSX enabled and updated microcode. This is required so that the guests don't complain, "Vulnerable: Clear CPU buffers attempted, no microcode" when the host has the updated microcode to clear CPU buffers. Microcode update also adds support for MSR_IA32_TSX_CTRL which is enumerated by the ARCH_CAP_TSX_CTRL bit in IA32_ARCH_CAPABILITIES MSR. Guests can't do this check themselves when the ARCH_CAP_TSX_CTRL bit is not exported to the guests. In this case export MDS_NO=0 to the guests. When guests have CPUID.MD_CLEAR=1, they deploy MDS mitigation which also mitigates TAA. Signed-off-by: Pawan Gupta Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Tested-by: Neelima Krishnan Reviewed-by: Tony Luck Reviewed-by: Josh Poimboeuf Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/x86.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b1e0969a4543c6..461c6ca2cea5e7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1088,6 +1088,25 @@ u64 kvm_get_arch_capabilities(void) if (!boot_cpu_has_bug(X86_BUG_MDS)) data |= ARCH_CAP_MDS_NO; + /* + * On TAA affected systems, export MDS_NO=0 when: + * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1. + * - Updated microcode is present. This is detected by + * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures + * that VERW clears CPU buffers. + * + * When MDS_NO=0 is exported, guests deploy clear CPU buffer + * mitigation and don't complain: + * + * "Vulnerable: Clear CPU buffers attempted, no microcode" + * + * If TSX is disabled on the system, guests are also mitigated against + * TAA and clear CPU buffer mitigation is not required for guests. + */ + if (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM) && + (data & ARCH_CAP_TSX_CTRL_MSR)) + data &= ~ARCH_CAP_MDS_NO; + return data; } From 8c99df217f8e36fde46cbf2af50b5b191857d9d4 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Wed, 23 Oct 2019 12:28:57 +0200 Subject: [PATCH 136/155] x86/tsx: Add "auto" option to the tsx= cmdline parameter commit 7531a3596e3272d1f6841e0d601a614555dc6b65 upstream. Platforms which are not affected by X86_BUG_TAA may want the TSX feature enabled. Add "auto" option to the TSX cmdline parameter. When tsx=auto disable TSX when X86_BUG_TAA is present, otherwise enable TSX. More details on X86_BUG_TAA can be found here: https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html [ bp: Extend the arg buffer to accommodate "auto\0". ] Signed-off-by: Pawan Gupta Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Reviewed-by: Josh Poimboeuf Signed-off-by: Greg Kroah-Hartman --- Documentation/admin-guide/kernel-parameters.txt | 3 +++ arch/x86/kernel/cpu/tsx.c | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 01463ac89a0024..3cae24bff0a12b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4526,6 +4526,9 @@ update. 
This new MSR allows for the reliable deactivation of the TSX functionality.) + auto - Disable TSX if X86_BUG_TAA is present, + otherwise enable TSX on the system. + Not specifying this option is equivalent to tsx=off. See Documentation/admin-guide/hw-vuln/tsx_async_abort.rst diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index 04471c4378d815..dda328ec2ba19e 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -75,7 +75,7 @@ static bool __init tsx_ctrl_is_supported(void) void __init tsx_init(void) { - char arg[4] = {}; + char arg[5] = {}; int ret; if (!tsx_ctrl_is_supported()) @@ -87,6 +87,11 @@ void __init tsx_init(void) tsx_ctrl_state = TSX_CTRL_ENABLE; } else if (!strcmp(arg, "off")) { tsx_ctrl_state = TSX_CTRL_DISABLE; + } else if (!strcmp(arg, "auto")) { + if (boot_cpu_has_bug(X86_BUG_TAA)) + tsx_ctrl_state = TSX_CTRL_DISABLE; + else + tsx_ctrl_state = TSX_CTRL_ENABLE; } else { tsx_ctrl_state = TSX_CTRL_DISABLE; pr_err("tsx: invalid option, defaulting to off\n"); From a4f14d5a0795fe7c4f75d31ef4abf816570e3872 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Wed, 23 Oct 2019 12:32:55 +0200 Subject: [PATCH 137/155] x86/speculation/taa: Add documentation for TSX Async Abort commit a7a248c593e4fd7a67c50b5f5318fe42a0db335e upstream. Add the documenation for TSX Async Abort. Include the description of the issue, how to check the mitigation state, control the mitigation, guidance for system administrators. [ bp: Add proper SPDX tags, touch ups by Josh and me. ] Co-developed-by: Antonio Gomez Iglesias Signed-off-by: Pawan Gupta Signed-off-by: Antonio Gomez Iglesias Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Mark Gross Reviewed-by: Tony Luck Reviewed-by: Josh Poimboeuf Signed-off-by: Greg Kroah-Hartman --- .../ABI/testing/sysfs-devices-system-cpu | 1 + Documentation/admin-guide/hw-vuln/index.rst | 1 + .../admin-guide/hw-vuln/tsx_async_abort.rst | 276 ++++++++++++++++++ .../admin-guide/kernel-parameters.txt | 38 +++ Documentation/x86/index.rst | 1 + Documentation/x86/tsx_async_abort.rst | 117 ++++++++ 6 files changed, 434 insertions(+) create mode 100644 Documentation/admin-guide/hw-vuln/tsx_async_abort.rst create mode 100644 Documentation/x86/tsx_async_abort.rst diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 645687b1870de2..0fffc1c66da163 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -381,6 +381,7 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/spec_store_bypass /sys/devices/system/cpu/vulnerabilities/l1tf /sys/devices/system/cpu/vulnerabilities/mds + /sys/devices/system/cpu/vulnerabilities/tsx_async_abort Date: January 2018 Contact: Linux kernel mailing list Description: Information about CPU vulnerabilities diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index 49311f3da6f24b..0802b1c67452ac 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -12,3 +12,4 @@ are configurable at compile, boot or run time. spectre l1tf mds + tsx_async_abort diff --git a/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst b/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst new file mode 100644 index 00000000000000..fddbd7579c5351 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst @@ -0,0 +1,276 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +TAA - TSX Asynchronous Abort +====================================== + +TAA is a hardware vulnerability that allows unprivileged speculative access to +data which is available in various CPU internal buffers by using asynchronous +aborts within an Intel TSX transactional region. + +Affected processors +------------------- + +This vulnerability only affects Intel processors that support Intel +Transactional Synchronization Extensions (TSX) when the TAA_NO bit (bit 8) +is 0 in the IA32_ARCH_CAPABILITIES MSR. On processors where the MDS_NO bit +(bit 5) is 0 in the IA32_ARCH_CAPABILITIES MSR, the existing MDS mitigations +also mitigate against TAA. + +Whether a processor is affected or not can be read out from the TAA +vulnerability file in sysfs. See :ref:`tsx_async_abort_sys_info`. + +Related CVEs +------------ + +The following CVE entry is related to this TAA issue: + + ============== ===== =================================================== + CVE-2019-11135 TAA TSX Asynchronous Abort (TAA) condition on some + microprocessors utilizing speculative execution may + allow an authenticated user to potentially enable + information disclosure via a side channel with + local access. + ============== ===== =================================================== + +Problem +------- + +When performing store, load or L1 refill operations, processors write +data into temporary microarchitectural structures (buffers). The data in +those buffers can be forwarded to load operations as an optimization. + +Intel TSX is an extension to the x86 instruction set architecture that adds +hardware transactional memory support to improve performance of multi-threaded +software. TSX lets the processor expose and exploit concurrency hidden in an +application by dynamically avoiding unnecessary synchronization. + +TSX supports atomic memory transactions that are either committed (success) or +aborted. During an abort, operations that happened within the transactional region +are rolled back. An asynchronous abort takes place, among other options, when a +different thread accesses a cache line that is also used within the transactional +region when that access might lead to a data race. + +Immediately after an uncompleted asynchronous abort, certain speculatively +executed loads may read data from those internal buffers and pass it to dependent +operations. This can then be used to infer the value via a cache side channel +attack. + +Because the buffers are potentially shared between Hyper-Threads, cross +Hyper-Thread attacks are possible. + +The victim of a malicious actor does not need to make use of TSX. Only the +attacker needs to begin a TSX transaction and raise an asynchronous abort +which in turn potentially leaks data stored in the buffers. + +More detailed technical information is available in the TAA specific x86 +architecture section: :ref:`Documentation/x86/tsx_async_abort.rst `. + + +Attack scenarios +---------------- + +Attacks against the TAA vulnerability can be implemented from unprivileged +applications running on hosts or guests. + +As for MDS, the attacker has no control over the memory addresses that can +be leaked. Only the victim is responsible for bringing data to the CPU. As +a result, the malicious actor has to sample as much data as possible and +then postprocess it to try to infer any useful information from it. + +A potential attacker only has read access to the data. Also, there is no direct +privilege escalation by using this technique. + + +.. 
_tsx_async_abort_sys_info: + +TAA system information +----------------------- + +The Linux kernel provides a sysfs interface to enumerate the current TAA status +of mitigated systems. The relevant sysfs file is: + +/sys/devices/system/cpu/vulnerabilities/tsx_async_abort + +The possible values in this file are: + +.. list-table:: + + * - 'Vulnerable' + - The CPU is affected by this vulnerability and the microcode and kernel mitigation are not applied. + * - 'Vulnerable: Clear CPU buffers attempted, no microcode' + - The system tries to clear the buffers but the microcode might not support the operation. + * - 'Mitigation: Clear CPU buffers' + - The microcode has been updated to clear the buffers. TSX is still enabled. + * - 'Mitigation: TSX disabled' + - TSX is disabled. + * - 'Not affected' + - The CPU is not affected by this issue. + +.. _ucode_needed: + +Best effort mitigation mode +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If the processor is vulnerable, but the availability of the microcode-based +mitigation mechanism is not advertised via CPUID, the kernel selects a best +effort mitigation mode. This mode invokes the mitigation instructions +without a guarantee that they clear the CPU buffers. + +This is done to address virtualization scenarios where the host has the +microcode update applied, but the hypervisor is not yet updated to expose the +CPUID to the guest. If the host has updated microcode, the protection takes +effect; otherwise a few CPU cycles are wasted pointlessly. + +The state in the tsx_async_abort sysfs file reflects this situation +accordingly. + + +Mitigation mechanism +-------------------- + +The kernel detects the affected CPUs and the presence of the microcode which is +required. If a CPU is affected and the microcode is available, then the kernel +enables the mitigation by default. + + +The mitigation can be controlled at boot time via a kernel command line option. +See :ref:`taa_mitigation_control_command_line`. + +.. _virt_mechanism: + +Virtualization mitigation +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Affected systems where the host has the TAA microcode and TAA is mitigated by +having previously disabled TSX are not vulnerable regardless of the status +of the VMs. + +In all other cases, if the host either does not have the TAA microcode or +the kernel is not mitigated, the system might be vulnerable. + + +.. _taa_mitigation_control_command_line: + +Mitigation control on the kernel command line +--------------------------------------------- + +The kernel command line allows controlling the TAA mitigations at boot time with +the option "tsx_async_abort=". The valid arguments for this option are: + + ============ ============================================================= + off This option disables the TAA mitigation on affected platforms. + If the system has TSX enabled (see next parameter) and the CPU + is affected, the system is vulnerable. + + full TAA mitigation is enabled. If TSX is enabled, on an affected + system it will clear CPU buffers on ring transitions. On + systems which are MDS-affected and deploy MDS mitigation, + TAA is also mitigated. Specifying this option on those + systems will have no effect. + + full,nosmt The same as tsx_async_abort=full, with SMT disabled on + vulnerable CPUs that have TSX enabled. This is the complete + mitigation. When TSX is disabled, SMT is not disabled because + the CPU is not vulnerable to cross-thread TAA attacks.
+ ============ ============================================================= + +Not specifying this option is equivalent to "tsx_async_abort=full". + +The kernel command line also allows controlling the TSX feature using the +parameter "tsx=" on CPUs which support TSX control. MSR_IA32_TSX_CTRL is used +to control the TSX feature and the enumeration of the TSX feature bits (RTM +and HLE) in CPUID. + +The valid options are: + + ============ ============================================================= + off Disables TSX on the system. + + Note that this option takes effect only on newer CPUs which are + not vulnerable to MDS, i.e., have MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1 + and which get the new IA32_TSX_CTRL MSR through a microcode + update. This new MSR allows for the reliable deactivation of + the TSX functionality. + + on Enables TSX. + + Although there are mitigations for all known security + vulnerabilities, TSX has been known to be an accelerator for + several previous speculation-related CVEs, and so there may be + unknown security risks associated with leaving it enabled. + + auto Disables TSX if X86_BUG_TAA is present, otherwise enables TSX + on the system. + ============ ============================================================= + +Not specifying this option is equivalent to "tsx=off". + +The following combinations of the "tsx_async_abort" and "tsx" options are possible. +For affected platforms, tsx=auto is equivalent to tsx=off and the result will be: + + ========= ========================== ========================================= + tsx=on tsx_async_abort=full The system will use VERW to clear CPU + buffers. Cross-thread attacks are still + possible on SMT machines. + tsx=on tsx_async_abort=full,nosmt As above, cross-thread attacks on SMT + mitigated. + tsx=on tsx_async_abort=off The system is vulnerable. + tsx=off tsx_async_abort=full TSX might be disabled if microcode + provides a TSX control MSR. If so, + the system is not vulnerable. + tsx=off tsx_async_abort=full,nosmt Ditto + tsx=off tsx_async_abort=off Ditto + ========= ========================== ========================================= + + +For unaffected platforms "tsx=on" and "tsx_async_abort=full" do not clear CPU +buffers. For platforms without TSX control (MSR_IA32_ARCH_CAPABILITIES.MDS_NO=0) +the "tsx" command line argument has no effect. + +For the affected platforms, the table below indicates the mitigation status for the +combinations of the CPUID bit MD_CLEAR and the IA32_ARCH_CAPABILITIES MSR bits MDS_NO +and TSX_CTRL_MSR. + + ======= ========= ============= ======================================== + MDS_NO MD_CLEAR TSX_CTRL_MSR Status + ======= ========= ============= ======================================== + 0 0 0 Vulnerable (needs microcode) + 0 1 0 MDS and TAA mitigated via VERW + 1 1 0 MDS fixed, TAA vulnerable if TSX enabled + because MD_CLEAR has no meaning and + VERW is not guaranteed to clear buffers + 1 X 1 MDS fixed, TAA can be mitigated by + VERW or TSX_CTRL_MSR + ======= ========= ============= ======================================== + +Mitigation selection guide +-------------------------- + +1. Trusted userspace and guests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If all user space applications are from a trusted source and do not execute +untrusted code which is supplied externally, then the mitigation can be +disabled. The same applies to virtualized environments with trusted guests. + + +2.
Untrusted userspace and guests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If there are untrusted applications or guests on the system, enabling TSX +might allow a malicious actor to leak data from the host or from other +processes running on the same physical core. + +If the microcode is available and TSX is disabled on the host, attacks +are prevented in a virtualized environment as well, even if the VMs do not +explicitly enable the mitigation. + + +.. _taa_default_mitigations: + +Default mitigations +------------------- + +The kernel's default action for vulnerable processors is: + + - Deploy TSX disable mitigation (tsx_async_abort=full tsx=off). diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3cae24bff0a12b..d16b3d41ffe517 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2409,6 +2409,7 @@ ssbd=force-off [ARM64] l1tf=off [X86] mds=off [X86] + tsx_async_abort=off [X86] auto (default) Mitigate all CPU vulnerabilities, but leave SMT @@ -2424,6 +2425,7 @@ be fully mitigated, even if it means losing SMT. Equivalent to: l1tf=flush,nosmt [X86] mds=full,nosmt [X86] + tsx_async_abort=full,nosmt [X86] mminit_loglevel= [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this @@ -4534,6 +4536,42 @@ See Documentation/admin-guide/hw-vuln/tsx_async_abort.rst for more details. + tsx_async_abort= [X86,INTEL] Control mitigation for the TSX Async + Abort (TAA) vulnerability. + + Similar to Micro-architectural Data Sampling (MDS), + certain CPUs that support Transactional + Synchronization Extensions (TSX) are vulnerable to an + exploit against CPU internal buffers which can forward + information to a disclosure gadget under certain + conditions. + + In vulnerable processors, the speculatively forwarded + data can be used in a cache side channel attack to + access data to which the attacker does not have direct + access. + + This parameter controls the TAA mitigation. The + options are: + + full - Enable TAA mitigation on vulnerable CPUs + if TSX is enabled. + + full,nosmt - Enable TAA mitigation and disable SMT on + vulnerable CPUs. If TSX is disabled, SMT + is not disabled because the CPU is not + vulnerable to cross-thread TAA attacks. + off - Unconditionally disable TAA mitigation + + Not specifying this option is equivalent to + tsx_async_abort=full. On CPUs which are MDS affected + and deploy MDS mitigation, TAA mitigation is not + required and doesn't provide any additional + mitigation. + + For details see: + Documentation/admin-guide/hw-vuln/tsx_async_abort.rst + turbografx.map[2|3]= [HW,JOY] TurboGraFX parallel port interface Format: diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst index ef389dcf1b1d3c..0780d55c5aa828 100644 --- a/Documentation/x86/index.rst +++ b/Documentation/x86/index.rst @@ -6,3 +6,4 @@ x86 architecture specifics :maxdepth: 1 mds + tsx_async_abort diff --git a/Documentation/x86/tsx_async_abort.rst b/Documentation/x86/tsx_async_abort.rst new file mode 100644 index 00000000000000..583ddc185ba220 --- /dev/null +++ b/Documentation/x86/tsx_async_abort.rst @@ -0,0 +1,117 @@ +.. SPDX-License-Identifier: GPL-2.0 + +TSX Async Abort (TAA) mitigation +================================ + +.. _tsx_async_abort: + +Overview +-------- + +TSX Async Abort (TAA) is a side channel attack on internal buffers in some +Intel processors similar to Microarchitectural Data Sampling (MDS).
In this +case, certain loads may speculatively pass invalid data to dependent operations +when an asynchronous abort condition is pending in a Transactional +Synchronization Extensions (TSX) transaction. This includes loads with no +fault or assist condition. Such loads may speculatively expose stale data from +the same uarch data structures as in MDS, with the same scope of exposure, i.e. +same-thread and cross-thread. This issue affects all current processors that +support TSX. + +Mitigation strategy +------------------- + +a) TSX disable - one of the mitigations is to disable TSX. A new MSR, +IA32_TSX_CTRL, will be available on future and current processors after a +microcode update and can be used to disable TSX. In addition, it +controls the enumeration of the TSX feature bits (RTM and HLE) in CPUID. + +b) Clear CPU buffers - similar to MDS, clearing the CPU buffers mitigates this +vulnerability. More details on this approach can be found in +:ref:`Documentation/admin-guide/hw-vuln/mds.rst `. + +Kernel internal mitigation modes +-------------------------------- + + ============= ============================================================ + off Mitigation is disabled. Either the CPU is not affected or + tsx_async_abort=off is supplied on the kernel command line. + + tsx disabled Mitigation is enabled. The TSX feature is disabled by default at + bootup on processors that support TSX control. + + verw Mitigation is enabled. CPU is affected and MD_CLEAR is + advertised in CPUID. + + ucode needed Mitigation is enabled. CPU is affected and MD_CLEAR is not + advertised in CPUID. That is mainly for virtualization + scenarios where the host has the updated microcode but the + hypervisor does not expose MD_CLEAR in CPUID. It's a best + effort approach without guarantee. + ============= ============================================================ + +If the CPU is affected and the "tsx_async_abort" kernel command line parameter is +not provided, then the kernel selects an appropriate mitigation depending on the +status of the RTM and MD_CLEAR CPUID bits. + +The tables below indicate the impact of the tsx=on|off|auto cmdline options on the +state of the TAA mitigation, VERW behavior and the TSX feature for various +combinations of MSR_IA32_ARCH_CAPABILITIES bits. + +1. "tsx=off" + +========= ========= ============ ============ ============== =================== ====================== +MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=off +---------------------------------- ------------------------------------------------------------------------- +TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation + after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full +========= ========= ============ ============ ============== =================== ====================== + 0 0 0 HW default Yes Same as MDS Same as MDS + 0 0 1 Invalid case Invalid case Invalid case Invalid case + 0 1 0 HW default No Need ucode update Need ucode update + 0 1 1 Disabled Yes TSX disabled TSX disabled + 1 X 1 Disabled X None needed None needed +========= ========= ============ ============ ============== =================== ====================== + +2.
"tsx=on" + +========= ========= ============ ============ ============== =================== ====================== +MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=on +---------------------------------- ------------------------------------------------------------------------- +TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation + after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full +========= ========= ============ ============ ============== =================== ====================== + 0 0 0 HW default Yes Same as MDS Same as MDS + 0 0 1 Invalid case Invalid case Invalid case Invalid case + 0 1 0 HW default No Need ucode update Need ucode update + 0 1 1 Enabled Yes None Same as MDS + 1 X 1 Enabled X None needed None needed +========= ========= ============ ============ ============== =================== ====================== + +3. "tsx=auto" + +========= ========= ============ ============ ============== =================== ====================== +MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=auto +---------------------------------- ------------------------------------------------------------------------- +TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation + after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full +========= ========= ============ ============ ============== =================== ====================== + 0 0 0 HW default Yes Same as MDS Same as MDS + 0 0 1 Invalid case Invalid case Invalid case Invalid case + 0 1 0 HW default No Need ucode update Need ucode update + 0 1 1 Disabled Yes TSX disabled TSX disabled + 1 X 1 Enabled X None needed None needed +========= ========= ============ ============ ============== =================== ====================== + +In the tables, TSX_CTRL_MSR is a new bit in MSR_IA32_ARCH_CAPABILITIES that +indicates whether MSR_IA32_TSX_CTRL is supported. + +There are two control bits in IA32_TSX_CTRL MSR: + + Bit 0: When set it disables the Restricted Transactional Memory (RTM) + sub-feature of TSX (will force all transactions to abort on the + XBEGIN instruction). + + Bit 1: When set it disables the enumeration of the RTM and HLE feature + (i.e. it will make CPUID(EAX=7).EBX{bit4} and + CPUID(EAX=7).EBX{bit11} read as 0). From aa6ca7b9a9b72eb61e43f44c978dd8e6d4b2f046 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 23 Oct 2019 12:35:50 +0200 Subject: [PATCH 138/155] x86/tsx: Add config options to set tsx=on|off|auto commit db616173d787395787ecc93eef075fa975227b10 upstream. There is a general consensus that TSX usage is not largely spread while the history shows there is a non trivial space for side channel attacks possible. Therefore the tsx is disabled by default even on platforms that might have a safe implementation of TSX according to the current knowledge. This is a fair trade off to make. There are, however, workloads that really do benefit from using TSX and updating to a newer kernel with TSX disabled might introduce a noticeable regressions. This would be especially a problem for Linux distributions which will provide TAA mitigations. Introduce config options X86_INTEL_TSX_MODE_OFF, X86_INTEL_TSX_MODE_ON and X86_INTEL_TSX_MODE_AUTO to control the TSX feature. The config setting can be overridden by the tsx cmdline options. [ bp: Text cleanups from Josh. 
] Suggested-by: Borislav Petkov Signed-off-by: Michal Hocko Signed-off-by: Pawan Gupta Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig | 45 +++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/tsx.c | 22 +++++++++++++------ 2 files changed, 61 insertions(+), 6 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8fec1585ac7ab8..b58daecc591eb9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1853,6 +1853,51 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS If unsure, say y. +choice + prompt "TSX enable mode" + depends on CPU_SUP_INTEL + default X86_INTEL_TSX_MODE_OFF + help + Intel's TSX (Transactional Synchronization Extensions) feature + allows optimizing locking protocols through lock elision, which + can lead to a noticeable performance boost. + + On the other hand, it has been shown that TSX can be exploited + to form side channel attacks (e.g. TAA) and chances are there + will be more of those attacks discovered in the future. + + Therefore TSX is not enabled by default (aka tsx=off). An admin + might override this decision with the tsx=on command line parameter. + Even with TSX enabled, the kernel will attempt to enable the best + possible TAA mitigation setting depending on the microcode available + for the particular machine. + + This option allows setting the default TSX mode between tsx=on, =off + and =auto. See Documentation/admin-guide/kernel-parameters.txt for more + details. + + Say off if unsure, auto if TSX is in use but should only be used on safe + platforms, or on if TSX is in use and the security aspect of TSX is not + relevant. + +config X86_INTEL_TSX_MODE_OFF + bool "off" + help + TSX is disabled if possible - equivalent to the tsx=off command line + parameter. + +config X86_INTEL_TSX_MODE_ON + bool "on" + help + TSX is always enabled on TSX capable HW - equivalent to the tsx=on + command line parameter. + +config X86_INTEL_TSX_MODE_AUTO + bool "auto" + help + TSX is enabled on TSX capable HW that is believed to be safe against + side channel attacks - equivalent to the tsx=auto command line parameter.
+endchoice + config EFI bool "EFI runtime service support" depends on ACPI diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index dda328ec2ba19e..3e20d322bc98b6 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -73,6 +73,14 @@ static bool __init tsx_ctrl_is_supported(void) return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR); } +static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) +{ + if (boot_cpu_has_bug(X86_BUG_TAA)) + return TSX_CTRL_DISABLE; + + return TSX_CTRL_ENABLE; +} + void __init tsx_init(void) { char arg[5] = {}; @@ -88,17 +96,19 @@ void __init tsx_init(void) } else if (!strcmp(arg, "off")) { tsx_ctrl_state = TSX_CTRL_DISABLE; } else if (!strcmp(arg, "auto")) { - if (boot_cpu_has_bug(X86_BUG_TAA)) - tsx_ctrl_state = TSX_CTRL_DISABLE; - else - tsx_ctrl_state = TSX_CTRL_ENABLE; + tsx_ctrl_state = x86_get_tsx_auto_mode(); } else { tsx_ctrl_state = TSX_CTRL_DISABLE; pr_err("tsx: invalid option, defaulting to off\n"); } } else { - /* tsx= not provided, defaulting to off */ - tsx_ctrl_state = TSX_CTRL_DISABLE; + /* tsx= not provided */ + if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO)) + tsx_ctrl_state = x86_get_tsx_auto_mode(); + else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF)) + tsx_ctrl_state = TSX_CTRL_DISABLE; + else + tsx_ctrl_state = TSX_CTRL_ENABLE; } if (tsx_ctrl_state == TSX_CTRL_DISABLE) { From 9b81a067dcfcdc2411338fcc110d90c412c92c76 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 6 Nov 2019 20:26:46 -0600 Subject: [PATCH 139/155] x86/speculation/taa: Fix printing of TAA_MSG_SMT on IBRS_ALL CPUs commit 012206a822a8b6ac09125bfaa210a95b9eb8f1c1 upstream. For new IBRS_ALL CPUs, the Enhanced IBRS check at the beginning of cpu_bugs_smt_update() causes the function to return early, unintentionally skipping the MDS and TAA logic. This is not a problem for MDS, because there appears to be no overlap between IBRS_ALL and MDS-affected CPUs. So the MDS mitigation would be disabled and nothing would need to be done in this function anyway. But for TAA, the TAA_MSG_SMT string will never get printed on Cascade Lake and newer. The check is superfluous anyway: when 'spectre_v2_enabled' is SPECTRE_V2_IBRS_ENHANCED, 'spectre_v2_user' is always SPECTRE_V2_USER_NONE, and so the 'spectre_v2_user' switch statement handles it appropriately by doing nothing. So just remove the check. Fixes: 1b42f017415b ("x86/speculation/taa: Add mitigation for TSX Async Abort") Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Reviewed-by: Tyler Hicks Reviewed-by: Borislav Petkov Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 507aa25176c15a..255b79df603c1e 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -874,10 +874,6 @@ static void update_mds_branch_idle(void) void arch_smt_update(void) { - /* Enhanced IBRS implies STIBP. No update required. */ - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) - return; - mutex_lock(&spec_ctrl_mutex); switch (spectre_v2_user) { From 56a0f3867c1bc40c6f155c780a284cc881a89488 Mon Sep 17 00:00:00 2001 From: Vineela Tummalapalli Date: Mon, 4 Nov 2019 12:22:01 +0100 Subject: [PATCH 140/155] x86/bugs: Add ITLB_MULTIHIT bug infrastructure commit db4d30fbb71b47e4ecb11c4efa5d8aad4b03dfae upstream. 
Some processors may incur a machine check error possibly resulting in an unrecoverable CPU lockup when an instruction fetch encounters a TLB multi-hit in the instruction TLB. This can occur when the page size is changed along with either the physical address or cache type. The relevant erratum can be found here: https://bugzilla.kernel.org/show_bug.cgi?id=205195 There are other processors affected for which the erratum does not fully disclose the impact. This issue affects both bare-metal x86 page tables and EPT. It can be mitigated by either eliminating the use of large pages or by using careful TLB invalidations when changing the page size in the page tables. Just like Spectre, Meltdown, L1TF and MDS, a new bit has been allocated in MSR_IA32_ARCH_CAPABILITIES (PSCHANGE_MC_NO) and will be set on CPUs which are mitigated against this issue. Signed-off-by: Vineela Tummalapalli Co-developed-by: Pawan Gupta Signed-off-by: Pawan Gupta Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- .../ABI/testing/sysfs-devices-system-cpu | 1 + arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 7 +++ arch/x86/kernel/cpu/bugs.c | 13 ++++ arch/x86/kernel/cpu/common.c | 61 ++++++++++--------- drivers/base/cpu.c | 8 +++ include/linux/cpu.h | 2 + 7 files changed, 65 insertions(+), 28 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 0fffc1c66da163..9ebca6a750f33b 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -382,6 +382,7 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/l1tf /sys/devices/system/cpu/vulnerabilities/mds /sys/devices/system/cpu/vulnerabilities/tsx_async_abort + /sys/devices/system/cpu/vulnerabilities/itlb_multihit Date: January 2018 Contact: Linux kernel mailing list Description: Information about CPU vulnerabilities diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 46a4d5f4a77c4f..b4bef819d5d536 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -389,5 +389,6 @@ #define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */ #define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ #define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ +#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 2ff54c96576dcc..5761a86b88e028 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -84,6 +84,13 @@ * Microarchitectural Data * Sampling (MDS) vulnerabilities. */ +#define ARCH_CAP_PSCHANGE_MC_NO BIT(6) /* + * The processor is not susceptible to a + * machine check error due to modifying the + * code page size along with either the + * physical address or cache type + * without TLB invalidation. + */ #define ARCH_CAP_TSX_CTRL_MSR BIT(7) /* MSR for TSX control is available. 
*/ #define ARCH_CAP_TAA_NO BIT(8) /* * Not susceptible to diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 255b79df603c1e..ee558f70d549d0 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1387,6 +1387,11 @@ static ssize_t l1tf_show_state(char *buf) } #endif +static ssize_t itlb_multihit_show_state(char *buf) +{ + return sprintf(buf, "Processor vulnerable\n"); +} + static ssize_t mds_show_state(char *buf) { if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { @@ -1487,6 +1492,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_TAA: return tsx_async_abort_show_state(buf); + case X86_BUG_ITLB_MULTIHIT: + return itlb_multihit_show_state(buf); + default: break; } @@ -1528,4 +1536,9 @@ ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *at { return cpu_show_common(dev, attr, buf, X86_BUG_TAA); } + +ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8f5059c7b2a46d..b8680f932a0712 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -897,13 +897,14 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) c->x86_cache_bits = c->x86_phys_bits; } -#define NO_SPECULATION BIT(0) -#define NO_MELTDOWN BIT(1) -#define NO_SSB BIT(2) -#define NO_L1TF BIT(3) -#define NO_MDS BIT(4) -#define MSBDS_ONLY BIT(5) -#define NO_SWAPGS BIT(6) +#define NO_SPECULATION BIT(0) +#define NO_MELTDOWN BIT(1) +#define NO_SSB BIT(2) +#define NO_L1TF BIT(3) +#define NO_MDS BIT(4) +#define MSBDS_ONLY BIT(5) +#define NO_SWAPGS BIT(6) +#define NO_ITLB_MULTIHIT BIT(7) #define VULNWL(_vendor, _family, _model, _whitelist) \ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } @@ -921,26 +922,26 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), /* Intel Family 6 */ - VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), - VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), - VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), - VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), - VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), - - VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + + VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | 
NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(CORE_YONAH, NO_SSB), - VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS), + VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS), - VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS), + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), /* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -951,13 +952,13 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { */ /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), {} }; @@ -982,6 +983,10 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 ia32_cap = x86_read_arch_cap_msr(); + /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ + if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); + if (cpu_matches(NO_SPECULATION)) return; diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 0cd78375cd5e98..0272f66db5ac07 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -546,6 +546,12 @@ ssize_t __weak cpu_show_tsx_async_abort(struct device *dev, return sprintf(buf, "Not affected\n"); } +ssize_t __weak cpu_show_itlb_multihit(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Not affected\n"); +} + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); @@ -553,6 +559,7 @@ static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL); static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL); static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL); static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL); +static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -562,6 +569,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_l1tf.attr, &dev_attr_mds.attr, &dev_attr_tsx_async_abort.attr, + &dev_attr_itlb_multihit.attr, NULL }; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 
851208e7aa131e..fde995ba122891 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -62,6 +62,8 @@ extern ssize_t cpu_show_mds(struct device *dev, extern ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_itlb_multihit(struct device *dev, + struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, From 402c32059af1a308ff9eab0fb3666bb8a6e77c67 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 4 Nov 2019 12:22:01 +0100 Subject: [PATCH 141/155] x86/cpu: Add Tremont to the cpu vulnerability whitelist commit cad14885a8d32c1c0d8eaa7bf5c0152a22b6080e upstream. Add the new cpu family ATOM_TREMONT_D to the cpu vulnerability whitelist. ATOM_TREMONT_D is not affected by X86_BUG_ITLB_MULTIHIT. ATOM_TREMONT_D might have mitigations against other issues as well, but only the ITLB multihit mitigation is confirmed at this point. Signed-off-by: Pawan Gupta Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/common.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b8680f932a0712..c0c9c5a44e82c8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -951,6 +951,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { * good enough for our purposes. */ + VULNWL_INTEL(ATOM_TREMONT_X, NO_ITLB_MULTIHIT), + /* AMD Family 0xf - 0x12 */ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), From fa8617fddad51c42f66e56cd14382ee0e9f9782e Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Mon, 4 Nov 2019 12:22:02 +0100 Subject: [PATCH 142/155] cpu/speculation: Uninline and export CPU mitigations helpers commit 731dc9df975a5da21237a18c3384f811a7a41cc6 upstream. A kernel module may need to check the value of the "mitigations=" kernel command line parameter as part of its setup when the module needs to perform software mitigations for a CPU flaw. Uninline and export the helper functions surrounding the cpu_mitigations enum to allow for their usage from a module. Lastly, privatize the enum and cpu_mitigations variable since the value of cpu_mitigations can be checked with the exported helper functions. Signed-off-by: Tyler Hicks Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- include/linux/cpu.h | 25 ++----------------------- kernel/cpu.c | 27 ++++++++++++++++++++++++++- 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index fde995ba122891..67e8ba81c35f22 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -206,28 +206,7 @@ static inline int cpuhp_smt_enable(void) { return 0; } static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; } #endif -/* - * These are used for a global "mitigations=" cmdline option for toggling - * optional CPU mitigations.
- */ -enum cpu_mitigations { - CPU_MITIGATIONS_OFF, - CPU_MITIGATIONS_AUTO, - CPU_MITIGATIONS_AUTO_NOSMT, -}; - -extern enum cpu_mitigations cpu_mitigations; - -/* mitigations=off */ -static inline bool cpu_mitigations_off(void) -{ - return cpu_mitigations == CPU_MITIGATIONS_OFF; -} - -/* mitigations=auto,nosmt */ -static inline bool cpu_mitigations_auto_nosmt(void) -{ - return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; -} +extern bool cpu_mitigations_off(void); +extern bool cpu_mitigations_auto_nosmt(void); #endif /* _LINUX_CPU_H_ */ diff --git a/kernel/cpu.c b/kernel/cpu.c index d768e15bef83b6..96f970d77339af 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2301,7 +2301,18 @@ void __init boot_cpu_hotplug_init(void) this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); } -enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; +/* + * These are used for a global "mitigations=" cmdline option for toggling + * optional CPU mitigations. + */ +enum cpu_mitigations { + CPU_MITIGATIONS_OFF, + CPU_MITIGATIONS_AUTO, + CPU_MITIGATIONS_AUTO_NOSMT, +}; + +static enum cpu_mitigations cpu_mitigations __ro_after_init = + CPU_MITIGATIONS_AUTO; static int __init mitigations_parse_cmdline(char *arg) { @@ -2318,3 +2329,17 @@ static int __init mitigations_parse_cmdline(char *arg) return 0; } early_param("mitigations", mitigations_parse_cmdline); + +/* mitigations=off */ +bool cpu_mitigations_off(void) +{ + return cpu_mitigations == CPU_MITIGATIONS_OFF; +} +EXPORT_SYMBOL_GPL(cpu_mitigations_off); + +/* mitigations=auto,nosmt */ +bool cpu_mitigations_auto_nosmt(void) +{ + return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; +} +EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt); From cc5b0b7602f6f56cf6a03cbf091fc3ac2e4bb744 Mon Sep 17 00:00:00 2001 From: "Gomez Iglesias, Antonio" Date: Mon, 4 Nov 2019 12:22:03 +0100 Subject: [PATCH 143/155] Documentation: Add ITLB_MULTIHIT documentation commit 7f00cc8d4a51074eb0ad4c3f16c15757b1ddfb7d upstream. Add the initial ITLB_MULTIHIT documentation. [ tglx: Add it to the index so it gets actually built. ] Signed-off-by: Antonio Gomez Iglesias Signed-off-by: Nelson D'Souza Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- Documentation/admin-guide/hw-vuln/index.rst | 1 + .../admin-guide/hw-vuln/multihit.rst | 163 ++++++++++++++++++ 2 files changed, 164 insertions(+) create mode 100644 Documentation/admin-guide/hw-vuln/multihit.rst diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index 0802b1c67452ac..0795e3c2643f29 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -13,3 +13,4 @@ are configurable at compile, boot or run time. l1tf mds tsx_async_abort + multihit.rst diff --git a/Documentation/admin-guide/hw-vuln/multihit.rst b/Documentation/admin-guide/hw-vuln/multihit.rst new file mode 100644 index 00000000000000..ba9988d8bce500 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/multihit.rst @@ -0,0 +1,163 @@ +iTLB multihit +============= + +iTLB multihit is an erratum where some processors may incur a machine check +error, possibly resulting in an unrecoverable CPU lockup, when an +instruction fetch hits multiple entries in the instruction TLB. This can +occur when the page size is changed along with either the physical address +or cache type. A malicious guest running on a virtualized system can +exploit this erratum to perform a denial of service attack. 
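The dangerous pattern, in essence, is a page-size change that reaches an instruction fetch before the stale iTLB entries are invalidated. A minimal C-style sketch of that sequence follows; remap_text_as_2m_page() is a hypothetical helper used only for illustration, while flush_tlb_kernel_range() and SZ_2M are real kernel interfaces. This is illustrative pseudocode, not actual kernel code:

    /* Hypothetical helper names; illustrative pseudocode only. */
    remap_text_as_2m_page(vaddr, new_pfn);        /* range was mapped with 4K pages */
    /* Missing here: invalidation of the stale 4K iTLB entries for vaddr. */
    ((void (*)(void))vaddr)();                    /* fetch may hit two iTLB entries
                                                     and raise a machine check      */

    /* A safe sequence invalidates before any fetch from the range: */
    remap_text_as_2m_page(vaddr, new_pfn);
    flush_tlb_kernel_range(vaddr, vaddr + SZ_2M); /* no stale 4K entries remain     */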
+ + Affected processors +------------------- + +Variations of this erratum are present on most Intel Core and Xeon processor +models. The erratum is not present on: + + - non-Intel processors + + - Some Atoms (Airmont, Bonnell, Goldmont, GoldmontPlus, Saltwell, Silvermont) + + - Intel processors that have the PSCHANGE_MC_NO bit set in the + IA32_ARCH_CAPABILITIES MSR. + + +Related CVEs +------------ + +The following CVE entry is related to this issue: + + ============== ================================================= + CVE-2018-12207 Machine Check Error Avoidance on Page Size Change + ============== ================================================= + + +Problem +------- + +Privileged software, including the OS and virtual machine managers (VMM), is in +charge of memory management. A key component in memory management is the control +of the page tables. Modern processors use virtual memory, a technique that creates +the illusion of a very large memory space. This virtual space is split +into pages of a given size. Page tables translate virtual addresses to physical +addresses. + +To reduce latency when performing a virtual to physical address translation, +processors include a structure, called TLB, that caches recent translations. +There are separate TLBs for instruction (iTLB) and data (dTLB). + +Under this erratum, instructions are fetched from a linear address translated +using a 4 KB translation cached in the iTLB. Privileged software modifies the +paging structure so that the same linear address is mapped using a large page size +(2 MB, 4 MB, 1 GB) with a different physical address or memory type. After the +paging structure modification but before the software invalidates any iTLB entries for +the linear address, a code fetch that happens on the same linear address may +cause a machine-check error which can result in a system hang or shutdown. + + +Attack scenarios +---------------- + +Attacks against the iTLB multihit erratum can be mounted from malicious +guests in a virtualized system. + + +iTLB multihit system information +-------------------------------- + +The Linux kernel provides a sysfs interface to enumerate the current iTLB +multihit status of the system: whether the system is vulnerable and which +mitigations are active. The relevant sysfs file is: + +/sys/devices/system/cpu/vulnerabilities/itlb_multihit + +The possible values in this file are: + +.. list-table:: + + * - Not affected + - The processor is not vulnerable. + * - KVM: Mitigation: Split huge pages + - Software changes mitigate this issue. + * - KVM: Vulnerable + - The processor is vulnerable, but no mitigation is enabled + + +Enumeration of the erratum +-------------------------------- + +A new bit has been allocated in the IA32_ARCH_CAPABILITIES MSR (PSCHANGE_MC_NO) +and will be set on CPUs which are mitigated against this issue. + + ======================================= =========== =============================== + IA32_ARCH_CAPABILITIES MSR Not present Possibly vulnerable, check model + IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO] '0' Likely vulnerable, check model + IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO] '1' Not vulnerable + ======================================= =========== =============================== + + +Mitigation mechanism +------------------------- + +This erratum can be mitigated by restricting the use of large page sizes to +non-executable pages. This forces all iTLB entries to be 4K, and removes +the possibility of multiple hits.
+ + In order to mitigate the vulnerability, KVM initially marks all huge pages +as non-executable. If the guest attempts to execute in one of those pages, +the page is broken down into 4K pages, which are then marked executable. + +If EPT is disabled or not available on the host, KVM is in control of TLB +flushes and the problematic situation cannot happen. However, the shadow +EPT paging mechanism used by nested virtualization is vulnerable, because +the nested guest can trigger multiple iTLB hits by modifying its own +(non-nested) page tables. For simplicity, KVM will make large pages +non-executable in all shadow paging modes. + +Mitigation control on the kernel command line and KVM module parameter +------------------------------------------------------------------------ + +The KVM hypervisor mitigation mechanism for marking huge pages as +non-executable can be controlled with a module parameter "nx_huge_pages=". +The kernel command line allows controlling the iTLB multihit mitigations at +boot time with the option "kvm.nx_huge_pages=". + +The valid arguments for these options are: + + ========== ================================================================ + force Mitigation is enabled. In this case, the mitigation implements + non-executable huge pages in the Linux kernel KVM module. All huge + pages in the EPT are marked as non-executable. + If a guest attempts to execute in one of those pages, the page is + broken down into 4K pages, which are then marked executable. + + off Mitigation is disabled. + + auto Enable mitigation only if the platform is affected and the kernel + was not booted with the "mitigations=off" command line parameter. + This is the default option. + ========== ================================================================ + + +Mitigation selection guide +-------------------------- + +1. No virtualization in use +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + The system is protected by the kernel unconditionally and no further + action is required. + +2. Virtualization with trusted guests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + If the guest comes from a trusted source, you may assume that the guest will + not attempt to maliciously exploit these errata and no further action is + required. + +3. Virtualization with untrusted guests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + If the guest comes from an untrusted source, the host kernel will need + to apply the iTLB multihit mitigation via the kernel command line or the kvm + module parameter. From 82e77746f07db70367f66ef272256037d6415353 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 11 Oct 2019 11:59:48 +0200 Subject: [PATCH 144/155] kvm: x86, powerpc: do not allow clearing largepages debugfs entry commit 833b45de69a6016c4b0cebe6765d526a31a81580 upstream. The largepages debugfs entry is incremented/decremented as shadow pages are created or destroyed. Clearing it will result in an underflow, which is harmless to KVM but ugly (and could be misinterpreted by tools that use debugfs information), so make this particular statistic read-only.
Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Cc: kvm-ppc@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/x86.c | 6 +++--- include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 10 +++++++--- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 461c6ca2cea5e7..a6f7b75da27afd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -90,8 +90,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ +#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) @@ -191,7 +191,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { "largepages", VM_STAT(lpages) }, + { "largepages", VM_STAT(lpages, .mode = 0444) }, { "max_mmu_page_hash_collisions", VM_STAT(max_mmu_page_hash_collisions) }, { NULL } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c8b9d3519c8ed0..eeabf9823164e8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1013,6 +1013,7 @@ enum kvm_stat_kind { struct kvm_stat_data { int offset; + int mode; struct kvm *kvm; }; @@ -1020,6 +1021,7 @@ struct kvm_stats_debugfs_item { const char *name; int offset; enum kvm_stat_kind kind; + int mode; }; extern struct kvm_stats_debugfs_item debugfs_entries[]; extern struct dentry *kvm_debugfs_dir; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 89fd40e57cae92..1ba7c312d26a73 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -596,8 +596,9 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) stat_data->kvm = kvm; stat_data->offset = p->offset; + stat_data->mode = p->mode ? p->mode : 0644; kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; - if (!debugfs_create_file(p->name, 0644, + if (!debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry, stat_data, stat_fops_per_vm[p->kind])) @@ -3713,7 +3714,9 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file, if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) return -ENOENT; - if (simple_attr_open(inode, file, get, set, fmt)) { + if (simple_attr_open(inode, file, get, + stat_data->mode & S_IWUGO ? set : NULL, + fmt)) { kvm_put_kvm(stat_data->kvm); return -ENOMEM; } @@ -3964,7 +3967,8 @@ static int kvm_init_debug(void) kvm_debugfs_num_entries = 0; for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { - if (!debugfs_create_file(p->name, 0644, kvm_debugfs_dir, + int mode = p->mode ? p->mode : 0644; + if (!debugfs_create_file(p->name, mode, kvm_debugfs_dir, (void *)(long)p->offset, stat_fops[p->kind])) goto out_dir; From 05fe997e30d439e3ff0c7e7e46499e9d41d98ba7 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Thu, 3 Jan 2019 17:14:28 -0800 Subject: [PATCH 145/155] kvm: Convert kvm_lock to a mutex commit 0d9ce162cf46c99628cc5da9510b959c7976735b upstream. 
It doesn't seem as if there is any particular need for kvm_lock to be a spinlock, so convert the lock to a mutex so that sleepable functions (in particular cond_resched()) can be called while holding it. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- Documentation/virtual/kvm/locking.txt | 4 +--- arch/s390/kvm/kvm-s390.c | 4 ++-- arch/x86/kvm/mmu.c | 4 ++-- arch/x86/kvm/x86.c | 10 ++++----- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 30 +++++++++++++-------------- 6 files changed, 26 insertions(+), 28 deletions(-) diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index 1bb8bcaf849770..635cd6eaf71495 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -15,8 +15,6 @@ The acquisition orders for mutexes are as follows: On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock. -For spinlocks, kvm_lock is taken outside kvm->mmu_lock. - Everything else is a leaf: no other lock is taken inside the critical sections. @@ -169,7 +167,7 @@ which time it will be set using the Dirty tracking mechanism described above. ------------ Name: kvm_lock -Type: spinlock_t +Type: mutex Arch: any Protects: - vm_list diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index cab41bc2572fda..ff62a4fe2159a8 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1926,13 +1926,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags); if (!kvm->arch.sca) goto out_err; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); sca_offset += 16; if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE) sca_offset = 0; kvm->arch.sca = (struct bsca_block *) ((char *) kvm->arch.sca + sca_offset); - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); sprintf(debug_name, "kvm-%u", current->pid); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 87a0601b1c2047..3ad5a853b94886 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5454,7 +5454,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) int nr_to_scan = sc->nr_to_scan; unsigned long freed = 0; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { int idx; @@ -5504,7 +5504,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) break; } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return freed; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a6f7b75da27afd..06d7df4331b13d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6156,17 +6156,17 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) continue; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - if (vcpu->cpu != smp_processor_id()) + if (vcpu->cpu != raw_smp_processor_id()) send_ipi = 1; } } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); if (freq->old < freq->new && send_ipi) { /* @@ -6303,12 +6303,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work) struct kvm_vcpu *vcpu; int i; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); 
atomic_set(&kvm_guest_has_master_clock, 0); - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); } static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index eeabf9823164e8..31d566de33e995 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -140,7 +140,7 @@ static inline bool is_error_page(struct page *page) extern struct kmem_cache *kvm_vcpu_cache; -extern spinlock_t kvm_lock; +extern struct mutex kvm_lock; extern struct list_head vm_list; struct kvm_io_range { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1ba7c312d26a73..345f6cf360434c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -92,7 +92,7 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); * kvm->lock --> kvm->slots_lock --> kvm->irq_lock */ -DEFINE_SPINLOCK(kvm_lock); +DEFINE_MUTEX(kvm_lock); static DEFINE_RAW_SPINLOCK(kvm_count_lock); LIST_HEAD(vm_list); @@ -668,9 +668,9 @@ static struct kvm *kvm_create_vm(unsigned long type) if (r) goto out_err; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); preempt_notifier_inc(); @@ -716,9 +716,9 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); kvm_destroy_vm_debugfs(kvm); kvm_arch_sync_events(kvm); - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_del(&kvm->vm_list); - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) { struct kvm_io_bus *bus = kvm_get_bus(kvm, i); @@ -3830,13 +3830,13 @@ static int vm_stat_get(void *_offset, u64 *val) u64 tmp_val; *val = 0; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { stat_tmp.kvm = kvm; vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val); *val += tmp_val; } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return 0; } @@ -3849,12 +3849,12 @@ static int vm_stat_clear(void *_offset, u64 val) if (val) return -EINVAL; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { stat_tmp.kvm = kvm; vm_stat_clear_per_vm((void *)&stat_tmp, 0); } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return 0; } @@ -3869,13 +3869,13 @@ static int vcpu_stat_get(void *_offset, u64 *val) u64 tmp_val; *val = 0; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { stat_tmp.kvm = kvm; vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val); *val += tmp_val; } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return 0; } @@ -3888,12 +3888,12 @@ static int vcpu_stat_clear(void *_offset, u64 val) if (val) return -EINVAL; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { stat_tmp.kvm = kvm; vcpu_stat_clear_per_vm((void *)&stat_tmp, 0); } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return 0; } @@ -3914,7 +3914,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) if (!kvm_dev.this_device || !kvm) return; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); if (type == KVM_EVENT_CREATE_VM) { kvm_createvm_count++; kvm_active_vms++; @@ -3923,7 +3923,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) } created = kvm_createvm_count; active = kvm_active_vms; - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); env = kzalloc(sizeof(*env), GFP_KERNEL); if (!env) From 15ac253adbeea9463bf3db8e0bbc533b3ec779e0 Mon Sep 17 00:00:00 2001 From: Junaid 
Shahid Date: Thu, 3 Jan 2019 16:22:21 -0800 Subject: [PATCH 146/155] kvm: mmu: Do not release the page inside mmu_set_spte() commit 43fdcda96e2550c6d1c46fb8a78801aa2f7276ed upstream. Release the page at the call-site where it was originally acquired. This makes the exit code cleaner for most call sites, since they do not need to duplicate code between success and the failure label. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/mmu.c | 18 +++++++----------- arch/x86/kvm/paging_tmpl.h | 8 +++----- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3ad5a853b94886..d9a78b887bc2f9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2918,8 +2918,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, } } - kvm_release_pfn_clean(pfn); - return ret; } @@ -2954,9 +2952,11 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, if (ret <= 0) return -1; - for (i = 0; i < ret; i++, gfn++, start++) + for (i = 0; i < ret; i++, gfn++, start++) { mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, page_to_pfn(pages[i]), true, true); + put_page(pages[i]); + } return 0; } @@ -3361,6 +3361,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) return r; + r = RET_PF_RETRY; spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; @@ -3369,14 +3370,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); - spin_unlock(&vcpu->kvm->mmu_lock); - - return r; out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return RET_PF_RETRY; + return r; } @@ -3954,6 +3952,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) return r; + r = RET_PF_RETRY; spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; @@ -3962,14 +3961,11 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); - spin_unlock(&vcpu->kvm->mmu_lock); - - return r; out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return RET_PF_RETRY; + return r; } static void nonpaging_init_context(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6288e9d7068e1b..04c4997748687d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -522,6 +522,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true); + kvm_release_pfn_clean(pfn); return true; } @@ -673,7 +674,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, return ret; out_gpte_changed: - kvm_release_pfn_clean(pfn); return RET_PF_RETRY; } @@ -821,6 +821,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, walker.pte_access &= ~ACC_EXEC_MASK; } + r = RET_PF_RETRY; spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; @@ -834,14 +835,11 
@@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, level, pfn, map_writable, prefault); ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); - spin_unlock(&vcpu->kvm->mmu_lock); - - return r; out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return RET_PF_RETRY; + return r; } static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) From 3d16a8635826916820de036872c8e90322655ee3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 24 Jun 2019 13:06:21 +0200 Subject: [PATCH 147/155] KVM: x86: make FNAME(fetch) and __direct_map more similar commit 3fcf2d1bdeb6a513523cb2c77012a6b047aa859c upstream. These two functions are basically doing the same thing through kvm_mmu_get_page, link_shadow_page and mmu_set_spte; yet, for historical reasons, their code looks very different. This patch tries to take the best of each and make them very similar, so that it is easy to understand changes that apply to both of them. Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/mmu.c | 53 ++++++++++++++++++-------------------- arch/x86/kvm/paging_tmpl.h | 30 ++++++++++----------- 2 files changed, 39 insertions(+), 44 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d9a78b887bc2f9..055a677ed663d6 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3004,40 +3004,39 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } -static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, - int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault) +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, + int map_writable, int level, kvm_pfn_t pfn, + bool prefault) { - struct kvm_shadow_walk_iterator iterator; + struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; - int emulate = 0; - gfn_t pseudo_gfn; + int ret; + gfn_t gfn = gpa >> PAGE_SHIFT; + gfn_t base_gfn = gfn; if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return 0; + return RET_PF_RETRY; - for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { - if (iterator.level == level) { - emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, - write, level, gfn, pfn, prefault, - map_writable); - direct_pte_prefetch(vcpu, iterator.sptep); - ++vcpu->stat.pf_fixed; + for_each_shadow_entry(vcpu, gpa, it) { + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == level) break; - } - drop_large_spte(vcpu, iterator.sptep); - if (!is_shadow_present_pte(*iterator.sptep)) { - u64 base_addr = iterator.addr; + drop_large_spte(vcpu, it.sptep); + if (!is_shadow_present_pte(*it.sptep)) { + sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, + it.level - 1, true, ACC_ALL); - base_addr &= PT64_LVL_ADDR_MASK(iterator.level); - pseudo_gfn = base_addr >> PAGE_SHIFT; - sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, - iterator.level - 1, 1, ACC_ALL); - - link_shadow_page(vcpu, iterator.sptep, sp); + link_shadow_page(vcpu, it.sptep, sp); } } - return emulate; + + ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, + write, level, base_gfn, pfn, prefault, + map_writable); + direct_pte_prefetch(vcpu, it.sptep); + ++vcpu->stat.pf_fixed; + return ret; } static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) @@ -3369,8 +3368,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, goto out_unlock; if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, 
&level); - r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); - + r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); @@ -3960,8 +3958,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, goto out_unlock; if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); - + r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 04c4997748687d..1c74bcf962320d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -602,6 +602,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; int top_level, ret; + gfn_t base_gfn; direct_access = gw->pte_access; @@ -646,31 +647,29 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, link_shadow_page(vcpu, it.sptep, sp); } - for (; - shadow_walk_okay(&it) && it.level > hlevel; - shadow_walk_next(&it)) { - gfn_t direct_gfn; + base_gfn = gw->gfn; + for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { clear_sp_write_flooding_count(it.sptep); + base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == hlevel) + break; + validate_direct_spte(vcpu, it.sptep, direct_access); drop_large_spte(vcpu, it.sptep); - if (is_shadow_present_pte(*it.sptep)) - continue; - - direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - - sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, - true, direct_access); - link_shadow_page(vcpu, it.sptep, sp); + if (!is_shadow_present_pte(*it.sptep)) { + sp = kvm_mmu_get_page(vcpu, base_gfn, addr, + it.level - 1, true, direct_access); + link_shadow_page(vcpu, it.sptep, sp); + } } - clear_sp_write_flooding_count(it.sptep); ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, - it.level, gw->gfn, pfn, prefault, map_writable); + it.level, base_gfn, pfn, prefault, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); - + ++vcpu->stat.pf_fixed; return ret; out_gpte_changed: @@ -833,7 +832,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); r = FNAME(fetch)(vcpu, addr, &walker, write_fault, level, pfn, map_writable, prefault); - ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: From 0fb1e3c9c7a1b7d3ce527c1b60992626756fd158 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 23 Jun 2019 19:15:49 +0200 Subject: [PATCH 148/155] KVM: x86: remove now unneeded hugepage gfn adjustment commit d679b32611c0102ce33b9e1a4e4b94854ed1812a upstream. After the previous patch, the low bits of the gfn are masked in both FNAME(fetch) and __direct_map, so we do not need to clear them in transparent_hugepage_adjust. 
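To make the masking concrete, here is a minimal standalone sketch of the arithmetic (plain userspace C; PAGES_PER_HPAGE and the frame numbers are illustrative stand-ins for KVM_PAGES_PER_HPAGE(level) and a real fault, not KVM code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGES_PER_HPAGE 512ULL	/* stand-in for KVM_PAGES_PER_HPAGE(level): 2MiB / 4KiB */

int main(void)
{
	uint64_t mask = PAGES_PER_HPAGE - 1;
	uint64_t gfn = 0x12345;			/* arbitrary faulting guest frame */
	uint64_t pfn = 0x98000 | (gfn & mask);	/* THP keeps the low bits in sync */

	assert((gfn & mask) == (pfn & mask));	/* the VM_BUG_ON invariant */

	uint64_t base_gfn = gfn & ~mask;	/* now computed by the callers */
	uint64_t base_pfn = pfn & ~mask;	/* still computed by the adjust step */

	/* The huge SPTE maps base_gfn to base_pfn; the low bits of the
	   faulting gfn are just the offset into the huge page, so clearing
	   them inside transparent_hugepage_adjust no longer buys anything. */
	printf("base_gfn=%#llx base_pfn=%#llx offset=%#llx\n",
	       (unsigned long long)base_gfn,
	       (unsigned long long)base_pfn,
	       (unsigned long long)(gfn & mask));
	return 0;
}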
Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/mmu.c | 9 +++------ arch/x86/kvm/paging_tmpl.h | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 055a677ed663d6..bd5a63722d2516 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3071,11 +3071,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) } static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, - gfn_t *gfnp, kvm_pfn_t *pfnp, + gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) { kvm_pfn_t pfn = *pfnp; - gfn_t gfn = *gfnp; int level = *levelp; /* @@ -3102,8 +3101,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, mask = KVM_PAGES_PER_HPAGE(level) - 1; VM_BUG_ON((gfn & mask) != (pfn & mask)); if (pfn & mask) { - gfn &= ~mask; - *gfnp = gfn; kvm_release_pfn_clean(pfn); pfn &= ~mask; kvm_get_pfn(pfn); @@ -3367,7 +3364,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); @@ -3957,7 +3954,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 1c74bcf962320d..7fa2e95149ee7b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -829,7 +829,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; if (!force_pt_level) - transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); + transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); r = FNAME(fetch)(vcpu, addr, &walker, write_fault, level, pfn, map_writable, prefault); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); From 6e8121097b41f6e4ff7df1b8359f3085630ba749 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 30 Jun 2019 08:36:21 -0400 Subject: [PATCH 149/155] KVM: x86: change kvm_mmu_page_get_gfn BUG_ON to WARN_ON commit e9f2a760b158551bfbef6db31d2cae45ab8072e5 upstream. Note that in such a case it is quite likely that KVM will BUG_ON in __pte_list_remove when the VM is closed. However, there is no immediate risk of memory corruption in the host so a WARN_ON is enough and it lets you gather traces for debugging. 
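As a sketch of the pattern being adopted, consider this hypothetical userspace model of WARN_ON plus pr_err_ratelimited; the fixed three-message budget and the function name are illustrative stand-ins for the kernel's real ratelimit machinery:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool warn_on_mismatch(uint64_t expected, uint64_t got)
{
	static int budget = 3;	/* crude stand-in for the kernel's ratelimit */

	if (got == expected)
		return false;
	if (budget > 0) {
		budget--;
		fprintf(stderr, "gfn mismatch (expected %#llx, got %#llx)\n",
			(unsigned long long)expected,
			(unsigned long long)got);
	}
	return true;	/* report the inconsistency, but keep running */
}

int main(void)
{
	for (int i = 0; i < 10; i++)
		warn_on_mismatch(0x1000, 0x1000 + (i & 1));	/* 5 mismatches, 3 printed */
	return 0;
}

Unlike the BUG_ON, the process survives the inconsistency and can keep emitting diagnostics, which is exactly the trade the patch makes for the host.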
Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/mmu.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index bd5a63722d2516..17c2a123521572 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1008,10 +1008,16 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) { - if (sp->role.direct) - BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); - else + if (!sp->role.direct) { sp->gfns[index] = gfn; + return; + } + + if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index))) + pr_err_ratelimited("gfn mismatch under direct page %llx " + "(expected %llx, got %llx)\n", + sp->gfn, + kvm_mmu_page_get_gfn(sp, index), gfn); } /* From 95867919494dce594c3c57bbf31247157a881fb7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 Jul 2019 05:14:13 -0400 Subject: [PATCH 150/155] KVM: x86: add tracepoints around __direct_map and FNAME(fetch) commit 335e192a3fa415e1202c8b9ecdaaecd643f823cc upstream. These are useful in debugging shadow paging. Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/mmu.c | 14 ++++----- arch/x86/kvm/mmutrace.h | 59 ++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/paging_tmpl.h | 2 ++ 3 files changed, 68 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 17c2a123521572..d2c4d7615aa262 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -139,9 +139,6 @@ module_param(dbg, bool, 0644); #include -#define CREATE_TRACE_POINTS -#include "mmutrace.h" - #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) @@ -244,6 +241,11 @@ static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); +static bool is_executable_pte(u64 spte); + +#define CREATE_TRACE_POINTS +#include "mmutrace.h" + void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) { @@ -2909,10 +2911,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, ret = RET_PF_EMULATE; pgprintk("%s: setting spte %llx\n", __func__, *sptep); - pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", - is_large_pte(*sptep)? "2MB" : "4kB", - *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn, - *sptep, sptep); + trace_kvm_mmu_set_spte(level, gfn, sptep); if (!was_rmapped && is_large_pte(*sptep)) ++vcpu->kvm->stat.lpages; @@ -3023,6 +3022,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return RET_PF_RETRY; + trace_kvm_mmu_spte_requested(gpa, level, pfn); for_each_shadow_entry(vcpu, gpa, it) { base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == level) diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index c73bf4e4988cb5..918b0d5bf2724c 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -325,6 +325,65 @@ TRACE_EVENT( __entry->kvm_gen == __entry->spte_gen ) ); + +TRACE_EVENT( + kvm_mmu_set_spte, + TP_PROTO(int level, gfn_t gfn, u64 *sptep), + TP_ARGS(level, gfn, sptep), + + TP_STRUCT__entry( + __field(u64, gfn) + __field(u64, spte) + __field(u64, sptep) + __field(u8, level) + /* These depend on page entry type, so compute them now. 
*/ + __field(bool, r) + __field(bool, x) + __field(u8, u) + ), + + TP_fast_assign( + __entry->gfn = gfn; + __entry->spte = *sptep; + __entry->sptep = virt_to_phys(sptep); + __entry->level = level; + __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK); + __entry->x = is_executable_pte(__entry->spte); + __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1; + ), + + TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx", + __entry->gfn, __entry->spte, + __entry->r ? "r" : "-", + __entry->spte & PT_WRITABLE_MASK ? "w" : "-", + __entry->x ? "x" : "-", + __entry->u == -1 ? "" : (__entry->u ? "u" : "-"), + __entry->level, __entry->sptep + ) +); + +TRACE_EVENT( + kvm_mmu_spte_requested, + TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn), + TP_ARGS(addr, level, pfn), + + TP_STRUCT__entry( + __field(u64, gfn) + __field(u64, pfn) + __field(u8, level) + ), + + TP_fast_assign( + __entry->gfn = addr >> PAGE_SHIFT; + __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); + __entry->level = level; + ), + + TP_printk("gfn %llx pfn %llx level %d", + __entry->gfn, __entry->pfn, __entry->level + ) +); + #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 7fa2e95149ee7b..b6b0fb60e0a75b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -649,6 +649,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, base_gfn = gw->gfn; + trace_kvm_mmu_spte_requested(addr, gw->level, pfn); + for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { clear_sp_write_flooding_count(it.sptep); base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); From 6abe2aaed9a4af9ddbc247b461c365d7cf027ea6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 27 Oct 2019 09:36:37 +0100 Subject: [PATCH 151/155] KVM: vmx, svm: always run with EFER.NXE=1 when shadow paging is active commit 9167ab79936206118cc60e47dcb926c3489f3bd5 upstream. VMX already does so if the host has SMEP, in order to support the combination of CR0.WP=1 and CR4.SMEP=1. However, it is perfectly safe to always do so, and in fact VMX also ends up running with EFER.NXE=1 on old processors that lack the "load EFER" controls, because it may help avoiding a slow MSR write. SVM does not have similar code, but it should since recent AMD processors do support SMEP. So this patch makes the code for the two vendors simpler and more similar, while fixing an issue with CR0.WP=1 and CR4.SMEP=1 on AMD. Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Cc: Joerg Roedel Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/svm.c | 10 ++++++++-- arch/x86/kvm/vmx.c | 14 +++----------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f6adc8db0e325d..52edb8cf1c4005 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -608,8 +608,14 @@ static int get_npt_level(struct kvm_vcpu *vcpu) static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { vcpu->arch.efer = efer; - if (!npt_enabled && !(efer & EFER_LMA)) - efer &= ~EFER_LME; + + if (!npt_enabled) { + /* Shadow paging assumes NX to be available. 
*/ + efer |= EFER_NX; + + if (!(efer & EFER_LMA)) + efer &= ~EFER_LME; + } to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 02c0326dc259e5..cd5a8e888eb6b6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2259,17 +2259,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) u64 guest_efer = vmx->vcpu.arch.efer; u64 ignore_bits = 0; - if (!enable_ept) { - /* - * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing - * host CPUID is more efficient than testing guest CPUID - * or CR4. Host SMEP is anyway a requirement for guest SMEP. - */ - if (boot_cpu_has(X86_FEATURE_SMEP)) - guest_efer |= EFER_NX; - else if (!(guest_efer & EFER_NX)) - ignore_bits |= EFER_NX; - } + /* Shadow paging assumes NX to be available. */ + if (!enable_ept) + guest_efer |= EFER_NX; /* * LMA and LME handled by hardware; SCE meaningless outside long mode. From bb16a6ba5d1ed79b40caea8d924e237f63205b7c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 4 Nov 2019 12:22:02 +0100 Subject: [PATCH 152/155] kvm: mmu: ITLB_MULTIHIT mitigation commit b8e8c8303ff28c61046a4d0f6ea99aea609a7dc0 upstream. With some Intel processors, putting the same virtual address in the TLB as both a 4 KiB and 2 MiB page can confuse the instruction fetch unit and cause the processor to issue a machine check resulting in a CPU lockup. Unfortunately when EPT page tables use huge pages, it is possible for a malicious guest to cause this situation. Add a knob to mark huge pages as non-executable. When the nx_huge_pages parameter is enabled (and we are using EPT), all huge pages are marked as NX. If the guest attempts to execute in one of those pages, the page is broken down into 4K pages, which are then marked executable. This is not an issue for shadow paging (except nested EPT), because then the host is in control of TLB flushes and the problematic situation cannot happen. With nested EPT, again the nested guest can cause problems shadow and direct EPT is treated in the same way. [ tglx: Fixup default to auto and massage wording a bit ] Originally-by: Junaid Shahid Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- .../admin-guide/kernel-parameters.txt | 19 +++ arch/x86/include/asm/kvm_host.h | 2 + arch/x86/kernel/cpu/bugs.c | 13 +- arch/x86/kvm/mmu.c | 141 +++++++++++++++++- arch/x86/kvm/paging_tmpl.h | 29 +++- arch/x86/kvm/x86.c | 9 ++ 6 files changed, 200 insertions(+), 13 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index d16b3d41ffe517..496bc24733a676 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1852,6 +1852,19 @@ KVM MMU at runtime. Default is 0 (off) + kvm.nx_huge_pages= + [KVM] Controls the software workaround for the + X86_BUG_ITLB_MULTIHIT bug. + force : Always deploy workaround. + off : Never deploy workaround. + auto : Deploy workaround based on the presence of + X86_BUG_ITLB_MULTIHIT. + + Default is 'auto'. + + If the software workaround is enabled for the host, + guests do need not to enable it for nested guests. + kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM. 
Default is 1 (enabled) @@ -2410,6 +2423,12 @@ l1tf=off [X86] mds=off [X86] tsx_async_abort=off [X86] + kvm.nx_huge_pages=off [X86] + + Exceptions: + This does not have any effect on + kvm.nx_huge_pages when + kvm.nx_huge_pages=force. auto (default) Mitigate all CPU vulnerabilities, but leave SMT diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 00c12158a5dcad..9751feab514948 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -289,6 +289,7 @@ struct kvm_mmu_page { /* hold the gfn of each spte inside spt */ gfn_t *gfns; bool unsync; + bool lpage_disallowed; /* Can't be replaced by an equiv large page */ int root_count; /* Currently serving as active root */ unsigned int unsync_children; struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ @@ -867,6 +868,7 @@ struct kvm_vm_stat { ulong mmu_unsync; ulong remote_tlb_flush; ulong lpages; + ulong nx_lpage_splits; ulong max_mmu_page_hash_collisions; }; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ee558f70d549d0..8596811843ccc8 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1225,6 +1225,9 @@ void x86_spec_ctrl_setup_ap(void) x86_amd_ssb_disable(); } +bool itlb_multihit_kvm_mitigation; +EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); + #undef pr_fmt #define pr_fmt(fmt) "L1TF: " fmt @@ -1380,17 +1383,25 @@ static ssize_t l1tf_show_state(char *buf) l1tf_vmx_states[l1tf_vmx_mitigation], sched_smt_active() ? "vulnerable" : "disabled"); } + +static ssize_t itlb_multihit_show_state(char *buf) +{ + if (itlb_multihit_kvm_mitigation) + return sprintf(buf, "KVM: Mitigation: Split huge pages\n"); + else + return sprintf(buf, "KVM: Vulnerable\n"); +} #else static ssize_t l1tf_show_state(char *buf) { return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); } -#endif static ssize_t itlb_multihit_show_state(char *buf) { return sprintf(buf, "Processor vulnerable\n"); } +#endif static ssize_t mds_show_state(char *buf) { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d2c4d7615aa262..e3492160aed082 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -48,6 +48,20 @@ #include #include "trace.h" +extern bool itlb_multihit_kvm_mitigation; + +static int __read_mostly nx_huge_pages = -1; + +static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); + +static struct kernel_param_ops nx_huge_pages_ops = { + .set = set_nx_huge_pages, + .get = param_get_bool, +}; + +module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); +__MODULE_PARM_TYPE(nx_huge_pages, "bool"); + /* * When setting this variable to true it enables Two-Dimensional-Paging * where the hardware walks 2 page tables: @@ -266,6 +280,11 @@ static inline bool spte_ad_enabled(u64 spte) return !(spte & shadow_acc_track_value); } +static bool is_nx_huge_page_enabled(void) +{ + return READ_ONCE(nx_huge_pages); +} + static inline u64 spte_shadow_accessed_mask(u64 spte) { MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); @@ -1078,6 +1097,15 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_disallow_lpage(slot, gfn); } +static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + if (sp->lpage_disallowed) + return; + + ++kvm->stat.nx_lpage_splits; + sp->lpage_disallowed = true; +} + static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; @@ -1095,6 +1123,12 @@ static void unaccount_shadowed(struct kvm *kvm, struct 
kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } +static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + --kvm->stat.nx_lpage_splits; + sp->lpage_disallowed = false; +} + static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, struct kvm_memory_slot *slot) { @@ -2642,6 +2676,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, kvm_reload_remote_mmus(kvm); } + if (sp->lpage_disallowed) + unaccount_huge_nx_page(kvm, sp); + sp->role.invalid = 1; return ret; } @@ -2796,6 +2833,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (!speculative) spte |= spte_shadow_accessed_mask(spte); + if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) && + is_nx_huge_page_enabled()) { + pte_access &= ~ACC_EXEC_MASK; + } + if (pte_access & ACC_EXEC_MASK) spte |= shadow_x_mask; else @@ -3009,9 +3051,32 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } +static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, + gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) +{ + int level = *levelp; + u64 spte = *it.sptep; + + if (it.level == level && level > PT_PAGE_TABLE_LEVEL && + is_nx_huge_page_enabled() && + is_shadow_present_pte(spte) && + !is_large_pte(spte)) { + /* + * A small SPTE exists for this pfn, but FNAME(fetch) + * and __direct_map would like to create a large PTE + * instead: just force them to go down another level, + * patching back for them into pfn the next 9 bits of + * the address. + */ + u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1); + *pfnp |= gfn & page_mask; + (*levelp)--; + } +} + static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, int map_writable, int level, kvm_pfn_t pfn, - bool prefault) + bool prefault, bool lpage_disallowed) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; @@ -3024,6 +3089,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, trace_kvm_mmu_spte_requested(gpa, level, pfn); for_each_shadow_entry(vcpu, gpa, it) { + /* + * We cannot overwrite existing page tables with an NX + * large page, as the leaf could be executable. 
+ */ + disallowed_hugepage_adjust(it, gfn, &pfn, &level); + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == level) break; @@ -3034,6 +3105,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, it.level - 1, true, ACC_ALL); link_shadow_page(vcpu, it.sptep, sp); + if (lpage_disallowed) + account_huge_nx_page(vcpu->kvm, sp); } } @@ -3333,11 +3406,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - bool force_pt_level = false; + bool force_pt_level; kvm_pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); + force_pt_level = lpage_disallowed; level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { /* @@ -3371,7 +3447,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, goto out_unlock; if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault); + r = __direct_map(vcpu, v, write, map_writable, level, pfn, + prefault, false); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); @@ -3921,6 +3998,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; bool map_writable; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); @@ -3931,8 +4010,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, - PT_DIRECTORY_LEVEL); + force_pt_level = + lpage_disallowed || + !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL); level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { if (level > PT_DIRECTORY_LEVEL && @@ -3961,7 +4041,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, goto out_unlock; if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault); + r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, + prefault, lpage_disallowed); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); @@ -5524,8 +5605,56 @@ static void mmu_destroy_caches(void) kmem_cache_destroy(mmu_page_header_cache); } +static bool get_nx_auto_mode(void) +{ + /* Return true when CPU has the bug, and mitigations are ON */ + return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off(); +} + +static void __set_nx_huge_pages(bool val) +{ + nx_huge_pages = itlb_multihit_kvm_mitigation = val; +} + +static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) +{ + bool old_val = nx_huge_pages; + bool new_val; + + /* In "auto" mode deploy workaround only if CPU has the bug. 
*/ + if (sysfs_streq(val, "off")) + new_val = 0; + else if (sysfs_streq(val, "force")) + new_val = 1; + else if (sysfs_streq(val, "auto")) + new_val = get_nx_auto_mode(); + else if (strtobool(val, &new_val) < 0) + return -EINVAL; + + __set_nx_huge_pages(new_val); + + if (new_val != old_val) { + struct kvm *kvm; + int idx; + + mutex_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) { + idx = srcu_read_lock(&kvm->srcu); + kvm_mmu_invalidate_zap_all_pages(kvm); + srcu_read_unlock(&kvm->srcu, idx); + } + mutex_unlock(&kvm_lock); + } + + return 0; +} + int kvm_mmu_module_init(void) { + if (nx_huge_pages == -1) + __set_nx_huge_pages(get_nx_auto_mode()); + kvm_mmu_reset_all_pte_masks(); pte_list_desc_cache = kmem_cache_create("pte_list_desc", diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index b6b0fb60e0a75b..8cf7a09bdd736f 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -596,13 +596,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, int write_fault, int hlevel, - kvm_pfn_t pfn, bool map_writable, bool prefault) + kvm_pfn_t pfn, bool map_writable, bool prefault, + bool lpage_disallowed) { struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; int top_level, ret; - gfn_t base_gfn; + gfn_t gfn, base_gfn; direct_access = gw->pte_access; @@ -647,13 +648,25 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, link_shadow_page(vcpu, it.sptep, sp); } - base_gfn = gw->gfn; + /* + * FNAME(page_fault) might have clobbered the bottom bits of + * gw->gfn, restore them from the virtual address. + */ + gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); + base_gfn = gfn; trace_kvm_mmu_spte_requested(addr, gw->level, pfn); for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { clear_sp_write_flooding_count(it.sptep); - base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + + /* + * We cannot overwrite existing page tables with an NX + * large page, as the leaf could be executable. 
+ */ + disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel); + + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == hlevel) break; @@ -665,6 +678,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, sp = kvm_mmu_get_page(vcpu, base_gfn, addr, it.level - 1, true, direct_access); link_shadow_page(vcpu, it.sptep, sp); + if (lpage_disallowed) + account_huge_nx_page(vcpu->kvm, sp); } } @@ -741,9 +756,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; kvm_pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - bool force_pt_level = false; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); + bool force_pt_level = lpage_disallowed; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -833,7 +850,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, if (!force_pt_level) transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); r = FNAME(fetch)(vcpu, addr, &walker, write_fault, - level, pfn, map_writable, prefault); + level, pfn, map_writable, prefault, lpage_disallowed); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 06d7df4331b13d..0804ad3326dcb9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -192,6 +192,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages, .mode = 0444) }, + { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) }, { "max_mmu_page_hash_collisions", VM_STAT(max_mmu_page_hash_collisions) }, { NULL } @@ -1069,6 +1070,14 @@ u64 kvm_get_arch_capabilities(void) rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data); + /* + * If nx_huge_pages is enabled, KVM's shadow paging will ensure that + * the nested hypervisor runs with NX huge pages. If it is not, + * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other + * L1 guests, so it need not worry about its own (L2) guests. + */ + data |= ARCH_CAP_PSCHANGE_MC_NO; + /* * If we're doing cache flushes (either "always" or "cond") * we will do one whenever the guest does a vmlaunch/vmresume. From 73959112cc8dbd30a09e169a9da868f40e750e2d Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Fri, 1 Nov 2019 00:14:08 +0100 Subject: [PATCH 153/155] kvm: Add helper function for creating VM worker threads commit c57c80467f90e5504c8df9ad3555d2c78800bf94 upstream. Add a function to create a kernel thread associated with a given VM. In particular, it ensures that the worker thread inherits the priority and cgroups of the calling thread. 
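A hypothetical caller might look like the following sketch; my_vm_worker and my_arch_post_init_vm are invented names (the real first user is the NX recovery worker added in the next patch), but the kvm_vm_create_worker_thread signature matches the one introduced below:

#include <linux/kthread.h>
#include <linux/kvm_host.h>

/* Worker body: runs with the creator's priority and cgroups, parked
 * until the creator unparks the returned task. */
static int my_vm_worker(struct kvm *kvm, uintptr_t data)
{
	return 0;
}

static int my_arch_post_init_vm(struct kvm *kvm)
{
	struct task_struct *thread;
	int err;

	err = kvm_vm_create_worker_thread(kvm, my_vm_worker, 0,
					  "my-vm-worker", &thread);
	if (!err)
		kthread_unpark(thread);	/* let the worker start running */
	return err;
}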
Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- include/linux/kvm_host.h | 6 +++ virt/kvm/kvm_main.c | 84 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 31d566de33e995..bb4758ffd40372 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1260,4 +1260,10 @@ static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_INVALID_WAKEUPS */ +typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data); + +int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, + uintptr_t data, const char *name, + struct task_struct **thread_ptr); + #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 345f6cf360434c..5e640f82b314c2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -4155,3 +4156,86 @@ void kvm_exit(void) kvm_vfio_ops_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); + +struct kvm_vm_worker_thread_context { + struct kvm *kvm; + struct task_struct *parent; + struct completion init_done; + kvm_vm_thread_fn_t thread_fn; + uintptr_t data; + int err; +}; + +static int kvm_vm_worker_thread(void *context) +{ + /* + * The init_context is allocated on the stack of the parent thread, so + * we have to locally copy anything that is needed beyond initialization + */ + struct kvm_vm_worker_thread_context *init_context = context; + struct kvm *kvm = init_context->kvm; + kvm_vm_thread_fn_t thread_fn = init_context->thread_fn; + uintptr_t data = init_context->data; + int err; + + err = kthread_park(current); + /* kthread_park(current) is never supposed to return an error */ + WARN_ON(err != 0); + if (err) + goto init_complete; + + err = cgroup_attach_task_all(init_context->parent, current); + if (err) { + kvm_err("%s: cgroup_attach_task_all failed with err %d\n", + __func__, err); + goto init_complete; + } + + set_user_nice(current, task_nice(init_context->parent)); + +init_complete: + init_context->err = err; + complete(&init_context->init_done); + init_context = NULL; + + if (err) + return err; + + /* Wait to be woken up by the spawner before proceeding. */ + kthread_parkme(); + + if (!kthread_should_stop()) + err = thread_fn(kvm, data); + + return err; +} + +int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, + uintptr_t data, const char *name, + struct task_struct **thread_ptr) +{ + struct kvm_vm_worker_thread_context init_context = {}; + struct task_struct *thread; + + *thread_ptr = NULL; + init_context.kvm = kvm; + init_context.parent = current; + init_context.thread_fn = thread_fn; + init_context.data = data; + init_completion(&init_context.init_done); + + thread = kthread_run(kvm_vm_worker_thread, &init_context, + "%s-%d", name, task_pid_nr(current)); + if (IS_ERR(thread)) + return PTR_ERR(thread); + + /* kthread_run is never supposed to return NULL */ + WARN_ON(thread == NULL); + + wait_for_completion(&init_context.init_done); + + if (!init_context.err) + *thread_ptr = thread; + + return init_context.err; +} From 2d371f8836c5d633f9f495c9165eaf814643539d Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Fri, 1 Nov 2019 00:14:14 +0100 Subject: [PATCH 154/155] kvm: x86: mmu: Recovery of shattered NX large pages commit 1aa9b9572b10529c2e64e2b8f44025d86e124308 upstream. 
The page table pages corresponding to broken down large pages are zapped in FIFO order, so that the large page can potentially be recovered, if it is not longer being used for execution. This removes the performance penalty for walking deeper EPT page tables. By default, one large page will last about one hour once the guest reaches a steady state. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- .../admin-guide/kernel-parameters.txt | 6 + arch/x86/include/asm/kvm_host.h | 4 + arch/x86/kvm/mmu.c | 129 ++++++++++++++++++ arch/x86/kvm/mmu.h | 4 + arch/x86/kvm/x86.c | 11 ++ virt/kvm/kvm_main.c | 30 +++- 6 files changed, 183 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 496bc24733a676..05596e05bc7169 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1865,6 +1865,12 @@ If the software workaround is enabled for the host, guests do need not to enable it for nested guests. + kvm.nx_huge_pages_recovery_ratio= + [KVM] Controls how many 4KiB pages are periodically zapped + back to huge pages. 0 disables the recovery, otherwise if + the value is N KVM will zap 1/Nth of the 4KiB pages every + minute. The default is 60. + kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM. Default is 1 (enabled) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9751feab514948..d0e17813a9b0a5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -277,6 +277,7 @@ struct kvm_rmap_head { struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; + struct list_head lpage_disallowed_link; /* * The following two entries are used to key the shadow page in the @@ -780,6 +781,7 @@ struct kvm_arch { */ struct list_head active_mmu_pages; struct list_head zapped_obsolete_pages; + struct list_head lpage_disallowed_mmu_pages; struct kvm_page_track_notifier_node mmu_sp_tracker; struct kvm_page_track_notifier_head track_notifier_head; @@ -855,6 +857,8 @@ struct kvm_arch { bool x2apic_format; bool x2apic_broadcast_quirk_disabled; + + struct task_struct *nx_lpage_recovery_thread; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e3492160aed082..8cd26e50d41c02 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -51,16 +52,26 @@ extern bool itlb_multihit_kvm_mitigation; static int __read_mostly nx_huge_pages = -1; +static uint __read_mostly nx_huge_pages_recovery_ratio = 60; static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); +static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); static struct kernel_param_ops nx_huge_pages_ops = { .set = set_nx_huge_pages, .get = param_get_bool, }; +static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { + .set = set_nx_huge_pages_recovery_ratio, + .get = param_get_uint, +}; + module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); __MODULE_PARM_TYPE(nx_huge_pages, "bool"); +module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops, + &nx_huge_pages_recovery_ratio, 0644); +__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); /* * When setting this variable to true it enables 
Two-Dimensional-Paging @@ -1103,6 +1114,8 @@ static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) return; ++kvm->stat.nx_lpage_splits; + list_add_tail(&sp->lpage_disallowed_link, + &kvm->arch.lpage_disallowed_mmu_pages); sp->lpage_disallowed = true; } @@ -1127,6 +1140,7 @@ static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) { --kvm->stat.nx_lpage_splits; sp->lpage_disallowed = false; + list_del(&sp->lpage_disallowed_link); } static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, @@ -5643,6 +5657,8 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) idx = srcu_read_lock(&kvm->srcu); kvm_mmu_invalidate_zap_all_pages(kvm); srcu_read_unlock(&kvm->srcu, idx); + + wake_up_process(kvm->arch.nx_lpage_recovery_thread); } mutex_unlock(&kvm_lock); } @@ -5720,3 +5736,116 @@ void kvm_mmu_module_exit(void) unregister_shrinker(&mmu_shrinker); mmu_audit_disable(); } + +static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp) +{ + unsigned int old_val; + int err; + + old_val = nx_huge_pages_recovery_ratio; + err = param_set_uint(val, kp); + if (err) + return err; + + if (READ_ONCE(nx_huge_pages) && + !old_val && nx_huge_pages_recovery_ratio) { + struct kvm *kvm; + + mutex_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) + wake_up_process(kvm->arch.nx_lpage_recovery_thread); + + mutex_unlock(&kvm_lock); + } + + return err; +} + +static void kvm_recover_nx_lpages(struct kvm *kvm) +{ + int rcu_idx; + struct kvm_mmu_page *sp; + unsigned int ratio; + LIST_HEAD(invalid_list); + ulong to_zap; + + rcu_idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + + ratio = READ_ONCE(nx_huge_pages_recovery_ratio); + to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; + while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) { + /* + * We use a separate list instead of just using active_mmu_pages + * because the number of lpage_disallowed pages is expected to + * be relatively small compared to the total. + */ + sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, + struct kvm_mmu_page, + lpage_disallowed_link); + WARN_ON_ONCE(!sp->lpage_disallowed); + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + WARN_ON_ONCE(sp->lpage_disallowed); + + if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) { + kvm_mmu_commit_zap_page(kvm, &invalid_list); + if (to_zap) + cond_resched_lock(&kvm->mmu_lock); + } + } + + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, rcu_idx); +} + +static long get_nx_lpage_recovery_timeout(u64 start_time) +{ + return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio) + ? 
start_time + 60 * HZ - get_jiffies_64() + : MAX_SCHEDULE_TIMEOUT; +} + +static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) +{ + u64 start_time; + long remaining_time; + + while (true) { + start_time = get_jiffies_64(); + remaining_time = get_nx_lpage_recovery_timeout(start_time); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop() && remaining_time > 0) { + schedule_timeout(remaining_time); + remaining_time = get_nx_lpage_recovery_timeout(start_time); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + + if (kthread_should_stop()) + return 0; + + kvm_recover_nx_lpages(kvm); + } +} + +int kvm_mmu_post_init_vm(struct kvm *kvm) +{ + int err; + + err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0, + "kvm-nx-lpage-recovery", + &kvm->arch.nx_lpage_recovery_thread); + if (!err) + kthread_unpark(kvm->arch.nx_lpage_recovery_thread); + + return err; +} + +void kvm_mmu_pre_destroy_vm(struct kvm *kvm) +{ + if (kvm->arch.nx_lpage_recovery_thread) + kthread_stop(kvm->arch.nx_lpage_recovery_thread); +} diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index efc857615d8ea4..068feab64acf17 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -195,4 +195,8 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); + +int kvm_mmu_post_init_vm(struct kvm *kvm); +void kvm_mmu_pre_destroy_vm(struct kvm *kvm); + #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0804ad3326dcb9..dcee3282112dc5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8370,6 +8370,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); + INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); atomic_set(&kvm->arch.noncoherent_dma_count, 0); @@ -8399,6 +8400,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return 0; } +int kvm_arch_post_init_vm(struct kvm *kvm) +{ + return kvm_mmu_post_init_vm(kvm); +} + static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) { int r; @@ -8502,6 +8508,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) } EXPORT_SYMBOL_GPL(x86_set_memory_region); +void kvm_arch_pre_destroy_vm(struct kvm *kvm) +{ + kvm_mmu_pre_destroy_vm(kvm); +} + void kvm_arch_destroy_vm(struct kvm *kvm) { if (current->mm == kvm->mm) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5e640f82b314c2..ea61162b2b5335 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -608,6 +608,23 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) return 0; } +/* + * Called after the VM is otherwise initialized, but just before adding it to + * the vm_list. + */ +int __weak kvm_arch_post_init_vm(struct kvm *kvm) +{ + return 0; +} + +/* + * Called just after removing the VM from the vm_list, but before doing any + * other destruction. 
+ */ +void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) +{ +} + static struct kvm *kvm_create_vm(unsigned long type) { int r, i; @@ -662,10 +679,14 @@ static struct kvm *kvm_create_vm(unsigned long type) rcu_assign_pointer(kvm->buses[i], kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL)); if (!kvm->buses[i]) - goto out_err; + goto out_err_no_mmu_notifier; } r = kvm_init_mmu_notifier(kvm); + if (r) + goto out_err_no_mmu_notifier; + + r = kvm_arch_post_init_vm(kvm); if (r) goto out_err; @@ -678,6 +699,11 @@ static struct kvm *kvm_create_vm(unsigned long type) return kvm; out_err: +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) + if (kvm->mmu_notifier.ops) + mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); +#endif +out_err_no_mmu_notifier: cleanup_srcu_struct(&kvm->irq_srcu); out_err_no_irq_srcu: cleanup_srcu_struct(&kvm->srcu); @@ -720,6 +746,8 @@ static void kvm_destroy_vm(struct kvm *kvm) mutex_lock(&kvm_lock); list_del(&kvm->vm_list); mutex_unlock(&kvm_lock); + kvm_arch_pre_destroy_vm(kvm); + kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) { struct kvm_io_bus *bus = kvm_get_bus(kvm, i); From 775d01b65b5daa002a9ba60f2d2bb3b1a6ce12fb Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 12 Nov 2019 19:19:08 +0100 Subject: [PATCH 155/155] Linux 4.14.154 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2819ed540ce2bd..4d2d556915486b 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 4 PATCHLEVEL = 14 -SUBLEVEL = 153 +SUBLEVEL = 154 EXTRAVERSION = NAME = Petit Gorille
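One closing arithmetic note on the NX recovery worker from patch 154/155: with ratio N, the per-wakeup zap budget is DIV_ROUND_UP(nx_lpage_splits, N), so at the default N = 60 and one wakeup per minute a shattered huge page survives roughly an hour in steady state. A minimal userspace sketch of that arithmetic (zap_budget is an invented name; the kernel computes this inline in kvm_recover_nx_lpages):

#include <stdio.h>

static unsigned long zap_budget(unsigned long nx_lpage_splits,
				unsigned int ratio)
{
	if (!ratio)
		return 0;	/* ratio 0 disables recovery entirely */
	return (nx_lpage_splits + ratio - 1) / ratio;	/* DIV_ROUND_UP */
}

int main(void)
{
	printf("%lu\n", zap_budget(1000, 60));	/* 17 pages zapped per wakeup */
	printf("%lu\n", zap_budget(1000, 0));	/* 0: recovery disabled */
	return 0;
}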