Diffstat (limited to 'net/core')
-rw-r--r--  net/core/Makefile              |   34
-rw-r--r--  net/core/datagram.c            |  875
-rw-r--r--  net/core/dev.c                 | 9903
-rw-r--r--  net/core/dev_addr_lists.c      |  851
-rw-r--r--  net/core/dev_ioctl.c           |  519
-rw-r--r--  net/core/devlink.c             | 4816
-rw-r--r--  net/core/drop_monitor.c        |  472
-rw-r--r--  net/core/dst.c                 |  344
-rw-r--r--  net/core/dst_cache.c           |  168
-rw-r--r--  net/core/ethtool.c             | 2916
-rw-r--r--  net/core/failover.c            |  315
-rw-r--r--  net/core/fib_notifier.c        |  189
-rw-r--r--  net/core/fib_rules.c           | 1235
-rw-r--r--  net/core/filter.c              | 7333
-rw-r--r--  net/core/flow_dissector.c      | 1470
-rw-r--r--  net/core/gen_estimator.c       |  274
-rw-r--r--  net/core/gen_stats.c           |  396
-rw-r--r--  net/core/gro_cells.c           |  108
-rw-r--r--  net/core/hwbm.c                |   90
-rw-r--r--  net/core/link_watch.c          |  253
-rw-r--r--  net/core/lwt_bpf.c             |  398
-rw-r--r--  net/core/lwtunnel.c            |  417
-rw-r--r--  net/core/neighbour.c           | 3300
-rw-r--r--  net/core/net-procfs.c          |  397
-rw-r--r--  net/core/net-sysfs.c           | 1842
-rw-r--r--  net/core/net-sysfs.h           |   12
-rw-r--r--  net/core/net-traces.c          |   50
-rw-r--r--  net/core/net_namespace.c       | 1195
-rw-r--r--  net/core/netclassid_cgroup.c   |  156
-rw-r--r--  net/core/netevent.c            |   67
-rw-r--r--  net/core/netpoll.c             |  856
-rw-r--r--  net/core/netprio_cgroup.c      |  308
-rw-r--r--  net/core/page_pool.c           |  317
-rw-r--r--  net/core/pktgen.c              | 3903
-rw-r--r--  net/core/ptp_classifier.c      |  193
-rw-r--r--  net/core/request_sock.c        |  136
-rw-r--r--  net/core/rtnetlink.c           | 4911
-rw-r--r--  net/core/scm.c                 |  350
-rw-r--r--  net/core/secure_seq.c          |  200
-rw-r--r--  net/core/skbuff.c              | 5644
-rw-r--r--  net/core/sock.c                | 3534
-rw-r--r--  net/core/sock_diag.c           |  337
-rw-r--r--  net/core/sock_reuseport.c      |  343
-rw-r--r--  net/core/stream.c              |  212
-rw-r--r--  net/core/sysctl_net_core.c     |  616
-rw-r--r--  net/core/timestamping.c        |   84
-rw-r--r--  net/core/tso.c                 |   87
-rw-r--r--  net/core/utils.c               |  490
-rw-r--r--  net/core/xdp.c                 |  400
49 files changed, 63316 insertions, 0 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
new file mode 100644
index 000000000..80175e6a2
--- /dev/null
+++ b/net/core/Makefile
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the Linux networking core.
+#
+
+obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \
+ gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o
+
+obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
+
+obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
+ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
+ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
+ fib_notifier.o xdp.o
+
+obj-y += net-sysfs.o
+obj-$(CONFIG_PAGE_POOL) += page_pool.o
+obj-$(CONFIG_PROC_FS) += net-procfs.o
+obj-$(CONFIG_NET_PKTGEN) += pktgen.o
+obj-$(CONFIG_NETPOLL) += netpoll.o
+obj-$(CONFIG_FIB_RULES) += fib_rules.o
+obj-$(CONFIG_TRACEPOINTS) += net-traces.o
+obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
+obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
+obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
+obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
+obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
+obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
+obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
+obj-$(CONFIG_DST_CACHE) += dst_cache.o
+obj-$(CONFIG_HWBM) += hwbm.o
+obj-$(CONFIG_NET_DEVLINK) += devlink.o
+obj-$(CONFIG_GRO_CELLS) += gro_cells.o
+obj-$(CONFIG_FAILOVER) += failover.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
new file mode 100644
index 000000000..865a8cb7b
--- /dev/null
+++ b/net/core/datagram.c
@@ -0,0 +1,875 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * SUCS NET3:
+ *
+ * Generic datagram handling routines. These are generic for all
+ * protocols. Possibly a generic IP version on top of these would
+ * make sense. Not tonight however 8-).
+ * This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
+ * NetROM layer all have identical poll code and mostly
+ * identical recvmsg() code. So we share it here. The poll was
+ * shared before but buried in udp.c so I moved it.
+ *
+ * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old
+ * udp.c code)
+ *
+ * Fixes:
+ * Alan Cox : NULL return from skb_peek_copy()
+ * understood
+ * Alan Cox : Rewrote skb_read_datagram to avoid the
+ * skb_peek_copy stuff.
+ * Alan Cox : Added support for SOCK_SEQPACKET.
+ * IPX can no longer use the SO_TYPE hack
+ * but AX.25 now works right, and SPX is
+ * feasible.
+ * Alan Cox : Fixed write poll of non IP protocol
+ * crash.
+ * Florian La Roche: Changed for my new skbuff handling.
+ * Darryl Miles : Fixed non-blocking SOCK_SEQPACKET.
+ * Linus Torvalds : BSD semantic fixes.
+ * Alan Cox : Datagram iovec handling
+ * Darryl Miles : Fixed non-blocking SOCK_STREAM.
+ * Alan Cox : POSIXisms
+ * Pete Wyckoff : Unconnected accept() fix.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/poll.h>
+#include <linux/highmem.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+
+#include <net/checksum.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <trace/events/skb.h>
+#include <net/busy_poll.h>
+
+/*
+ * Is a socket 'connection oriented' ?
+ */
+static inline int connection_based(struct sock *sk)
+{
+ return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
+}
+
+static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync,
+ void *key)
+{
+ /*
+ * Avoid a wakeup if event not interesting for us
+ */
+ if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR)))
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, key);
+}
+/*
+ * Wait for the last received packet to be different from skb
+ */
+int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
+ const struct sk_buff *skb)
+{
+ int error;
+ DEFINE_WAIT_FUNC(wait, receiver_wake_function);
+
+ prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+ /* Socket errors? */
+ error = sock_error(sk);
+ if (error)
+ goto out_err;
+
+ if (READ_ONCE(sk->sk_receive_queue.prev) != skb)
+ goto out;
+
+ /* Socket shut down? */
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ goto out_noerr;
+
+ /* Sequenced packets can come disconnected.
+ * If so we report the problem
+ */
+ error = -ENOTCONN;
+ if (connection_based(sk) &&
+ !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
+ goto out_err;
+
+ /* handle signals */
+ if (signal_pending(current))
+ goto interrupted;
+
+ error = 0;
+ *timeo_p = schedule_timeout(*timeo_p);
+out:
+ finish_wait(sk_sleep(sk), &wait);
+ return error;
+interrupted:
+ error = sock_intr_errno(*timeo_p);
+out_err:
+ *err = error;
+ goto out;
+out_noerr:
+ *err = 0;
+ error = 1;
+ goto out;
+}
+EXPORT_SYMBOL(__skb_wait_for_more_packets);
+
+static struct sk_buff *skb_set_peeked(struct sk_buff *skb)
+{
+ struct sk_buff *nskb;
+
+ if (skb->peeked)
+ return skb;
+
+ /* We have to unshare an skb before modifying it. */
+ if (!skb_shared(skb))
+ goto done;
+
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return ERR_PTR(-ENOMEM);
+
+ skb->prev->next = nskb;
+ skb->next->prev = nskb;
+ nskb->prev = skb->prev;
+ nskb->next = skb->next;
+
+ consume_skb(skb);
+ skb = nskb;
+
+done:
+ skb->peeked = 1;
+
+ return skb;
+}
+
+struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
+ struct sk_buff_head *queue,
+ unsigned int flags,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb),
+ int *peeked, int *off, int *err,
+ struct sk_buff **last)
+{
+ bool peek_at_off = false;
+ struct sk_buff *skb;
+ int _off = 0;
+
+ if (unlikely(flags & MSG_PEEK && *off >= 0)) {
+ peek_at_off = true;
+ _off = *off;
+ }
+
+ *last = queue->prev;
+ skb_queue_walk(queue, skb) {
+ if (flags & MSG_PEEK) {
+ if (peek_at_off && _off >= skb->len &&
+ (_off || skb->peeked)) {
+ _off -= skb->len;
+ continue;
+ }
+ if (!skb->len) {
+ skb = skb_set_peeked(skb);
+ if (IS_ERR(skb)) {
+ *err = PTR_ERR(skb);
+ return NULL;
+ }
+ }
+ *peeked = 1;
+ refcount_inc(&skb->users);
+ } else {
+ __skb_unlink(skb, queue);
+ if (destructor)
+ destructor(sk, skb);
+ }
+ *off = _off;
+ return skb;
+ }
+ return NULL;
+}
+
+/**
+ * __skb_try_recv_datagram - Receive a datagram skbuff
+ * @sk: socket
+ * @flags: MSG\_ flags
+ * @destructor: invoked under the receive lock on successful dequeue
+ * @peeked: returns non-zero if this packet has been seen before
+ * @off: an offset in bytes to peek skb from. Returns an offset
+ * within an skb where data actually starts
+ * @err: error code returned
+ * @last: set to last peeked message to inform the wait function
+ * what to look for when peeking
+ *
+ * Get a datagram skbuff, understands the peeking, nonblocking wakeups
+ * and possible races. This replaces identical code in packet, raw and
+ * udp, as well as the IPX AX.25 and Appletalk. It also finally fixes
+ * the long standing peek and read race for datagram sockets. If you
+ * alter this routine remember it must be re-entrant.
+ *
+ * This function will lock the socket if a skb is returned, so
+ * the caller needs to unlock the socket in that case (usually by
+ * calling skb_free_datagram). Returns NULL with @err set to
+ * -EAGAIN if no data was available or to some other value if an
+ * error was detected.
+ *
+ * * It does not lock socket since today. This function is
+ * * free of race conditions. This measure should/can improve
+ * * significantly datagram socket latencies at high loads,
+ * * when data copying to user space takes lots of time.
+ * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
+ * * 8) Great win.)
+ * * --ANK (980729)
+ *
+ * The order of the tests when we find no data waiting are specified
+ * quite explicitly by POSIX 1003.1g, don't change them without having
+ * the standard around please.
+ */
+struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb),
+ int *peeked, int *off, int *err,
+ struct sk_buff **last)
+{
+ struct sk_buff_head *queue = &sk->sk_receive_queue;
+ struct sk_buff *skb;
+ unsigned long cpu_flags;
+ /*
+ * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
+ */
+ int error = sock_error(sk);
+
+ if (error)
+ goto no_packet;
+
+ *peeked = 0;
+ do {
+ /* Again only user level code calls this function, so nothing
+ * interrupt level will suddenly eat the receive_queue.
+ *
+ * Look at current nfs client by the way...
+ * However, this function was correct in any case. 8)
+ */
+ spin_lock_irqsave(&queue->lock, cpu_flags);
+ skb = __skb_try_recv_from_queue(sk, queue, flags, destructor,
+ peeked, off, &error, last);
+ spin_unlock_irqrestore(&queue->lock, cpu_flags);
+ if (error)
+ goto no_packet;
+ if (skb)
+ return skb;
+
+ if (!sk_can_busy_loop(sk))
+ break;
+
+ sk_busy_loop(sk, flags & MSG_DONTWAIT);
+ } while (READ_ONCE(sk->sk_receive_queue.prev) != *last);
+
+ error = -EAGAIN;
+
+no_packet:
+ *err = error;
+ return NULL;
+}
+EXPORT_SYMBOL(__skb_try_recv_datagram);
+
+struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb),
+ int *peeked, int *off, int *err)
+{
+ struct sk_buff *skb, *last;
+ long timeo;
+
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+ do {
+ skb = __skb_try_recv_datagram(sk, flags, destructor, peeked,
+ off, err, &last);
+ if (skb)
+ return skb;
+
+ if (*err != -EAGAIN)
+ break;
+ } while (timeo &&
+ !__skb_wait_for_more_packets(sk, err, &timeo, last));
+
+ return NULL;
+}
+EXPORT_SYMBOL(__skb_recv_datagram);
+
+struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
+ int noblock, int *err)
+{
+ int peeked, off = 0;
+
+ return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
+ NULL, &peeked, &off, err);
+}
+EXPORT_SYMBOL(skb_recv_datagram);
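
For orientation, a minimal sketch of how a datagram protocol's recvmsg() path typically consumes the helper above; it is loosely modelled on UDP/AF_PACKET-style code, and my_proto_recvmsg() plus its error handling are illustrative only, not part of this commit.

#include <linux/net.h>
#include <linux/skbuff.h>
#include <net/sock.h>

/* Sketch only: a simplified recvmsg() built on skb_recv_datagram(). */
static int my_proto_recvmsg(struct socket *sock, struct msghdr *msg,
			    size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;

	/* Dequeue (or peek) one datagram, honouring MSG_DONTWAIT. */
	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
	if (!skb)
		return err;	/* -EAGAIN, -EINTR or a pending socket error */

	copied = skb->len;
	if (copied > len) {
		copied = len;
		msg->msg_flags |= MSG_TRUNC;	/* datagram truncation semantics */
	}

	/* Copy the payload into the user iovec carried in msg->msg_iter. */
	err = skb_copy_datagram_msg(skb, 0, msg, copied);

	/* Drop the reference taken by skb_recv_datagram(). */
	skb_free_datagram(sk, skb);
	return err ? err : copied;
}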
+
+void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
+{
+ consume_skb(skb);
+ sk_mem_reclaim_partial(sk);
+}
+EXPORT_SYMBOL(skb_free_datagram);
+
+void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
+{
+ bool slow;
+
+ if (!skb_unref(skb)) {
+ sk_peek_offset_bwd(sk, len);
+ return;
+ }
+
+ slow = lock_sock_fast(sk);
+ sk_peek_offset_bwd(sk, len);
+ skb_orphan(skb);
+ sk_mem_reclaim_partial(sk);
+ unlock_sock_fast(sk, slow);
+
+ /* skb is now orphaned, can be freed outside of locked section */
+ __kfree_skb(skb);
+}
+EXPORT_SYMBOL(__skb_free_datagram_locked);
+
+int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
+ struct sk_buff *skb, unsigned int flags,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb))
+{
+ int err = 0;
+
+ if (flags & MSG_PEEK) {
+ err = -ENOENT;
+ spin_lock_bh(&sk_queue->lock);
+ if (skb->next) {
+ __skb_unlink(skb, sk_queue);
+ refcount_dec(&skb->users);
+ if (destructor)
+ destructor(sk, skb);
+ err = 0;
+ }
+ spin_unlock_bh(&sk_queue->lock);
+ }
+
+ atomic_inc(&sk->sk_drops);
+ return err;
+}
+EXPORT_SYMBOL(__sk_queue_drop_skb);
+
+/**
+ * skb_kill_datagram - Free a datagram skbuff forcibly
+ * @sk: socket
+ * @skb: datagram skbuff
+ * @flags: MSG\_ flags
+ *
+ * This function frees a datagram skbuff that was received by
+ * skb_recv_datagram. The flags argument must match the one
+ * used for skb_recv_datagram.
+ *
+ * If the MSG_PEEK flag is set, and the packet is still on the
+ * receive queue of the socket, it will be taken off the queue
+ * before it is freed.
+ *
+ * This function currently only disables BH when acquiring the
+ * sk_receive_queue lock. Therefore it must not be used in a
+ * context where that lock is acquired in an IRQ context.
+ *
+ * It returns 0 if the packet was removed by us.
+ */
+
+int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
+{
+ int err = __sk_queue_drop_skb(sk, &sk->sk_receive_queue, skb, flags,
+ NULL);
+
+ kfree_skb(skb);
+ sk_mem_reclaim_partial(sk);
+ return err;
+}
+EXPORT_SYMBOL(skb_kill_datagram);
+
+/**
+ * skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: iovec iterator to copy to
+ * @len: amount of data to copy from buffer to iovec
+ */
+int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
+ struct iov_iter *to, int len)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset, start_off = offset, n;
+ struct sk_buff *frag_iter;
+
+ trace_skb_copy_datagram_iovec(skb, len);
+
+ /* Copy header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ n = copy_to_iter(skb->data + offset, copy, to);
+ offset += n;
+ if (n != copy)
+ goto short_copy;
+ if ((len -= copy) == 0)
+ return 0;
+ }
+
+ /* Copy paged appendix. Hmm... why does this look so complicated? */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ n = copy_page_to_iter(skb_frag_page(frag),
+ frag->page_offset + offset -
+ start, copy, to);
+ offset += n;
+ if (n != copy)
+ goto short_copy;
+ if (!(len -= copy))
+ return 0;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_copy_datagram_iter(frag_iter, offset - start,
+ to, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ }
+ start = end;
+ }
+ if (!len)
+ return 0;
+
+ /* This is not really a user copy fault, but rather someone
+ * gave us a bogus length on the skb. We should probably
+ * print a warning here as it may indicate a kernel bug.
+ */
+
+fault:
+ iov_iter_revert(to, offset - start_off);
+ return -EFAULT;
+
+short_copy:
+ if (iov_iter_count(to))
+ goto fault;
+
+ return 0;
+}
+EXPORT_SYMBOL(skb_copy_datagram_iter);
+
+/**
+ * skb_copy_datagram_from_iter - Copy a datagram from an iov_iter.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying to
+ * @from: the copy source
+ * @len: amount of data to copy to buffer from iovec
+ *
+ * Returns 0 or -EFAULT.
+ */
+int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
+ struct iov_iter *from,
+ int len)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+
+ /* Copy header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ if (copy_from_iter(skb->data + offset, copy, from) != copy)
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ }
+
+ /* Copy paged appendix. Hmm... why does this look so complicated? */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ if ((copy = end - offset) > 0) {
+ size_t copied;
+
+ if (copy > len)
+ copy = len;
+ copied = copy_page_from_iter(skb_frag_page(frag),
+ frag->page_offset + offset - start,
+ copy, from);
+ if (copied != copy)
+ goto fault;
+
+ if (!(len -= copy))
+ return 0;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_copy_datagram_from_iter(frag_iter,
+ offset - start,
+ from, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ }
+ start = end;
+ }
+ if (!len)
+ return 0;
+
+fault:
+ return -EFAULT;
+}
+EXPORT_SYMBOL(skb_copy_datagram_from_iter);
+
+int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
+ struct iov_iter *from, size_t length)
+{
+ int frag = skb_shinfo(skb)->nr_frags;
+
+ while (length && iov_iter_count(from)) {
+ struct page *pages[MAX_SKB_FRAGS];
+ size_t start;
+ ssize_t copied;
+ unsigned long truesize;
+ int n = 0;
+
+ if (frag == MAX_SKB_FRAGS)
+ return -EMSGSIZE;
+
+ copied = iov_iter_get_pages(from, pages, length,
+ MAX_SKB_FRAGS - frag, &start);
+ if (copied < 0)
+ return -EFAULT;
+
+ iov_iter_advance(from, copied);
+ length -= copied;
+
+ truesize = PAGE_ALIGN(copied + start);
+ skb->data_len += copied;
+ skb->len += copied;
+ skb->truesize += truesize;
+ if (sk && sk->sk_type == SOCK_STREAM) {
+ sk->sk_wmem_queued += truesize;
+ sk_mem_charge(sk, truesize);
+ } else {
+ refcount_add(truesize, &skb->sk->sk_wmem_alloc);
+ }
+ while (copied) {
+ int size = min_t(int, copied, PAGE_SIZE - start);
+ skb_fill_page_desc(skb, frag++, pages[n], start, size);
+ start = 0;
+ copied -= size;
+ n++;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL(__zerocopy_sg_from_iter);
+
+/**
+ * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
+ * @skb: buffer to copy
+ * @from: the source to copy from
+ *
+ * The function will first copy up to headlen, and then pin the userspace
+ * pages and build frags through them.
+ *
+ * Returns 0, -EFAULT or -EMSGSIZE.
+ */
+int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
+{
+ int copy = min_t(int, skb_headlen(skb), iov_iter_count(from));
+
+ /* copy up to skb headlen */
+ if (skb_copy_datagram_from_iter(skb, 0, from, copy))
+ return -EFAULT;
+
+ return __zerocopy_sg_from_iter(NULL, skb, from, ~0U);
+}
+EXPORT_SYMBOL(zerocopy_sg_from_iter);
+
+static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
+ struct iov_iter *to, int len,
+ __wsum *csump)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset, start_off = offset;
+ struct sk_buff *frag_iter;
+ int pos = 0;
+ int n;
+
+ /* Copy header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to);
+ offset += n;
+ if (n != copy)
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ pos = copy;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ if ((copy = end - offset) > 0) {
+ __wsum csum2 = 0;
+ struct page *page = skb_frag_page(frag);
+ u8 *vaddr = kmap(page);
+
+ if (copy > len)
+ copy = len;
+ n = csum_and_copy_to_iter(vaddr + frag->page_offset +
+ offset - start, copy,
+ &csum2, to);
+ kunmap(page);
+ offset += n;
+ if (n != copy)
+ goto fault;
+ *csump = csum_block_add(*csump, csum2, pos);
+ if (!(len -= copy))
+ return 0;
+ pos += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ __wsum csum2 = 0;
+ if (copy > len)
+ copy = len;
+ if (skb_copy_and_csum_datagram(frag_iter,
+ offset - start,
+ to, copy,
+ &csum2))
+ goto fault;
+ *csump = csum_block_add(*csump, csum2, pos);
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ pos += copy;
+ }
+ start = end;
+ }
+ if (!len)
+ return 0;
+
+fault:
+ iov_iter_revert(to, offset - start_off);
+ return -EFAULT;
+}
+
+__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
+{
+ __sum16 sum;
+
+ sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev);
+ }
+ if (!skb_shared(skb))
+ skb->csum_valid = !sum;
+ return sum;
+}
+EXPORT_SYMBOL(__skb_checksum_complete_head);
+
+__sum16 __skb_checksum_complete(struct sk_buff *skb)
+{
+ __wsum csum;
+ __sum16 sum;
+
+ csum = skb_checksum(skb, 0, skb->len, 0);
+
+ /* skb->csum holds pseudo checksum */
+ sum = csum_fold(csum_add(skb->csum, csum));
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev);
+ }
+
+ if (!skb_shared(skb)) {
+ /* Save full packet checksum */
+ skb->csum = csum;
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ skb->csum_complete_sw = 1;
+ skb->csum_valid = !sum;
+ }
+
+ return sum;
+}
+EXPORT_SYMBOL(__skb_checksum_complete);
+
+/**
+ * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec.
+ * @skb: skbuff
+ * @hlen: hardware length
+ * @msg: destination
+ *
+ * Caller _must_ check that skb will fit to this iovec.
+ *
+ * Returns: 0 - success.
+ * -EINVAL - checksum failure.
+ * -EFAULT - fault during copy.
+ */
+int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
+ int hlen, struct msghdr *msg)
+{
+ __wsum csum;
+ int chunk = skb->len - hlen;
+
+ if (!chunk)
+ return 0;
+
+ if (msg_data_left(msg) < chunk) {
+ if (__skb_checksum_complete(skb))
+ return -EINVAL;
+ if (skb_copy_datagram_msg(skb, hlen, msg, chunk))
+ goto fault;
+ } else {
+ csum = csum_partial(skb->data, hlen, skb->csum);
+ if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter,
+ chunk, &csum))
+ goto fault;
+
+ if (csum_fold(csum)) {
+ iov_iter_revert(&msg->msg_iter, chunk);
+ return -EINVAL;
+ }
+
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(NULL);
+ }
+ return 0;
+fault:
+ return -EFAULT;
+}
+EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
+
+/**
+ * datagram_poll - generic datagram poll
+ * @file: file struct
+ * @sock: socket
+ * @wait: poll table
+ *
+ * Datagram poll: Again totally generic. This also handles
+ * sequenced packet sockets providing the socket receive queue
+ * is only ever holding data ready to receive.
+ *
+ * Note: when you *don't* use this routine for this protocol,
+ * and you use a different write policy from sock_writeable()
+ * then please supply your own write_space callback.
+ */
+__poll_t datagram_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ __poll_t mask;
+
+ sock_poll_wait(file, sock, wait);
+ mask = 0;
+
+ /* exceptional events? */
+ if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
+ mask |= EPOLLERR |
+ (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
+ if (sk->sk_shutdown == SHUTDOWN_MASK)
+ mask |= EPOLLHUP;
+
+ /* readable? */
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
+ mask |= EPOLLIN | EPOLLRDNORM;
+
+ /* Connection-based need to check for termination and startup */
+ if (connection_based(sk)) {
+ if (sk->sk_state == TCP_CLOSE)
+ mask |= EPOLLHUP;
+ /* connection hasn't started yet? */
+ if (sk->sk_state == TCP_SYN_SENT)
+ return mask;
+ }
+
+ /* writable? */
+ if (sock_writeable(sk))
+ mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
+ else
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+ return mask;
+}
+EXPORT_SYMBOL(datagram_poll);
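
Before the dev.c portion of the diff, a hedged sketch of how the exported datagram helpers usually come together in a protocol's proto_ops; the my_proto_* names (including my_proto_recvmsg() from the sketch after skb_recv_datagram() above) are illustrative and not taken from this commit, while real users of datagram_poll() include UDP, AF_PACKET and AF_UNIX datagram sockets.

#include <linux/module.h>
#include <linux/net.h>

static int my_proto_recvmsg(struct socket *sock, struct msghdr *msg,
			    size_t len, int flags);	/* from the sketch above */

/* Sketch only: wiring the generic helpers into a datagram protocol. */
static const struct proto_ops my_proto_dgram_ops = {
	.family		= PF_PACKET,		/* example address family */
	.owner		= THIS_MODULE,
	.poll		= datagram_poll,	/* generic datagram poll above */
	.recvmsg	= my_proto_recvmsg,	/* built on skb_recv_datagram() */
	/* release, bind, connect, sendmsg, ... elided for brevity */
};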
diff --git a/net/core/dev.c b/net/core/dev.c
new file mode 100644
index 000000000..42f6ff8b9
--- /dev/null
+++ b/net/core/dev.c
@@ -0,0 +1,9903 @@
+/*
+ * NET3 Protocol independent device support routines.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Derived from the non IP parts of dev.c 1.0.19
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *
+ * Additional Authors:
+ * Florian la Roche <rzsfl@rz.uni-sb.de>
+ * Alan Cox <gw4pts@gw4pts.ampr.org>
+ * David Hinds <dahinds@users.sourceforge.net>
+ * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ * Adam Sulmicki <adam@cfar.umd.edu>
+ * Pekka Riikonen <priikone@poesidon.pspt.fi>
+ *
+ * Changes:
+ * D.J. Barrow : Fixed bug where dev->refcnt gets set
+ * to 2 if register_netdev gets called
+ * before net_dev_init & also removed a
+ * few lines of code in the process.
+ * Alan Cox : device private ioctl copies fields back.
+ * Alan Cox : Transmit queue code does relevant
+ * stunts to keep the queue safe.
+ * Alan Cox : Fixed double lock.
+ * Alan Cox : Fixed promisc NULL pointer trap
+ * ???????? : Support the full private ioctl range
+ * Alan Cox : Moved ioctl permission check into
+ * drivers
+ * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
+ * Alan Cox : 100 backlog just doesn't cut it when
+ * you start doing multicast video 8)
+ * Alan Cox : Rewrote net_bh and list manager.
+ * Alan Cox : Fix ETH_P_ALL echoback lengths.
+ * Alan Cox : Took out transmit every packet pass
+ * Saved a few bytes in the ioctl handler
+ * Alan Cox : Network driver sets packet type before
+ * calling netif_rx. Saves a function
+ * call a packet.
+ * Alan Cox : Hashed net_bh()
+ * Richard Kooijman: Timestamp fixes.
+ * Alan Cox : Wrong field in SIOCGIFDSTADDR
+ * Alan Cox : Device lock protection.
+ * Alan Cox : Fixed nasty side effect of device close
+ * changes.
+ * Rudi Cilibrasi : Pass the right thing to
+ * set_mac_address()
+ * Dave Miller : 32bit quantity for the device lock to
+ * make it work out on a Sparc.
+ * Bjorn Ekwall : Added KERNELD hack.
+ * Alan Cox : Cleaned up the backlog initialise.
+ * Craig Metz : SIOCGIFCONF fix if space for under
+ * 1 device.
+ * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
+ * is no device open function.
+ * Andi Kleen : Fix error reporting for SIOCGIFCONF
+ * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
+ * Cyrus Durgin : Cleaned for KMOD
+ * Adam Sulmicki : Bug Fix : Network Device Unload
+ * A network device unload needs to purge
+ * the backlog queue.
+ * Paul Rusty Russell : SIOCSIFNAME
+ * Pekka Riikonen : Netdev boot-time settings code
+ * Andrew Morton : Make unregister_netdevice wait
+ * indefinitely on dev->refcnt
+ * J Hadi Salim : - Backlog queue sampling
+ * - netif_rx() feedback
+ */
+
+#include <linux/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/capability.h>
+#include <linux/cpu.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/hash.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/skbuff.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/busy_poll.h>
+#include <linux/rtnetlink.h>
+#include <linux/stat.h>
+#include <net/dst.h>
+#include <net/dst_metadata.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/checksum.h>
+#include <net/xfrm.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netpoll.h>
+#include <linux/rcupdate.h>
+#include <linux/delay.h>
+#include <net/iw_handler.h>
+#include <asm/current.h>
+#include <linux/audit.h>
+#include <linux/dmaengine.h>
+#include <linux/err.h>
+#include <linux/ctype.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <net/mpls.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <trace/events/napi.h>
+#include <trace/events/net.h>
+#include <trace/events/skb.h>
+#include <linux/pci.h>
+#include <linux/inetdevice.h>
+#include <linux/cpu_rmap.h>
+#include <linux/static_key.h>
+#include <linux/hashtable.h>
+#include <linux/vmalloc.h>
+#include <linux/if_macvlan.h>
+#include <linux/errqueue.h>
+#include <linux/hrtimer.h>
+#include <linux/netfilter_ingress.h>
+#include <linux/crash_dump.h>
+#include <linux/sctp.h>
+#include <net/udp_tunnel.h>
+#include <linux/net_namespace.h>
+
+#include "net-sysfs.h"
+
+#define MAX_GRO_SKBS 8
+#define MAX_NEST_DEV 8
+
+/* This should be increased if a protocol with a bigger head is added. */
+#define GRO_MAX_HEAD (MAX_HEADER + 128)
+
+static DEFINE_SPINLOCK(ptype_lock);
+static DEFINE_SPINLOCK(offload_lock);
+struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
+struct list_head ptype_all __read_mostly; /* Taps */
+static struct list_head offload_base __read_mostly;
+
+static int netif_rx_internal(struct sk_buff *skb);
+static int call_netdevice_notifiers_info(unsigned long val,
+ struct netdev_notifier_info *info);
+static struct napi_struct *napi_by_id(unsigned int napi_id);
+
+/*
+ * The @dev_base_head list is protected by @dev_base_lock and the rtnl
+ * semaphore.
+ *
+ * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
+ *
+ * Writers must hold the rtnl semaphore while they loop through the
+ * dev_base_head list, and hold dev_base_lock for writing when they do the
+ * actual updates. This allows pure readers to access the list even
+ * while a writer is preparing to update it.
+ *
+ * To put it another way, dev_base_lock is held for writing only to
+ * protect against pure readers; the rtnl semaphore provides the
+ * protection against other writers.
+ *
+ * See, for example usages, register_netdevice() and
+ * unregister_netdevice(), which must be called with the rtnl
+ * semaphore held.
+ */
+DEFINE_RWLOCK(dev_base_lock);
+EXPORT_SYMBOL(dev_base_lock);
+
+static DEFINE_MUTEX(ifalias_mutex);
+
+/* protects napi_hash addition/deletion and napi_gen_id */
+static DEFINE_SPINLOCK(napi_hash_lock);
+
+static unsigned int napi_gen_id = NR_CPUS;
+static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
+
+static DECLARE_RWSEM(devnet_rename_sem);
+
+static inline void dev_base_seq_inc(struct net *net)
+{
+ while (++net->dev_base_seq == 0)
+ ;
+}
+
+static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
+{
+ unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
+
+ return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
+}
+
+static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
+{
+ return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
+}
+
+static inline void rps_lock(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ spin_lock(&sd->input_pkt_queue.lock);
+#endif
+}
+
+static inline void rps_unlock(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ spin_unlock(&sd->input_pkt_queue.lock);
+#endif
+}
+
+/* Device list insertion */
+static void list_netdevice(struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+
+ ASSERT_RTNL();
+
+ write_lock_bh(&dev_base_lock);
+ list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
+ hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
+ hlist_add_head_rcu(&dev->index_hlist,
+ dev_index_hash(net, dev->ifindex));
+ write_unlock_bh(&dev_base_lock);
+
+ dev_base_seq_inc(net);
+}
+
+/* Device list removal
+ * caller must respect a RCU grace period before freeing/reusing dev
+ */
+static void unlist_netdevice(struct net_device *dev)
+{
+ ASSERT_RTNL();
+
+ /* Unlink dev from the device chain */
+ write_lock_bh(&dev_base_lock);
+ list_del_rcu(&dev->dev_list);
+ hlist_del_rcu(&dev->name_hlist);
+ hlist_del_rcu(&dev->index_hlist);
+ write_unlock_bh(&dev_base_lock);
+
+ dev_base_seq_inc(dev_net(dev));
+}
+
+/*
+ * Our notifier list
+ */
+
+static RAW_NOTIFIER_HEAD(netdev_chain);
+
+/*
+ * Device drivers call our routines to queue packets here. We empty the
+ * queue in the local softnet handler.
+ */
+
+DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
+EXPORT_PER_CPU_SYMBOL(softnet_data);
+
+#ifdef CONFIG_LOCKDEP
+/*
+ * register_netdevice() inits txq->_xmit_lock and sets lockdep class
+ * according to dev->type
+ */
+static const unsigned short netdev_lock_type[] = {
+ ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
+ ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
+ ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
+ ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
+ ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
+ ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
+ ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
+ ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
+ ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
+ ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
+ ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
+ ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
+ ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
+ ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
+ ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
+
+static const char *const netdev_lock_name[] = {
+ "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
+ "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
+ "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
+ "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
+ "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
+ "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
+ "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
+ "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
+ "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
+ "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
+ "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
+ "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
+ "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
+ "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
+ "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
+
+static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
+static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
+
+static inline unsigned short netdev_lock_pos(unsigned short dev_type)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
+ if (netdev_lock_type[i] == dev_type)
+ return i;
+ /* the last key is used by default */
+ return ARRAY_SIZE(netdev_lock_type) - 1;
+}
+
+static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
+ unsigned short dev_type)
+{
+ int i;
+
+ i = netdev_lock_pos(dev_type);
+ lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
+ netdev_lock_name[i]);
+}
+
+static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
+{
+ int i;
+
+ i = netdev_lock_pos(dev->type);
+ lockdep_set_class_and_name(&dev->addr_list_lock,
+ &netdev_addr_lock_key[i],
+ netdev_lock_name[i]);
+}
+#else
+static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
+ unsigned short dev_type)
+{
+}
+static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
+{
+}
+#endif
+
+/*******************************************************************************
+ *
+ * Protocol management and registration routines
+ *
+ *******************************************************************************/
+
+
+/*
+ * Add a protocol ID to the list. Now that the input handler is
+ * smarter we can dispense with all the messy stuff that used to be
+ * here.
+ *
+ * BEWARE!!! Protocol handlers, mangling input packets,
+ * MUST BE last in hash buckets and checking protocol handlers
+ * MUST start from promiscuous ptype_all chain in net_bh.
+ * It is true now, do not change it.
+ * Explanation follows: if protocol handler, mangling packet, will
+ * be the first on list, it is not able to sense, that packet
+ * is cloned and should be copied-on-write, so that it will
+ * change it and subsequent readers will get broken packet.
+ * --ANK (980803)
+ */
+
+static inline struct list_head *ptype_head(const struct packet_type *pt)
+{
+ if (pt->type == htons(ETH_P_ALL))
+ return pt->dev ? &pt->dev->ptype_all : &ptype_all;
+ else
+ return pt->dev ? &pt->dev->ptype_specific :
+ &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+}
+
+/**
+ * dev_add_pack - add packet handler
+ * @pt: packet type declaration
+ *
+ * Add a protocol handler to the networking stack. The passed &packet_type
+ * is linked into kernel lists and may not be freed until it has been
+ * removed from the kernel lists.
+ *
+ * This call does not sleep therefore it can not
+ * guarantee all CPU's that are in middle of receiving packets
+ * will see the new packet type (until the next received packet).
+ */
+
+void dev_add_pack(struct packet_type *pt)
+{
+ struct list_head *head = ptype_head(pt);
+
+ spin_lock(&ptype_lock);
+ list_add_rcu(&pt->list, head);
+ spin_unlock(&ptype_lock);
+}
+EXPORT_SYMBOL(dev_add_pack);
+
+/**
+ * __dev_remove_pack - remove packet handler
+ * @pt: packet type declaration
+ *
+ * Remove a protocol handler that was previously added to the kernel
+ * protocol handlers by dev_add_pack(). The passed &packet_type is removed
+ * from the kernel lists and can be freed or reused once this function
+ * returns.
+ *
+ * The packet type might still be in use by receivers
+ * and must not be freed until after all the CPU's have gone
+ * through a quiescent state.
+ */
+void __dev_remove_pack(struct packet_type *pt)
+{
+ struct list_head *head = ptype_head(pt);
+ struct packet_type *pt1;
+
+ spin_lock(&ptype_lock);
+
+ list_for_each_entry(pt1, head, list) {
+ if (pt == pt1) {
+ list_del_rcu(&pt->list);
+ goto out;
+ }
+ }
+
+ pr_warn("dev_remove_pack: %p not found\n", pt);
+out:
+ spin_unlock(&ptype_lock);
+}
+EXPORT_SYMBOL(__dev_remove_pack);
+
+/**
+ * dev_remove_pack - remove packet handler
+ * @pt: packet type declaration
+ *
+ * Remove a protocol handler that was previously added to the kernel
+ * protocol handlers by dev_add_pack(). The passed &packet_type is removed
+ * from the kernel lists and can be freed or reused once this function
+ * returns.
+ *
+ * This call sleeps to guarantee that no CPU is looking at the packet
+ * type after return.
+ */
+void dev_remove_pack(struct packet_type *pt)
+{
+ __dev_remove_pack(pt);
+
+ synchronize_net();
+}
+EXPORT_SYMBOL(dev_remove_pack);
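
As a usage note for the registration API documented above, a hedged sketch of a promiscuous tap; my_pkt_rcv() and my_pkt_type are hypothetical names, while real in-tree examples are ip_packet_type in net/ipv4/af_inet.c and the AF_PACKET taps.

#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Sketch only: a handler registered on the ptype_all chain. */
static int my_pkt_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	/* Taps see shared/cloned skbs; treat the data as read-only. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type my_pkt_type __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),		/* tap every protocol */
	.func = my_pkt_rcv,
};

/* dev_add_pack(&my_pkt_type) at init time; dev_remove_pack(&my_pkt_type),
 * which ends in synchronize_net(), before the handler can be freed.
 */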
+
+
+/**
+ * dev_add_offload - register offload handlers
+ * @po: protocol offload declaration
+ *
+ * Add protocol offload handlers to the networking stack. The passed
+ * &proto_offload is linked into kernel lists and may not be freed until
+ * it has been removed from the kernel lists.
+ *
+ * This call does not sleep therefore it can not
+ * guarantee all CPU's that are in middle of receiving packets
+ * will see the new offload handlers (until the next received packet).
+ */
+void dev_add_offload(struct packet_offload *po)
+{
+ struct packet_offload *elem;
+
+ spin_lock(&offload_lock);
+ list_for_each_entry(elem, &offload_base, list) {
+ if (po->priority < elem->priority)
+ break;
+ }
+ list_add_rcu(&po->list, elem->list.prev);
+ spin_unlock(&offload_lock);
+}
+EXPORT_SYMBOL(dev_add_offload);
+
+/**
+ * __dev_remove_offload - remove offload handler
+ * @po: packet offload declaration
+ *
+ * Remove a protocol offload handler that was previously added to the
+ * kernel offload handlers by dev_add_offload(). The passed &offload_type
+ * is removed from the kernel lists and can be freed or reused once this
+ * function returns.
+ *
+ * The packet type might still be in use by receivers
+ * and must not be freed until after all the CPU's have gone
+ * through a quiescent state.
+ */
+static void __dev_remove_offload(struct packet_offload *po)
+{
+ struct list_head *head = &offload_base;
+ struct packet_offload *po1;
+
+ spin_lock(&offload_lock);
+
+ list_for_each_entry(po1, head, list) {
+ if (po == po1) {
+ list_del_rcu(&po->list);
+ goto out;
+ }
+ }
+
+ pr_warn("dev_remove_offload: %p not found\n", po);
+out:
+ spin_unlock(&offload_lock);
+}
+
+/**
+ * dev_remove_offload - remove packet offload handler
+ * @po: packet offload declaration
+ *
+ * Remove a packet offload handler that was previously added to the kernel
+ * offload handlers by dev_add_offload(). The passed &offload_type is
+ * removed from the kernel lists and can be freed or reused once this
+ * function returns.
+ *
+ * This call sleeps to guarantee that no CPU is looking at the packet
+ * type after return.
+ */
+void dev_remove_offload(struct packet_offload *po)
+{
+ __dev_remove_offload(po);
+
+ synchronize_net();
+}
+EXPORT_SYMBOL(dev_remove_offload);
+
+/******************************************************************************
+ *
+ * Device Boot-time Settings Routines
+ *
+ ******************************************************************************/
+
+/* Boot time configuration table */
+static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
+
+/**
+ * netdev_boot_setup_add - add new setup entry
+ * @name: name of the device
+ * @map: configured settings for the device
+ *
+ * Adds new setup entry to the dev_boot_setup list. The function
+ * returns 0 on error and 1 on success. This is a generic routine for
+ * all netdevices.
+ */
+static int netdev_boot_setup_add(char *name, struct ifmap *map)
+{
+ struct netdev_boot_setup *s;
+ int i;
+
+ s = dev_boot_setup;
+ for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
+ if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
+ memset(s[i].name, 0, sizeof(s[i].name));
+ strlcpy(s[i].name, name, IFNAMSIZ);
+ memcpy(&s[i].map, map, sizeof(s[i].map));
+ break;
+ }
+ }
+
+ return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
+}
+
+/**
+ * netdev_boot_setup_check - check boot time settings
+ * @dev: the netdevice
+ *
+ * Check boot time settings for the device.
+ * The found settings are set for the device to be used
+ * later in the device probing.
+ * Returns 0 if no settings found, 1 if they are.
+ */
+int netdev_boot_setup_check(struct net_device *dev)
+{
+ struct netdev_boot_setup *s = dev_boot_setup;
+ int i;
+
+ for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
+ if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
+ !strcmp(dev->name, s[i].name)) {
+ dev->irq = s[i].map.irq;
+ dev->base_addr = s[i].map.base_addr;
+ dev->mem_start = s[i].map.mem_start;
+ dev->mem_end = s[i].map.mem_end;
+ return 1;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL(netdev_boot_setup_check);
+
+
+/**
+ * netdev_boot_base - get address from boot time settings
+ * @prefix: prefix for network device
+ * @unit: id for network device
+ *
+ * Check boot time settings for the base address of device.
+ * The found settings are set for the device to be used
+ * later in the device probing.
+ * Returns 0 if no settings found.
+ */
+unsigned long netdev_boot_base(const char *prefix, int unit)
+{
+ const struct netdev_boot_setup *s = dev_boot_setup;
+ char name[IFNAMSIZ];
+ int i;
+
+ sprintf(name, "%s%d", prefix, unit);
+
+ /*
+ * If device already registered then return base of 1
+ * to indicate not to probe for this interface
+ */
+ if (__dev_get_by_name(&init_net, name))
+ return 1;
+
+ for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
+ if (!strcmp(name, s[i].name))
+ return s[i].map.base_addr;
+ return 0;
+}
+
+/*
+ * Saves at boot time configured settings for any netdevice.
+ */
+int __init netdev_boot_setup(char *str)
+{
+ int ints[5];
+ struct ifmap map;
+
+ str = get_options(str, ARRAY_SIZE(ints), ints);
+ if (!str || !*str)
+ return 0;
+
+ /* Save settings */
+ memset(&map, 0, sizeof(map));
+ if (ints[0] > 0)
+ map.irq = ints[1];
+ if (ints[0] > 1)
+ map.base_addr = ints[2];
+ if (ints[0] > 2)
+ map.mem_start = ints[3];
+ if (ints[0] > 3)
+ map.mem_end = ints[4];
+
+ /* Add new entry to the list */
+ return netdev_boot_setup_add(str, &map);
+}
+
+__setup("netdev=", netdev_boot_setup);
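
For example, booting with something like netdev=9,0x300,eth0 would record irq 9 and I/O base 0x300 for eth0 in dev_boot_setup, to be picked up later by netdev_boot_setup_check(); the values here are only illustrative.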
+
+/*******************************************************************************
+ *
+ * Device Interface Subroutines
+ *
+ *******************************************************************************/
+
+/**
+ * dev_get_iflink - get 'iflink' value of an interface
+ * @dev: targeted interface
+ *
+ * Indicates the ifindex the interface is linked to.
+ * Physical interfaces have the same 'ifindex' and 'iflink' values.
+ */
+
+int dev_get_iflink(const struct net_device *dev)
+{
+ if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
+ return dev->netdev_ops->ndo_get_iflink(dev);
+
+ return dev->ifindex;
+}
+EXPORT_SYMBOL(dev_get_iflink);
+
+/**
+ * dev_fill_metadata_dst - Retrieve tunnel egress information.
+ * @dev: targeted interface
+ * @skb: The packet.
+ *
+ * For better visibility of tunnel traffic OVS needs to retrieve
+ * egress tunnel information for a packet. The following API allows
+ * the user to get this info.
+ */
+int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
+{
+ struct ip_tunnel_info *info;
+
+ if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
+ return -EINVAL;
+
+ info = skb_tunnel_info_unclone(skb);
+ if (!info)
+ return -ENOMEM;
+ if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
+ return -EINVAL;
+
+ return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
+}
+EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
+
+/**
+ * __dev_get_by_name - find a device by its name
+ * @net: the applicable net namespace
+ * @name: name to find
+ *
+ * Find an interface by name. Must be called under RTNL semaphore
+ * or @dev_base_lock. If the name is found a pointer to the device
+ * is returned. If the name is not found then %NULL is returned. The
+ * reference counters are not incremented so the caller must be
+ * careful with locks.
+ */
+
+struct net_device *__dev_get_by_name(struct net *net, const char *name)
+{
+ struct net_device *dev;
+ struct hlist_head *head = dev_name_hash(net, name);
+
+ hlist_for_each_entry(dev, head, name_hlist)
+ if (!strncmp(dev->name, name, IFNAMSIZ))
+ return dev;
+
+ return NULL;
+}
+EXPORT_SYMBOL(__dev_get_by_name);
+
+/**
+ * dev_get_by_name_rcu - find a device by its name
+ * @net: the applicable net namespace
+ * @name: name to find
+ *
+ * Find an interface by name.
+ * If the name is found a pointer to the device is returned.
+ * If the name is not found then %NULL is returned.
+ * The reference counters are not incremented so the caller must be
+ * careful with locks. The caller must hold RCU lock.
+ */
+
+struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
+{
+ struct net_device *dev;
+ struct hlist_head *head = dev_name_hash(net, name);
+
+ hlist_for_each_entry_rcu(dev, head, name_hlist)
+ if (!strncmp(dev->name, name, IFNAMSIZ))
+ return dev;
+
+ return NULL;
+}
+EXPORT_SYMBOL(dev_get_by_name_rcu);
+
+/**
+ * dev_get_by_name - find a device by its name
+ * @net: the applicable net namespace
+ * @name: name to find
+ *
+ * Find an interface by name. This can be called from any
+ * context and does its own locking. The returned handle has
+ * the usage count incremented and the caller must use dev_put() to
+ * release it when it is no longer needed. %NULL is returned if no
+ * matching device is found.
+ */
+
+struct net_device *dev_get_by_name(struct net *net, const char *name)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_name_rcu(net, name);
+ if (dev)
+ dev_hold(dev);
+ rcu_read_unlock();
+ return dev;
+}
+EXPORT_SYMBOL(dev_get_by_name);
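
A short, hedged illustration of the two lookup disciplines described in the comments above: RCU-protected access without a reference versus dev_get_by_name() paired with dev_put(). The function and device names are made up for the example.

#include <linux/netdevice.h>
#include <linux/rcupdate.h>

static int my_lookup_example(struct net *net)
{
	struct net_device *dev;
	int ifindex = 0;

	/* Short-lived access: no refcount taken, must stay under RCU. */
	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, "eth0");
	if (dev)
		ifindex = dev->ifindex;
	rcu_read_unlock();
	/* 'dev' must not be dereferenced after rcu_read_unlock(). */

	/* Longer-lived access: reference taken, released with dev_put(). */
	dev = dev_get_by_name(net, "eth0");
	if (dev) {
		/* ... safe to use dev here, even across sleeping calls ... */
		dev_put(dev);
	}
	return ifindex;
}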
+
+/**
+ * __dev_get_by_index - find a device by its ifindex
+ * @net: the applicable net namespace
+ * @ifindex: index of device
+ *
+ * Search for an interface by index. Returns %NULL if the device
+ * is not found or a pointer to the device. The device has not
+ * had its reference counter increased so the caller must be careful
+ * about locking. The caller must hold either the RTNL semaphore
+ * or @dev_base_lock.
+ */
+
+struct net_device *__dev_get_by_index(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+ struct hlist_head *head = dev_index_hash(net, ifindex);
+
+ hlist_for_each_entry(dev, head, index_hlist)
+ if (dev->ifindex == ifindex)
+ return dev;
+
+ return NULL;
+}
+EXPORT_SYMBOL(__dev_get_by_index);
+
+/**
+ * dev_get_by_index_rcu - find a device by its ifindex
+ * @net: the applicable net namespace
+ * @ifindex: index of device
+ *
+ * Search for an interface by index. Returns %NULL if the device
+ * is not found or a pointer to the device. The device has not
+ * had its reference counter increased so the caller must be careful
+ * about locking. The caller must hold RCU lock.
+ */
+
+struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+ struct hlist_head *head = dev_index_hash(net, ifindex);
+
+ hlist_for_each_entry_rcu(dev, head, index_hlist)
+ if (dev->ifindex == ifindex)
+ return dev;
+
+ return NULL;
+}
+EXPORT_SYMBOL(dev_get_by_index_rcu);
+
+
+/**
+ * dev_get_by_index - find a device by its ifindex
+ * @net: the applicable net namespace
+ * @ifindex: index of device
+ *
+ * Search for an interface by index. Returns NULL if the device
+ * is not found or a pointer to the device. The device returned has
+ * had a reference added and the pointer is safe until the user calls
+ * dev_put to indicate they have finished with it.
+ */
+
+struct net_device *dev_get_by_index(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, ifindex);
+ if (dev)
+ dev_hold(dev);
+ rcu_read_unlock();
+ return dev;
+}
+EXPORT_SYMBOL(dev_get_by_index);
+
+/**
+ * dev_get_by_napi_id - find a device by napi_id
+ * @napi_id: ID of the NAPI struct
+ *
+ * Search for an interface by NAPI ID. Returns %NULL if the device
+ * is not found or a pointer to the device. The device has not had
+ * its reference counter increased so the caller must be careful
+ * about locking. The caller must hold RCU lock.
+ */
+
+struct net_device *dev_get_by_napi_id(unsigned int napi_id)
+{
+ struct napi_struct *napi;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (napi_id < MIN_NAPI_ID)
+ return NULL;
+
+ napi = napi_by_id(napi_id);
+
+ return napi ? napi->dev : NULL;
+}
+EXPORT_SYMBOL(dev_get_by_napi_id);
+
+/**
+ * netdev_get_name - get a netdevice name, knowing its ifindex.
+ * @net: network namespace
+ * @name: a pointer to the buffer where the name will be stored.
+ * @ifindex: the ifindex of the interface to get the name from.
+ */
+int netdev_get_name(struct net *net, char *name, int ifindex)
+{
+ struct net_device *dev;
+ int ret;
+
+ down_read(&devnet_rename_sem);
+ rcu_read_lock();
+
+ dev = dev_get_by_index_rcu(net, ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ strcpy(name, dev->name);
+
+ ret = 0;
+out:
+ rcu_read_unlock();
+ up_read(&devnet_rename_sem);
+ return ret;
+}
+
+/**
+ * dev_getbyhwaddr_rcu - find a device by its hardware address
+ * @net: the applicable net namespace
+ * @type: media type of device
+ * @ha: hardware address
+ *
+ * Search for an interface by MAC address. Returns NULL if the device
+ * is not found or a pointer to the device.
+ * The caller must hold RCU or RTNL.
+ * The returned device has not had its ref count increased
+ * and the caller must therefore be careful about locking
+ *
+ */
+
+struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
+ const char *ha)
+{
+ struct net_device *dev;
+
+ for_each_netdev_rcu(net, dev)
+ if (dev->type == type &&
+ !memcmp(dev->dev_addr, ha, dev->addr_len))
+ return dev;
+
+ return NULL;
+}
+EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
+
+struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
+{
+ struct net_device *dev;
+
+ ASSERT_RTNL();
+ for_each_netdev(net, dev)
+ if (dev->type == type)
+ return dev;
+
+ return NULL;
+}
+EXPORT_SYMBOL(__dev_getfirstbyhwtype);
+
+struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
+{
+ struct net_device *dev, *ret = NULL;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev)
+ if (dev->type == type) {
+ dev_hold(dev);
+ ret = dev;
+ break;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(dev_getfirstbyhwtype);
+
+/**
+ * __dev_get_by_flags - find any device with given flags
+ * @net: the applicable net namespace
+ * @if_flags: IFF_* values
+ * @mask: bitmask of bits in if_flags to check
+ *
+ * Search for any interface with the given flags. Returns NULL if a device
+ * is not found or a pointer to the device. Must be called inside
+ * rtnl_lock(), and result refcount is unchanged.
+ */
+
+struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
+ unsigned short mask)
+{
+ struct net_device *dev, *ret;
+
+ ASSERT_RTNL();
+
+ ret = NULL;
+ for_each_netdev(net, dev) {
+ if (((dev->flags ^ if_flags) & mask) == 0) {
+ ret = dev;
+ break;
+ }
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__dev_get_by_flags);
+
+/**
+ * dev_valid_name - check if name is okay for network device
+ * @name: name string
+ *
+ * Network device names need to be valid file names to
+ * allow sysfs to work. We also disallow any kind of
+ * whitespace.
+ */
+bool dev_valid_name(const char *name)
+{
+ if (*name == '\0')
+ return false;
+ if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
+ return false;
+ if (!strcmp(name, ".") || !strcmp(name, ".."))
+ return false;
+
+ while (*name) {
+ if (*name == '/' || *name == ':' || isspace(*name))
+ return false;
+ name++;
+ }
+ return true;
+}
+EXPORT_SYMBOL(dev_valid_name);
+
+/**
+ * __dev_alloc_name - allocate a name for a device
+ * @net: network namespace to allocate the device name in
+ * @name: name format string
+ * @buf: scratch buffer and result name string
+ *
+ * Passed a format string - eg "lt%d" it will try and find a suitable
+ * id. It scans list of devices to build up a free map, then chooses
+ * the first empty slot. The caller must hold the dev_base or rtnl lock
+ * while allocating the name and adding the device in order to avoid
+ * duplicates.
+ * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
+ * Returns the number of the unit assigned or a negative errno code.
+ */
+
+static int __dev_alloc_name(struct net *net, const char *name, char *buf)
+{
+ int i = 0;
+ const char *p;
+ const int max_netdevices = 8*PAGE_SIZE;
+ unsigned long *inuse;
+ struct net_device *d;
+
+ if (!dev_valid_name(name))
+ return -EINVAL;
+
+ p = strchr(name, '%');
+ if (p) {
+ /*
+ * Verify the string as this thing may have come from
+ * the user. There must be either one "%d" and no other "%"
+ * characters.
+ */
+ if (p[1] != 'd' || strchr(p + 2, '%'))
+ return -EINVAL;
+
+ /* Use one page as a bit array of possible slots */
+ inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
+ if (!inuse)
+ return -ENOMEM;
+
+ for_each_netdev(net, d) {
+ if (!sscanf(d->name, name, &i))
+ continue;
+ if (i < 0 || i >= max_netdevices)
+ continue;
+
+ /* avoid cases where sscanf is not exact inverse of printf */
+ snprintf(buf, IFNAMSIZ, name, i);
+ if (!strncmp(buf, d->name, IFNAMSIZ))
+ set_bit(i, inuse);
+ }
+
+ i = find_first_zero_bit(inuse, max_netdevices);
+ free_page((unsigned long) inuse);
+ }
+
+ snprintf(buf, IFNAMSIZ, name, i);
+ if (!__dev_get_by_name(net, buf))
+ return i;
+
+ /* It is possible to run out of possible slots
+ * when the name is long and there isn't enough space left
+ * for the digits, or if all bits are used.
+ */
+ return -ENFILE;
+}
+
+static int dev_alloc_name_ns(struct net *net,
+ struct net_device *dev,
+ const char *name)
+{
+ char buf[IFNAMSIZ];
+ int ret;
+
+ BUG_ON(!net);
+ ret = __dev_alloc_name(net, name, buf);
+ if (ret >= 0)
+ strlcpy(dev->name, buf, IFNAMSIZ);
+ return ret;
+}
+
+/**
+ * dev_alloc_name - allocate a name for a device
+ * @dev: device
+ * @name: name format string
+ *
+ * Passed a format string - eg "lt%d" - it will try to find a suitable
+ * id. It scans the list of devices to build up a free map, then chooses
+ * the first empty slot. The caller must hold the dev_base or rtnl lock
+ * while allocating the name and adding the device in order to avoid
+ * duplicates.
+ * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
+ * Returns the number of the unit assigned or a negative errno code.
+ */
+
+int dev_alloc_name(struct net_device *dev, const char *name)
+{
+ return dev_alloc_name_ns(dev_net(dev), dev, name);
+}
+EXPORT_SYMBOL(dev_alloc_name);
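+
+/* Hypothetical usage sketch (illustrative only): how a driver could use
+ * dev_alloc_name() with a "%d" template just before registration. The
+ * "foo%d" template and this helper are assumptions, not real users.
+ */
+static inline int example_name_from_template(struct net_device *dev)
+{
+	int unit;
+
+	/* Caller holds rtnl_lock(), as required for the free-map scan.
+	 * Picks the first free "foo<N>" slot and writes it into dev->name.
+	 */
+	unit = dev_alloc_name(dev, "foo%d");
+	if (unit < 0)
+		return unit;	/* -EINVAL or -ENFILE */
+	return 0;
+}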
+
+int dev_get_valid_name(struct net *net, struct net_device *dev,
+ const char *name)
+{
+ BUG_ON(!net);
+
+ if (!dev_valid_name(name))
+ return -EINVAL;
+
+ if (strchr(name, '%'))
+ return dev_alloc_name_ns(net, dev, name);
+ else if (__dev_get_by_name(net, name))
+ return -EEXIST;
+ else if (dev->name != name)
+ strlcpy(dev->name, name, IFNAMSIZ);
+
+ return 0;
+}
+EXPORT_SYMBOL(dev_get_valid_name);
+
+/**
+ * dev_change_name - change name of a device
+ * @dev: device
+ * @newname: name (or format string) must be at least IFNAMSIZ
+ *
+ * Change the name of a device. Format strings such as "eth%d" can be
+ * passed for wildcarding.
+ */
+int dev_change_name(struct net_device *dev, const char *newname)
+{
+ unsigned char old_assign_type;
+ char oldname[IFNAMSIZ];
+ int err = 0;
+ int ret;
+ struct net *net;
+
+ ASSERT_RTNL();
+ BUG_ON(!dev_net(dev));
+
+ net = dev_net(dev);
+
+ /* Some auto-enslaved devices e.g. failover slaves are
+ * special, as userspace might rename the device after
+ * the interface had been brought up and running since
+ * the point kernel initiated auto-enslavement. Allow
+ * live name change even when these slave devices are
+ * up and running.
+ *
+ * Typically, users of these auto-enslaving devices
+ * don't actually care about slave name change, as
+ * they are supposed to operate on master interface
+ * directly.
+ */
+ if (dev->flags & IFF_UP &&
+ likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
+ return -EBUSY;
+
+ down_write(&devnet_rename_sem);
+
+ if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
+ up_write(&devnet_rename_sem);
+ return 0;
+ }
+
+ memcpy(oldname, dev->name, IFNAMSIZ);
+
+ err = dev_get_valid_name(net, dev, newname);
+ if (err < 0) {
+ up_write(&devnet_rename_sem);
+ return err;
+ }
+
+ if (oldname[0] && !strchr(oldname, '%'))
+ netdev_info(dev, "renamed from %s\n", oldname);
+
+ old_assign_type = dev->name_assign_type;
+ dev->name_assign_type = NET_NAME_RENAMED;
+
+rollback:
+ ret = device_rename(&dev->dev, dev->name);
+ if (ret) {
+ memcpy(dev->name, oldname, IFNAMSIZ);
+ dev->name_assign_type = old_assign_type;
+ up_write(&devnet_rename_sem);
+ return ret;
+ }
+
+ up_write(&devnet_rename_sem);
+
+ netdev_adjacent_rename_links(dev, oldname);
+
+ write_lock_bh(&dev_base_lock);
+ hlist_del_rcu(&dev->name_hlist);
+ write_unlock_bh(&dev_base_lock);
+
+ synchronize_rcu();
+
+ write_lock_bh(&dev_base_lock);
+ hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
+ write_unlock_bh(&dev_base_lock);
+
+ ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
+ ret = notifier_to_errno(ret);
+
+ if (ret) {
+ /* err >= 0 after dev_alloc_name(), or it already stores the first errno */
+ if (err >= 0) {
+ err = ret;
+ down_write(&devnet_rename_sem);
+ memcpy(dev->name, oldname, IFNAMSIZ);
+ memcpy(oldname, newname, IFNAMSIZ);
+ dev->name_assign_type = old_assign_type;
+ old_assign_type = NET_NAME_RENAMED;
+ goto rollback;
+ } else {
+ pr_err("%s: name change rollback failed: %d\n",
+ dev->name, ret);
+ }
+ }
+
+ return err;
+}
+
+/**
+ * dev_set_alias - change ifalias of a device
+ * @dev: device
+ * @alias: name up to IFALIASZ
+ * @len: limit of bytes to copy from @alias
+ *
+ * Set the ifalias for a device.
+ */
+int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
+{
+ struct dev_ifalias *new_alias = NULL;
+
+ if (len >= IFALIASZ)
+ return -EINVAL;
+
+ if (len) {
+ new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
+ if (!new_alias)
+ return -ENOMEM;
+
+ memcpy(new_alias->ifalias, alias, len);
+ new_alias->ifalias[len] = 0;
+ }
+
+ mutex_lock(&ifalias_mutex);
+ rcu_swap_protected(dev->ifalias, new_alias,
+ mutex_is_locked(&ifalias_mutex));
+ mutex_unlock(&ifalias_mutex);
+
+ if (new_alias)
+ kfree_rcu(new_alias, rcuhead);
+
+ return len;
+}
+EXPORT_SYMBOL(dev_set_alias);
+
+/**
+ * dev_get_alias - get ifalias of a device
+ * @dev: device
+ * @name: buffer to store name of ifalias
+ * @len: size of buffer
+ *
+ * Get the ifalias for a device. The caller must make sure dev cannot go
+ * away, e.g. by holding the RCU read lock or a reference count on the device.
+ */
+int dev_get_alias(const struct net_device *dev, char *name, size_t len)
+{
+ const struct dev_ifalias *alias;
+ int ret = 0;
+
+ rcu_read_lock();
+ alias = rcu_dereference(dev->ifalias);
+ if (alias)
+ ret = snprintf(name, len, "%s", alias->ifalias);
+ rcu_read_unlock();
+
+ return ret;
+}
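+
+/* Hypothetical usage sketch (illustrative only): pairing dev_set_alias()
+ * with dev_get_alias(); the alias text and this helper are arbitrary
+ * examples.
+ */
+static inline void example_alias_roundtrip(struct net_device *dev)
+{
+	static const char alias[] = "uplink to core switch";
+	char buf[IFALIASZ];
+
+	/* Returns the number of bytes stored, or a negative errno */
+	if (dev_set_alias(dev, alias, strlen(alias)) < 0)
+		return;
+
+	/* Readers are RCU protected, so no extra locking is needed here */
+	dev_get_alias(dev, buf, sizeof(buf));
+}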
+
+/**
+ * netdev_features_change - device changes features
+ * @dev: device to cause notification
+ *
+ * Called to indicate a device has changed features.
+ */
+void netdev_features_change(struct net_device *dev)
+{
+ call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
+}
+EXPORT_SYMBOL(netdev_features_change);
+
+/**
+ * netdev_state_change - device changes state
+ * @dev: device to cause notification
+ *
+ * Called to indicate a device has changed state. This function calls
+ * the notifier chains for netdev_chain and sends a NEWLINK message
+ * to the routing socket.
+ */
+void netdev_state_change(struct net_device *dev)
+{
+ if (dev->flags & IFF_UP) {
+ struct netdev_notifier_change_info change_info = {
+ .info.dev = dev,
+ };
+
+ call_netdevice_notifiers_info(NETDEV_CHANGE,
+ &change_info.info);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
+ }
+}
+EXPORT_SYMBOL(netdev_state_change);
+
+/**
+ * netdev_notify_peers - notify network peers about existence of @dev
+ * @dev: network device
+ *
+ * Generate traffic such that interested network peers are aware of
+ * @dev, such as by generating a gratuitous ARP. This may be used when
+ * a device wants to inform the rest of the network about some sort of
+ * reconfiguration such as a failover event or virtual machine
+ * migration.
+ */
+void netdev_notify_peers(struct net_device *dev)
+{
+ rtnl_lock();
+ call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
+ call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL(netdev_notify_peers);
+
+static int __dev_open(struct net_device *dev)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int ret;
+
+ ASSERT_RTNL();
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ /* Block netpoll from trying to do any rx path servicing.
+ * If we don't do this there is a chance ndo_poll_controller
+ * or ndo_poll may be running while we open the device
+ */
+ netpoll_poll_disable(dev);
+
+ ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ return ret;
+
+ set_bit(__LINK_STATE_START, &dev->state);
+
+ if (ops->ndo_validate_addr)
+ ret = ops->ndo_validate_addr(dev);
+
+ if (!ret && ops->ndo_open)
+ ret = ops->ndo_open(dev);
+
+ netpoll_poll_enable(dev);
+
+ if (ret)
+ clear_bit(__LINK_STATE_START, &dev->state);
+ else {
+ dev->flags |= IFF_UP;
+ dev_set_rx_mode(dev);
+ dev_activate(dev);
+ add_device_randomness(dev->dev_addr, dev->addr_len);
+ }
+
+ return ret;
+}
+
+/**
+ * dev_open - prepare an interface for use.
+ * @dev: device to open
+ *
+ * Takes a device from down to up state. The device's private open
+ * function is invoked and then the multicast lists are loaded. Finally
+ * the device is moved into the up state and a %NETDEV_UP message is
+ * sent to the netdev notifier chain.
+ *
+ * Calling this function on an active interface is a nop. On a failure
+ * a negative errno code is returned.
+ */
+int dev_open(struct net_device *dev)
+{
+ int ret;
+
+ if (dev->flags & IFF_UP)
+ return 0;
+
+ ret = __dev_open(dev);
+ if (ret < 0)
+ return ret;
+
+ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
+ call_netdevice_notifiers(NETDEV_UP, dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_open);
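+
+/* Hypothetical usage sketch (illustrative only): bouncing an interface from
+ * kernel code. Both dev_close() and dev_open() must run with rtnl_lock()
+ * held, as __dev_open() asserts; the helper itself is an assumption.
+ */
+static inline int example_bounce_interface(struct net_device *dev)
+{
+	int err;
+
+	rtnl_lock();
+	dev_close(dev);		/* no-op if the device is already down */
+	err = dev_open(dev);	/* returns 0 if the device is already up */
+	rtnl_unlock();
+
+	return err;
+}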
+
+static void __dev_close_many(struct list_head *head)
+{
+ struct net_device *dev;
+
+ ASSERT_RTNL();
+ might_sleep();
+
+ list_for_each_entry(dev, head, close_list) {
+ /* Temporarily disable netpoll until the interface is down */
+ netpoll_poll_disable(dev);
+
+ call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
+
+ clear_bit(__LINK_STATE_START, &dev->state);
+
+ /* Synchronize to scheduled poll. We cannot touch the poll list, it
+ * can even be on a different cpu. So just clear netif_running().
+ *
+ * dev->stop() will invoke napi_disable() on all of its
+ * napi_struct instances on this device.
+ */
+ smp_mb__after_atomic(); /* Commit netif_running(). */
+ }
+
+ dev_deactivate_many(head);
+
+ list_for_each_entry(dev, head, close_list) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ /*
+ * Call the device specific close. This cannot fail and is
+ * only done if the device is UP.
+ *
+ * We allow it to be called even after a DETACH hot-plug
+ * event.
+ */
+ if (ops->ndo_stop)
+ ops->ndo_stop(dev);
+
+ dev->flags &= ~IFF_UP;
+ netpoll_poll_enable(dev);
+ }
+}
+
+static void __dev_close(struct net_device *dev)
+{
+ LIST_HEAD(single);
+
+ list_add(&dev->close_list, &single);
+ __dev_close_many(&single);
+ list_del(&single);
+}
+
+void dev_close_many(struct list_head *head, bool unlink)
+{
+ struct net_device *dev, *tmp;
+
+ /* Remove the devices that don't need to be closed */
+ list_for_each_entry_safe(dev, tmp, head, close_list)
+ if (!(dev->flags & IFF_UP))
+ list_del_init(&dev->close_list);
+
+ __dev_close_many(head);
+
+ list_for_each_entry_safe(dev, tmp, head, close_list) {
+ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
+ call_netdevice_notifiers(NETDEV_DOWN, dev);
+ if (unlink)
+ list_del_init(&dev->close_list);
+ }
+}
+EXPORT_SYMBOL(dev_close_many);
+
+/**
+ * dev_close - shutdown an interface.
+ * @dev: device to shutdown
+ *
+ * This function moves an active device into down state. A
+ * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
+ * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
+ * chain.
+ */
+void dev_close(struct net_device *dev)
+{
+ if (dev->flags & IFF_UP) {
+ LIST_HEAD(single);
+
+ list_add(&dev->close_list, &single);
+ dev_close_many(&single, true);
+ list_del(&single);
+ }
+}
+EXPORT_SYMBOL(dev_close);
+
+
+/**
+ * dev_disable_lro - disable Large Receive Offload on a device
+ * @dev: device
+ *
+ * Disable Large Receive Offload (LRO) on a net device. Must be
+ * called under RTNL. This is needed if received packets may be
+ * forwarded to another interface.
+ */
+void dev_disable_lro(struct net_device *dev)
+{
+ struct net_device *lower_dev;
+ struct list_head *iter;
+
+ dev->wanted_features &= ~NETIF_F_LRO;
+ netdev_update_features(dev);
+
+ if (unlikely(dev->features & NETIF_F_LRO))
+ netdev_WARN(dev, "failed to disable LRO!\n");
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter)
+ dev_disable_lro(lower_dev);
+}
+EXPORT_SYMBOL(dev_disable_lro);
+
+/**
+ * dev_disable_gro_hw - disable HW Generic Receive Offload on a device
+ * @dev: device
+ *
+ * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be
+ * called under RTNL. This is needed if Generic XDP is installed on
+ * the device.
+ */
+static void dev_disable_gro_hw(struct net_device *dev)
+{
+ dev->wanted_features &= ~NETIF_F_GRO_HW;
+ netdev_update_features(dev);
+
+ if (unlikely(dev->features & NETIF_F_GRO_HW))
+ netdev_WARN(dev, "failed to disable GRO_HW!\n");
+}
+
+const char *netdev_cmd_to_name(enum netdev_cmd cmd)
+{
+#define N(val) \
+ case NETDEV_##val: \
+ return "NETDEV_" __stringify(val);
+ switch (cmd) {
+ N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
+ N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
+ N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
+ N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
+ N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
+ N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
+ N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
+ N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
+ N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
+ }
+#undef N
+ return "UNKNOWN_NETDEV_EVENT";
+}
+EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
+
+static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
+ struct net_device *dev)
+{
+ struct netdev_notifier_info info = {
+ .dev = dev,
+ };
+
+ return nb->notifier_call(nb, val, &info);
+}
+
+static int dev_boot_phase = 1;
+
+/**
+ * register_netdevice_notifier - register a network notifier block
+ * @nb: notifier
+ *
+ * Register a notifier to be called when network device events occur.
+ * The notifier passed is linked into the kernel structures and must
+ * not be reused until it has been unregistered. A negative errno code
+ * is returned on a failure.
+ *
+ * When registered, all registration and up events are replayed
+ * to the new notifier to allow it to have a race-free view of the
+ * network device list.
+ */
+
+int register_netdevice_notifier(struct notifier_block *nb)
+{
+ struct net_device *dev;
+ struct net_device *last;
+ struct net *net;
+ int err;
+
+ /* Close race with setup_net() and cleanup_net() */
+ down_write(&pernet_ops_rwsem);
+ rtnl_lock();
+ err = raw_notifier_chain_register(&netdev_chain, nb);
+ if (err)
+ goto unlock;
+ if (dev_boot_phase)
+ goto unlock;
+ for_each_net(net) {
+ for_each_netdev(net, dev) {
+ err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
+ err = notifier_to_errno(err);
+ if (err)
+ goto rollback;
+
+ if (!(dev->flags & IFF_UP))
+ continue;
+
+ call_netdevice_notifier(nb, NETDEV_UP, dev);
+ }
+ }
+
+unlock:
+ rtnl_unlock();
+ up_write(&pernet_ops_rwsem);
+ return err;
+
+rollback:
+ last = dev;
+ for_each_net(net) {
+ for_each_netdev(net, dev) {
+ if (dev == last)
+ goto outroll;
+
+ if (dev->flags & IFF_UP) {
+ call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
+ dev);
+ call_netdevice_notifier(nb, NETDEV_DOWN, dev);
+ }
+ call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
+ }
+ }
+
+outroll:
+ raw_notifier_chain_unregister(&netdev_chain, nb);
+ goto unlock;
+}
+EXPORT_SYMBOL(register_netdevice_notifier);
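+
+/* Hypothetical usage sketch (illustrative only): a minimal netdevice
+ * notifier as a subsystem might register it. The callback and
+ * notifier_block names are assumptions; teardown would pair this with
+ * unregister_netdevice_notifier(&example_netdev_nb).
+ */
+static int example_netdev_event(struct notifier_block *nb,
+				unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	switch (event) {
+	case NETDEV_UP:
+		netdev_info(dev, "example notifier: device is up\n");
+		break;
+	case NETDEV_UNREGISTER:
+		netdev_info(dev, "example notifier: device going away\n");
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block example_netdev_nb __maybe_unused = {
+	.notifier_call = example_netdev_event,
+	/* registered with register_netdevice_notifier(&example_netdev_nb) */
+};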
+
+/**
+ * unregister_netdevice_notifier - unregister a network notifier block
+ * @nb: notifier
+ *
+ * Unregister a notifier previously registered by
+ * register_netdevice_notifier(). The notifier is unlinked from the
+ * kernel structures and may then be reused. A negative errno code
+ * is returned on a failure.
+ *
+ * After unregistering, unregister and down device events are synthesized
+ * for all devices on the device list and delivered to the removed notifier,
+ * removing the need for special case cleanup code.
+ */
+
+int unregister_netdevice_notifier(struct notifier_block *nb)
+{
+ struct net_device *dev;
+ struct net *net;
+ int err;
+
+ /* Close race with setup_net() and cleanup_net() */
+ down_write(&pernet_ops_rwsem);
+ rtnl_lock();
+ err = raw_notifier_chain_unregister(&netdev_chain, nb);
+ if (err)
+ goto unlock;
+
+ for_each_net(net) {
+ for_each_netdev(net, dev) {
+ if (dev->flags & IFF_UP) {
+ call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
+ dev);
+ call_netdevice_notifier(nb, NETDEV_DOWN, dev);
+ }
+ call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
+ }
+ }
+unlock:
+ rtnl_unlock();
+ up_write(&pernet_ops_rwsem);
+ return err;
+}
+EXPORT_SYMBOL(unregister_netdevice_notifier);
+
+/**
+ * call_netdevice_notifiers_info - call all network notifier blocks
+ * @val: value passed unmodified to notifier function
+ * @info: notifier information data
+ *
+ * Call all network notifier blocks. Parameters and return value
+ * are as for raw_notifier_call_chain().
+ */
+
+static int call_netdevice_notifiers_info(unsigned long val,
+ struct netdev_notifier_info *info)
+{
+ ASSERT_RTNL();
+ return raw_notifier_call_chain(&netdev_chain, val, info);
+}
+
+/**
+ * call_netdevice_notifiers - call all network notifier blocks
+ * @val: value passed unmodified to notifier function
+ * @dev: net_device pointer passed unmodified to notifier function
+ *
+ * Call all network notifier blocks. Parameters and return value
+ * are as for raw_notifier_call_chain().
+ */
+
+int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
+{
+ struct netdev_notifier_info info = {
+ .dev = dev,
+ };
+
+ return call_netdevice_notifiers_info(val, &info);
+}
+EXPORT_SYMBOL(call_netdevice_notifiers);
+
+/**
+ * call_netdevice_notifiers_mtu - call all network notifier blocks
+ * @val: value passed unmodified to notifier function
+ * @dev: net_device pointer passed unmodified to notifier function
+ * @arg: additional u32 argument passed to the notifier function
+ *
+ * Call all network notifier blocks. Parameters and return value
+ * are as for raw_notifier_call_chain().
+ */
+static int call_netdevice_notifiers_mtu(unsigned long val,
+ struct net_device *dev, u32 arg)
+{
+ struct netdev_notifier_info_ext info = {
+ .info.dev = dev,
+ .ext.mtu = arg,
+ };
+
+ BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
+
+ return call_netdevice_notifiers_info(val, &info.info);
+}
+
+#ifdef CONFIG_NET_INGRESS
+static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
+
+void net_inc_ingress_queue(void)
+{
+ static_branch_inc(&ingress_needed_key);
+}
+EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
+
+void net_dec_ingress_queue(void)
+{
+ static_branch_dec(&ingress_needed_key);
+}
+EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
+#endif
+
+#ifdef CONFIG_NET_EGRESS
+static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
+
+void net_inc_egress_queue(void)
+{
+ static_branch_inc(&egress_needed_key);
+}
+EXPORT_SYMBOL_GPL(net_inc_egress_queue);
+
+void net_dec_egress_queue(void)
+{
+ static_branch_dec(&egress_needed_key);
+}
+EXPORT_SYMBOL_GPL(net_dec_egress_queue);
+#endif
+
+static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
+#ifdef CONFIG_JUMP_LABEL
+static atomic_t netstamp_needed_deferred;
+static atomic_t netstamp_wanted;
+static void netstamp_clear(struct work_struct *work)
+{
+ int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
+ int wanted;
+
+ wanted = atomic_add_return(deferred, &netstamp_wanted);
+ if (wanted > 0)
+ static_branch_enable(&netstamp_needed_key);
+ else
+ static_branch_disable(&netstamp_needed_key);
+}
+static DECLARE_WORK(netstamp_work, netstamp_clear);
+#endif
+
+void net_enable_timestamp(void)
+{
+#ifdef CONFIG_JUMP_LABEL
+ int wanted;
+
+ while (1) {
+ wanted = atomic_read(&netstamp_wanted);
+ if (wanted <= 0)
+ break;
+ if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
+ return;
+ }
+ atomic_inc(&netstamp_needed_deferred);
+ schedule_work(&netstamp_work);
+#else
+ static_branch_inc(&netstamp_needed_key);
+#endif
+}
+EXPORT_SYMBOL(net_enable_timestamp);
+
+void net_disable_timestamp(void)
+{
+#ifdef CONFIG_JUMP_LABEL
+ int wanted;
+
+ while (1) {
+ wanted = atomic_read(&netstamp_wanted);
+ if (wanted <= 1)
+ break;
+ if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
+ return;
+ }
+ atomic_dec(&netstamp_needed_deferred);
+ schedule_work(&netstamp_work);
+#else
+ static_branch_dec(&netstamp_needed_key);
+#endif
+}
+EXPORT_SYMBOL(net_disable_timestamp);
+
+static inline void net_timestamp_set(struct sk_buff *skb)
+{
+ skb->tstamp = 0;
+ if (static_branch_unlikely(&netstamp_needed_key))
+ __net_timestamp(skb);
+}
+
+#define net_timestamp_check(COND, SKB) \
+ if (static_branch_unlikely(&netstamp_needed_key)) { \
+ if ((COND) && !(SKB)->tstamp) \
+ __net_timestamp(SKB); \
+ } \
+
+bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
+{
+ unsigned int len;
+
+ if (!(dev->flags & IFF_UP))
+ return false;
+
+ len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
+ if (skb->len <= len)
+ return true;
+
+ /* if TSO is enabled, we don't care about the length as the packet
+ * could be forwarded without being segmented first
+ */
+ if (skb_is_gso(skb))
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(is_skb_forwardable);
+
+int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ int ret = ____dev_forward_skb(dev, skb);
+
+ if (likely(!ret)) {
+ skb->protocol = eth_type_trans(skb, dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__dev_forward_skb);
+
+/**
+ * dev_forward_skb - loopback an skb to another netif
+ *
+ * @dev: destination network device
+ * @skb: buffer to forward
+ *
+ * return values:
+ * NET_RX_SUCCESS (no congestion)
+ * NET_RX_DROP (packet was dropped, but freed)
+ *
+ * dev_forward_skb can be used for injecting an skb from the
+ * start_xmit function of one device into the receive queue
+ * of another device.
+ *
+ * The receiving device may be in another namespace, so
+ * we have to clear all information in the skb that could
+ * impact namespace isolation.
+ */
+int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
+}
+EXPORT_SYMBOL_GPL(dev_forward_skb);
+
+static inline int deliver_skb(struct sk_buff *skb,
+ struct packet_type *pt_prev,
+ struct net_device *orig_dev)
+{
+ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
+ return -ENOMEM;
+ refcount_inc(&skb->users);
+ return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+}
+
+static inline void deliver_ptype_list_skb(struct sk_buff *skb,
+ struct packet_type **pt,
+ struct net_device *orig_dev,
+ __be16 type,
+ struct list_head *ptype_list)
+{
+ struct packet_type *ptype, *pt_prev = *pt;
+
+ list_for_each_entry_rcu(ptype, ptype_list, list) {
+ if (ptype->type != type)
+ continue;
+ if (pt_prev)
+ deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = ptype;
+ }
+ *pt = pt_prev;
+}
+
+static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
+{
+ if (!ptype->af_packet_priv || !skb->sk)
+ return false;
+
+ if (ptype->id_match)
+ return ptype->id_match(ptype, skb->sk);
+ else if ((struct sock *)ptype->af_packet_priv == skb->sk)
+ return true;
+
+ return false;
+}
+
+/*
+ * Support routine. Sends outgoing frames to any network
+ * taps currently in use.
+ */
+
+void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct packet_type *ptype;
+ struct sk_buff *skb2 = NULL;
+ struct packet_type *pt_prev = NULL;
+ struct list_head *ptype_list = &ptype_all;
+
+ rcu_read_lock();
+again:
+ list_for_each_entry_rcu(ptype, ptype_list, list) {
+ /* Never send packets back to the socket
+ * they originated from - MvS (miquels@drinkel.ow.org)
+ */
+ if (skb_loop_sk(ptype, skb))
+ continue;
+
+ if (pt_prev) {
+ deliver_skb(skb2, pt_prev, skb->dev);
+ pt_prev = ptype;
+ continue;
+ }
+
+ /* need to clone skb, done only once */
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (!skb2)
+ goto out_unlock;
+
+ net_timestamp_set(skb2);
+
+ /* skb->nh should be correctly
+ * set by sender, so that the second statement is
+ * just protection against buggy protocols.
+ */
+ skb_reset_mac_header(skb2);
+
+ if (skb_network_header(skb2) < skb2->data ||
+ skb_network_header(skb2) > skb_tail_pointer(skb2)) {
+ net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
+ ntohs(skb2->protocol),
+ dev->name);
+ skb_reset_network_header(skb2);
+ }
+
+ skb2->transport_header = skb2->network_header;
+ skb2->pkt_type = PACKET_OUTGOING;
+ pt_prev = ptype;
+ }
+
+ if (ptype_list == &ptype_all) {
+ ptype_list = &dev->ptype_all;
+ goto again;
+ }
+out_unlock:
+ if (pt_prev) {
+ if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
+ pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
+ else
+ kfree_skb(skb2);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
+
+/**
+ * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
+ * @dev: Network device
+ * @txq: number of queues available
+ *
+ * If real_num_tx_queues is changed the tc mappings may no longer be
+ * valid. To resolve this verify the tc mapping remains valid and if
+ * not, NULL the mapping. With no priorities mapping to this
+ * offset/count pair it will no longer be used. In the worst case, if TC0
+ * is invalid nothing can be done, so priority mappings are disabled. It is
+ * expected that drivers will fix this mapping if they can before
+ * calling netif_set_real_num_tx_queues.
+ */
+static void netif_setup_tc(struct net_device *dev, unsigned int txq)
+{
+ int i;
+ struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
+
+ /* If TC0 is invalidated disable TC mapping */
+ if (tc->offset + tc->count > txq) {
+ pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
+ dev->num_tc = 0;
+ return;
+ }
+
+ /* Invalidated prio to tc mappings set to TC0 */
+ for (i = 1; i < TC_BITMASK + 1; i++) {
+ int q = netdev_get_prio_tc_map(dev, i);
+
+ tc = &dev->tc_to_txq[q];
+ if (tc->offset + tc->count > txq) {
+ pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
+ i, q);
+ netdev_set_prio_tc_map(dev, i, 0);
+ }
+ }
+}
+
+int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
+{
+ if (dev->num_tc) {
+ struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
+ int i;
+
+ /* walk through the TCs and see if it falls into any of them */
+ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
+ if ((txq - tc->offset) < tc->count)
+ return i;
+ }
+
+ /* didn't find it, just return -1 to indicate no match */
+ return -1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(netdev_txq_to_tc);
+
+#ifdef CONFIG_XPS
+struct static_key xps_needed __read_mostly;
+EXPORT_SYMBOL(xps_needed);
+struct static_key xps_rxqs_needed __read_mostly;
+EXPORT_SYMBOL(xps_rxqs_needed);
+static DEFINE_MUTEX(xps_map_mutex);
+#define xmap_dereference(P) \
+ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
+
+static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
+ int tci, u16 index)
+{
+ struct xps_map *map = NULL;
+ int pos;
+
+ if (dev_maps)
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ if (!map)
+ return false;
+
+ for (pos = map->len; pos--;) {
+ if (map->queues[pos] != index)
+ continue;
+
+ if (map->len > 1) {
+ map->queues[pos] = map->queues[--map->len];
+ break;
+ }
+
+ RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
+ kfree_rcu(map, rcu);
+ return false;
+ }
+
+ return true;
+}
+
+static bool remove_xps_queue_cpu(struct net_device *dev,
+ struct xps_dev_maps *dev_maps,
+ int cpu, u16 offset, u16 count)
+{
+ int num_tc = dev->num_tc ? : 1;
+ bool active = false;
+ int tci;
+
+ for (tci = cpu * num_tc; num_tc--; tci++) {
+ int i, j;
+
+ for (i = count, j = offset; i--; j++) {
+ if (!remove_xps_queue(dev_maps, tci, j))
+ break;
+ }
+
+ active |= i < 0;
+ }
+
+ return active;
+}
+
+static void reset_xps_maps(struct net_device *dev,
+ struct xps_dev_maps *dev_maps,
+ bool is_rxqs_map)
+{
+ if (is_rxqs_map) {
+ static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
+ RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
+ } else {
+ RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
+ }
+ static_key_slow_dec_cpuslocked(&xps_needed);
+ kfree_rcu(dev_maps, rcu);
+}
+
+static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
+ struct xps_dev_maps *dev_maps, unsigned int nr_ids,
+ u16 offset, u16 count, bool is_rxqs_map)
+{
+ bool active = false;
+ int i, j;
+
+ for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
+ j < nr_ids;)
+ active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
+ count);
+ if (!active)
+ reset_xps_maps(dev, dev_maps, is_rxqs_map);
+
+ if (!is_rxqs_map) {
+ for (i = offset + (count - 1); count--; i--) {
+ netdev_queue_numa_node_write(
+ netdev_get_tx_queue(dev, i),
+ NUMA_NO_NODE);
+ }
+ }
+}
+
+static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
+ u16 count)
+{
+ const unsigned long *possible_mask = NULL;
+ struct xps_dev_maps *dev_maps;
+ unsigned int nr_ids;
+
+ if (!static_key_false(&xps_needed))
+ return;
+
+ cpus_read_lock();
+ mutex_lock(&xps_map_mutex);
+
+ if (static_key_false(&xps_rxqs_needed)) {
+ dev_maps = xmap_dereference(dev->xps_rxqs_map);
+ if (dev_maps) {
+ nr_ids = dev->num_rx_queues;
+ clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
+ offset, count, true);
+ }
+ }
+
+ dev_maps = xmap_dereference(dev->xps_cpus_map);
+ if (!dev_maps)
+ goto out_no_maps;
+
+ if (num_possible_cpus() > 1)
+ possible_mask = cpumask_bits(cpu_possible_mask);
+ nr_ids = nr_cpu_ids;
+ clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
+ false);
+
+out_no_maps:
+ mutex_unlock(&xps_map_mutex);
+ cpus_read_unlock();
+}
+
+static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
+{
+ netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
+}
+
+static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
+ u16 index, bool is_rxqs_map)
+{
+ struct xps_map *new_map;
+ int alloc_len = XPS_MIN_MAP_ALLOC;
+ int i, pos;
+
+ for (pos = 0; map && pos < map->len; pos++) {
+ if (map->queues[pos] != index)
+ continue;
+ return map;
+ }
+
+ /* Need to add tx-queue to this CPU's/rx-queue's existing map */
+ if (map) {
+ if (pos < map->alloc_len)
+ return map;
+
+ alloc_len = map->alloc_len * 2;
+ }
+
+ /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
+ * map
+ */
+ if (is_rxqs_map)
+ new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
+ else
+ new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
+ cpu_to_node(attr_index));
+ if (!new_map)
+ return NULL;
+
+ for (i = 0; i < pos; i++)
+ new_map->queues[i] = map->queues[i];
+ new_map->alloc_len = alloc_len;
+ new_map->len = pos;
+
+ return new_map;
+}
+
+/* Must be called under cpus_read_lock */
+int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
+ u16 index, bool is_rxqs_map)
+{
+ const unsigned long *online_mask = NULL, *possible_mask = NULL;
+ struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
+ int i, j, tci, numa_node_id = -2;
+ int maps_sz, num_tc = 1, tc = 0;
+ struct xps_map *map, *new_map;
+ bool active = false;
+ unsigned int nr_ids;
+
+ if (dev->num_tc) {
+ /* Do not allow XPS on subordinate device directly */
+ num_tc = dev->num_tc;
+ if (num_tc < 0)
+ return -EINVAL;
+
+ /* If queue belongs to subordinate dev use its map */
+ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
+ tc = netdev_txq_to_tc(dev, index);
+ if (tc < 0)
+ return -EINVAL;
+ }
+
+ mutex_lock(&xps_map_mutex);
+ if (is_rxqs_map) {
+ maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
+ dev_maps = xmap_dereference(dev->xps_rxqs_map);
+ nr_ids = dev->num_rx_queues;
+ } else {
+ maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
+ if (num_possible_cpus() > 1) {
+ online_mask = cpumask_bits(cpu_online_mask);
+ possible_mask = cpumask_bits(cpu_possible_mask);
+ }
+ dev_maps = xmap_dereference(dev->xps_cpus_map);
+ nr_ids = nr_cpu_ids;
+ }
+
+ if (maps_sz < L1_CACHE_BYTES)
+ maps_sz = L1_CACHE_BYTES;
+
+ /* allocate memory for queue storage */
+ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
+ j < nr_ids;) {
+ if (!new_dev_maps)
+ new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
+ if (!new_dev_maps) {
+ mutex_unlock(&xps_map_mutex);
+ return -ENOMEM;
+ }
+
+ tci = j * num_tc + tc;
+ map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
+ NULL;
+
+ map = expand_xps_map(map, j, index, is_rxqs_map);
+ if (!map)
+ goto error;
+
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
+ }
+
+ if (!new_dev_maps)
+ goto out_no_new_maps;
+
+ if (!dev_maps) {
+ /* Increment static keys at most once per type */
+ static_key_slow_inc_cpuslocked(&xps_needed);
+ if (is_rxqs_map)
+ static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
+ }
+
+ for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+ j < nr_ids;) {
+ /* copy maps belonging to foreign traffic classes */
+ for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
+ }
+
+ /* We need to explicitly update tci as the previous loop
+ * could break out early if dev_maps is NULL.
+ */
+ tci = j * num_tc + tc;
+
+ if (netif_attr_test_mask(j, mask, nr_ids) &&
+ netif_attr_test_online(j, online_mask, nr_ids)) {
+ /* add tx-queue to CPU/rx-queue maps */
+ int pos = 0;
+
+ map = xmap_dereference(new_dev_maps->attr_map[tci]);
+ while ((pos < map->len) && (map->queues[pos] != index))
+ pos++;
+
+ if (pos == map->len)
+ map->queues[map->len++] = index;
+#ifdef CONFIG_NUMA
+ if (!is_rxqs_map) {
+ if (numa_node_id == -2)
+ numa_node_id = cpu_to_node(j);
+ else if (numa_node_id != cpu_to_node(j))
+ numa_node_id = -1;
+ }
+#endif
+ } else if (dev_maps) {
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
+ }
+
+ /* copy maps belonging to foreign traffic classes */
+ for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
+ }
+ }
+
+ if (is_rxqs_map)
+ rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
+ else
+ rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
+
+ /* Cleanup old maps */
+ if (!dev_maps)
+ goto out_no_old_maps;
+
+ for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+ j < nr_ids;) {
+ for (i = num_tc, tci = j * num_tc; i--; tci++) {
+ new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ if (map && map != new_map)
+ kfree_rcu(map, rcu);
+ }
+ }
+
+ kfree_rcu(dev_maps, rcu);
+
+out_no_old_maps:
+ dev_maps = new_dev_maps;
+ active = true;
+
+out_no_new_maps:
+ if (!is_rxqs_map) {
+ /* update Tx queue numa node */
+ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
+ (numa_node_id >= 0) ?
+ numa_node_id : NUMA_NO_NODE);
+ }
+
+ if (!dev_maps)
+ goto out_no_maps;
+
+ /* removes tx-queue from unused CPUs/rx-queues */
+ for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+ j < nr_ids;) {
+ for (i = tc, tci = j * num_tc; i--; tci++)
+ active |= remove_xps_queue(dev_maps, tci, index);
+ if (!netif_attr_test_mask(j, mask, nr_ids) ||
+ !netif_attr_test_online(j, online_mask, nr_ids))
+ active |= remove_xps_queue(dev_maps, tci, index);
+ for (i = num_tc - tc, tci++; --i; tci++)
+ active |= remove_xps_queue(dev_maps, tci, index);
+ }
+
+ /* free map if not active */
+ if (!active)
+ reset_xps_maps(dev, dev_maps, is_rxqs_map);
+
+out_no_maps:
+ mutex_unlock(&xps_map_mutex);
+
+ return 0;
+error:
+ /* remove any maps that we added */
+ for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+ j < nr_ids;) {
+ for (i = num_tc, tci = j * num_tc; i--; tci++) {
+ new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+ map = dev_maps ?
+ xmap_dereference(dev_maps->attr_map[tci]) :
+ NULL;
+ if (new_map && new_map != map)
+ kfree(new_map);
+ }
+ }
+
+ mutex_unlock(&xps_map_mutex);
+
+ kfree(new_dev_maps);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
+
+int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
+ u16 index)
+{
+ int ret;
+
+ cpus_read_lock();
+ ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
+ cpus_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(netif_set_xps_queue);
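+
+/* Hypothetical usage sketch (illustrative only): pinning transmissions on
+ * Tx queue 0 to CPUs 0-1 via XPS, as a driver or sysfs store handler might.
+ * The helper and CPU choice are assumptions.
+ */
+static inline int example_pin_txq0_to_cpus(struct net_device *dev)
+{
+	cpumask_var_t mask;
+	int err;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	cpumask_set_cpu(0, mask);
+	cpumask_set_cpu(1, mask);
+
+	/* Takes cpus_read_lock() internally and rebuilds dev->xps_cpus_map */
+	err = netif_set_xps_queue(dev, mask, 0);
+
+	free_cpumask_var(mask);
+	return err;
+}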
+
+#endif
+static void netdev_unbind_all_sb_channels(struct net_device *dev)
+{
+ struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
+
+ /* Unbind any subordinate channels */
+ while (txq-- != &dev->_tx[0]) {
+ if (txq->sb_dev)
+ netdev_unbind_sb_channel(dev, txq->sb_dev);
+ }
+}
+
+void netdev_reset_tc(struct net_device *dev)
+{
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
+ netdev_unbind_all_sb_channels(dev);
+
+ /* Reset TC configuration of device */
+ dev->num_tc = 0;
+ memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
+ memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
+}
+EXPORT_SYMBOL(netdev_reset_tc);
+
+int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
+{
+ if (tc >= dev->num_tc)
+ return -EINVAL;
+
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues(dev, offset, count);
+#endif
+ dev->tc_to_txq[tc].count = count;
+ dev->tc_to_txq[tc].offset = offset;
+ return 0;
+}
+EXPORT_SYMBOL(netdev_set_tc_queue);
+
+int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
+{
+ if (num_tc > TC_MAX_QUEUE)
+ return -EINVAL;
+
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
+ netdev_unbind_all_sb_channels(dev);
+
+ dev->num_tc = num_tc;
+ return 0;
+}
+EXPORT_SYMBOL(netdev_set_num_tc);
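+
+/* Hypothetical usage sketch (illustrative only): carving 8 Tx queues into
+ * two traffic classes of 4 queues each, the way a driver's mqprio/DCB setup
+ * path might. The helper and queue layout are assumptions.
+ */
+static inline int example_setup_two_tcs(struct net_device *dev)
+{
+	int err;
+
+	err = netdev_set_num_tc(dev, 2);
+	if (err)
+		return err;
+
+	/* TC0 -> queues 0-3, TC1 -> queues 4-7 */
+	err = netdev_set_tc_queue(dev, 0, 4, 0);
+	if (!err)
+		err = netdev_set_tc_queue(dev, 1, 4, 4);
+	if (err)
+		netdev_reset_tc(dev);
+	return err;
+}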
+
+void netdev_unbind_sb_channel(struct net_device *dev,
+ struct net_device *sb_dev)
+{
+ struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
+
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(sb_dev, 0);
+#endif
+ memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
+ memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
+
+ while (txq-- != &dev->_tx[0]) {
+ if (txq->sb_dev == sb_dev)
+ txq->sb_dev = NULL;
+ }
+}
+EXPORT_SYMBOL(netdev_unbind_sb_channel);
+
+int netdev_bind_sb_channel_queue(struct net_device *dev,
+ struct net_device *sb_dev,
+ u8 tc, u16 count, u16 offset)
+{
+ /* Make certain the sb_dev and dev are already configured */
+ if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
+ return -EINVAL;
+
+ /* We cannot hand out queues we don't have */
+ if ((offset + count) > dev->real_num_tx_queues)
+ return -EINVAL;
+
+ /* Record the mapping */
+ sb_dev->tc_to_txq[tc].count = count;
+ sb_dev->tc_to_txq[tc].offset = offset;
+
+ /* Provide a way for Tx queue to find the tc_to_txq map or
+ * XPS map for itself.
+ */
+ while (count--)
+ netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
+
+ return 0;
+}
+EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
+
+int netdev_set_sb_channel(struct net_device *dev, u16 channel)
+{
+ /* Do not use a multiqueue device to represent a subordinate channel */
+ if (netif_is_multiqueue(dev))
+ return -ENODEV;
+
+ /* We allow channels 1 - 32767 to be used for subordinate channels.
+ * Channel 0 is meant to be "native" mode and used only to represent
+ * the main root device. We allow writing 0 to reset the device back
+ * to normal mode after being used as a subordinate channel.
+ */
+ if (channel > S16_MAX)
+ return -EINVAL;
+
+ dev->num_tc = -channel;
+
+ return 0;
+}
+EXPORT_SYMBOL(netdev_set_sb_channel);
+
+/*
+ * Routine to help set real_num_tx_queues. To avoid leaving skbs mapped to
+ * queues greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
+ */
+int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
+{
+ bool disabling;
+ int rc;
+
+ disabling = txq < dev->real_num_tx_queues;
+
+ if (txq < 1 || txq > dev->num_tx_queues)
+ return -EINVAL;
+
+ if (dev->reg_state == NETREG_REGISTERED ||
+ dev->reg_state == NETREG_UNREGISTERING) {
+ ASSERT_RTNL();
+
+ rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
+ txq);
+ if (rc)
+ return rc;
+
+ if (dev->num_tc)
+ netif_setup_tc(dev, txq);
+
+ dev_qdisc_change_real_num_tx(dev, txq);
+
+ dev->real_num_tx_queues = txq;
+
+ if (disabling) {
+ synchronize_net();
+ qdisc_reset_all_tx_gt(dev, txq);
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(dev, txq);
+#endif
+ }
+ } else {
+ dev->real_num_tx_queues = txq;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(netif_set_real_num_tx_queues);
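+
+/* Hypothetical usage sketch (illustrative only): shrinking the set of active
+ * Tx queues at runtime, e.g. after an ethtool channel change. Must run under
+ * rtnl_lock() once the device is registered; the helper is an assumption.
+ */
+static inline int example_shrink_tx_queues(struct net_device *dev,
+					    unsigned int new_txq)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	/* Stale skbs on removed queues are flushed and XPS maps trimmed */
+	err = netif_set_real_num_tx_queues(dev, new_txq);
+	if (err)
+		netdev_warn(dev, "could not set %u tx queues: %d\n",
+			    new_txq, err);
+	return err;
+}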
+
+#ifdef CONFIG_SYSFS
+/**
+ * netif_set_real_num_rx_queues - set actual number of RX queues used
+ * @dev: Network device
+ * @rxq: Actual number of RX queues
+ *
+ * This must be called either with the rtnl_lock held or before
+ * registration of the net device. Returns 0 on success, or a
+ * negative error code. If called before registration, it always
+ * succeeds.
+ */
+int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
+{
+ int rc;
+
+ if (rxq < 1 || rxq > dev->num_rx_queues)
+ return -EINVAL;
+
+ if (dev->reg_state == NETREG_REGISTERED) {
+ ASSERT_RTNL();
+
+ rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
+ rxq);
+ if (rc)
+ return rc;
+ }
+
+ dev->real_num_rx_queues = rxq;
+ return 0;
+}
+EXPORT_SYMBOL(netif_set_real_num_rx_queues);
+#endif
+
+/**
+ * netif_get_num_default_rss_queues - default number of RSS queues
+ *
+ * This routine should set an upper limit on the number of RSS queues
+ * used by default by multiqueue devices.
+ */
+int netif_get_num_default_rss_queues(void)
+{
+ return is_kdump_kernel() ?
+ 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
+}
+EXPORT_SYMBOL(netif_get_num_default_rss_queues);
+
+static void __netif_reschedule(struct Qdisc *q)
+{
+ struct softnet_data *sd;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ sd = this_cpu_ptr(&softnet_data);
+ q->next_sched = NULL;
+ *sd->output_queue_tailp = q;
+ sd->output_queue_tailp = &q->next_sched;
+ raise_softirq_irqoff(NET_TX_SOFTIRQ);
+ local_irq_restore(flags);
+}
+
+void __netif_schedule(struct Qdisc *q)
+{
+ if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
+ __netif_reschedule(q);
+}
+EXPORT_SYMBOL(__netif_schedule);
+
+struct dev_kfree_skb_cb {
+ enum skb_free_reason reason;
+};
+
+static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
+{
+ return (struct dev_kfree_skb_cb *)skb->cb;
+}
+
+void netif_schedule_queue(struct netdev_queue *txq)
+{
+ rcu_read_lock();
+ if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
+ struct Qdisc *q = rcu_dereference(txq->qdisc);
+
+ __netif_schedule(q);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(netif_schedule_queue);
+
+void netif_tx_wake_queue(struct netdev_queue *dev_queue)
+{
+ if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
+ struct Qdisc *q;
+
+ rcu_read_lock();
+ q = rcu_dereference(dev_queue->qdisc);
+ __netif_schedule(q);
+ rcu_read_unlock();
+ }
+}
+EXPORT_SYMBOL(netif_tx_wake_queue);
+
+void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
+{
+ unsigned long flags;
+
+ if (unlikely(!skb))
+ return;
+
+ if (likely(refcount_read(&skb->users) == 1)) {
+ smp_rmb();
+ refcount_set(&skb->users, 0);
+ } else if (likely(!refcount_dec_and_test(&skb->users))) {
+ return;
+ }
+ get_kfree_skb_cb(skb)->reason = reason;
+ local_irq_save(flags);
+ skb->next = __this_cpu_read(softnet_data.completion_queue);
+ __this_cpu_write(softnet_data.completion_queue, skb);
+ raise_softirq_irqoff(NET_TX_SOFTIRQ);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(__dev_kfree_skb_irq);
+
+void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
+{
+ if (in_irq() || irqs_disabled())
+ __dev_kfree_skb_irq(skb, reason);
+ else
+ dev_kfree_skb(skb);
+}
+EXPORT_SYMBOL(__dev_kfree_skb_any);
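+
+/* Hypothetical usage sketch (illustrative only): dropping vs. consuming an
+ * skb from code that may run in hardirq context, such as a Tx completion
+ * handler. In practice drivers use the dev_kfree_skb_any() and
+ * dev_consume_skb_any() wrappers; the helper here is an assumption.
+ */
+static inline void example_complete_tx_skb(struct sk_buff *skb, bool sent_ok)
+{
+	if (sent_ok)
+		__dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
+	else
+		__dev_kfree_skb_any(skb, SKB_REASON_DROPPED);
+}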
+
+
+/**
+ * netif_device_detach - mark device as removed
+ * @dev: network device
+ *
+ * Mark device as removed from system and therefore no longer available.
+ */
+void netif_device_detach(struct net_device *dev)
+{
+ if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
+ netif_running(dev)) {
+ netif_tx_stop_all_queues(dev);
+ }
+}
+EXPORT_SYMBOL(netif_device_detach);
+
+/**
+ * netif_device_attach - mark device as attached
+ * @dev: network device
+ *
+ * Mark device as attached from system and restart if needed.
+ */
+void netif_device_attach(struct net_device *dev)
+{
+ if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
+ netif_running(dev)) {
+ netif_tx_wake_all_queues(dev);
+ __netdev_watchdog_up(dev);
+ }
+}
+EXPORT_SYMBOL(netif_device_attach);
+
+/*
+ * Returns a Tx hash based on the given packet descriptor and the number
+ * of Tx queues to be used as a distribution range.
+ */
+static u16 skb_tx_hash(const struct net_device *dev,
+ const struct net_device *sb_dev,
+ struct sk_buff *skb)
+{
+ u32 hash;
+ u16 qoffset = 0;
+ u16 qcount = dev->real_num_tx_queues;
+
+ if (dev->num_tc) {
+ u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+
+ qoffset = sb_dev->tc_to_txq[tc].offset;
+ qcount = sb_dev->tc_to_txq[tc].count;
+ if (unlikely(!qcount)) {
+ net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
+ sb_dev->name, qoffset, tc);
+ qoffset = 0;
+ qcount = dev->real_num_tx_queues;
+ }
+ }
+
+ if (skb_rx_queue_recorded(skb)) {
+ hash = skb_get_rx_queue(skb);
+ if (hash >= qoffset)
+ hash -= qoffset;
+ while (unlikely(hash >= qcount))
+ hash -= qcount;
+ return hash + qoffset;
+ }
+
+ return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
+}
+
+static void skb_warn_bad_offload(const struct sk_buff *skb)
+{
+ static const netdev_features_t null_features;
+ struct net_device *dev = skb->dev;
+ const char *name = "";
+
+ if (!net_ratelimit())
+ return;
+
+ if (dev) {
+ if (dev->dev.parent)
+ name = dev_driver_string(dev->dev.parent);
+ else
+ name = netdev_name(dev);
+ }
+ WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
+ "gso_type=%d ip_summed=%d\n",
+ name, dev ? &dev->features : &null_features,
+ skb->sk ? &skb->sk->sk_route_caps : &null_features,
+ skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
+ skb_shinfo(skb)->gso_type, skb->ip_summed);
+}
+
+/*
+ * Invalidate hardware checksum when packet is to be mangled, and
+ * complete checksum manually on outgoing path.
+ */
+int skb_checksum_help(struct sk_buff *skb)
+{
+ __wsum csum;
+ int ret = 0, offset;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ goto out_set_summed;
+
+ if (unlikely(skb_shinfo(skb)->gso_size)) {
+ skb_warn_bad_offload(skb);
+ return -EINVAL;
+ }
+
+ /* Before computing a checksum, we should make sure no frag could
+ * be modified by an external entity: the checksum could be wrong.
+ */
+ if (skb_has_shared_frag(skb)) {
+ ret = __skb_linearize(skb);
+ if (ret)
+ goto out;
+ }
+
+ offset = skb_checksum_start_offset(skb);
+ BUG_ON(offset >= skb_headlen(skb));
+ csum = skb_checksum(skb, offset, skb->len - offset, 0);
+
+ offset += skb->csum_offset;
+ BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
+
+ if (skb_cloned(skb) &&
+ !skb_clone_writable(skb, offset + sizeof(__sum16))) {
+ ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+ if (ret)
+ goto out;
+ }
+
+ *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
+out_set_summed:
+ skb->ip_summed = CHECKSUM_NONE;
+out:
+ return ret;
+}
+EXPORT_SYMBOL(skb_checksum_help);
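+
+/* Hypothetical usage sketch (illustrative only): the classic driver-side
+ * fallback, resolving a CHECKSUM_PARTIAL skb in software when the hardware
+ * cannot offload this particular protocol. The helper is an assumption.
+ */
+static inline int example_tx_csum_fallback(struct sk_buff *skb,
+					    bool hw_can_csum)
+{
+	if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_csum)
+		return skb_checksum_help(skb);	/* fills in the checksum field */
+	return 0;
+}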
+
+int skb_crc32c_csum_help(struct sk_buff *skb)
+{
+ __le32 crc32c_csum;
+ int ret = 0, offset, start;
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ goto out;
+
+ if (unlikely(skb_is_gso(skb)))
+ goto out;
+
+ /* Before computing a checksum, we should make sure no frag could
+ * be modified by an external entity: the checksum could be wrong.
+ */
+ if (unlikely(skb_has_shared_frag(skb))) {
+ ret = __skb_linearize(skb);
+ if (ret)
+ goto out;
+ }
+ start = skb_checksum_start_offset(skb);
+ offset = start + offsetof(struct sctphdr, checksum);
+ if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (skb_cloned(skb) &&
+ !skb_clone_writable(skb, offset + sizeof(__le32))) {
+ ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+ if (ret)
+ goto out;
+ }
+ crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
+ skb->len - start, ~(__u32)0,
+ crc32c_csum_stub));
+ *(__le32 *)(skb->data + offset) = crc32c_csum;
+ skb->ip_summed = CHECKSUM_NONE;
+ skb->csum_not_inet = 0;
+out:
+ return ret;
+}
+
+__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
+{
+ __be16 type = skb->protocol;
+
+ /* Tunnel gso handlers can set protocol to ethernet. */
+ if (type == htons(ETH_P_TEB)) {
+ struct ethhdr *eth;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
+ return 0;
+
+ eth = (struct ethhdr *)skb->data;
+ type = eth->h_proto;
+ }
+
+ return __vlan_get_protocol(skb, type, depth);
+}
+
+/**
+ * skb_mac_gso_segment - mac layer segmentation handler.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ */
+struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+ struct packet_offload *ptype;
+ int vlan_depth = skb->mac_len;
+ __be16 type = skb_network_protocol(skb, &vlan_depth);
+
+ if (unlikely(!type))
+ return ERR_PTR(-EINVAL);
+
+ __skb_pull(skb, vlan_depth);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, &offload_base, list) {
+ if (ptype->type == type && ptype->callbacks.gso_segment) {
+ segs = ptype->callbacks.gso_segment(skb, features);
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ __skb_push(skb, skb->data - skb_mac_header(skb));
+
+ return segs;
+}
+EXPORT_SYMBOL(skb_mac_gso_segment);
+
+
+/* openvswitch calls this on rx path, so we need a different check.
+ */
+static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
+{
+ if (tx_path)
+ return skb->ip_summed != CHECKSUM_PARTIAL &&
+ skb->ip_summed != CHECKSUM_UNNECESSARY;
+
+ return skb->ip_summed == CHECKSUM_NONE;
+}
+
+/**
+ * __skb_gso_segment - Perform segmentation on skb.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ * @tx_path: whether it is called in TX path
+ *
+ * This function segments the given skb and returns a list of segments.
+ *
+ * It may return NULL if the skb requires no segmentation. This is
+ * only possible when GSO is used for verifying header integrity.
+ *
+ * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
+ */
+struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
+ netdev_features_t features, bool tx_path)
+{
+ struct sk_buff *segs;
+
+ if (unlikely(skb_needs_check(skb, tx_path))) {
+ int err;
+
+ /* We're going to init ->check field in TCP or UDP header */
+ err = skb_cow_head(skb, 0);
+ if (err < 0)
+ return ERR_PTR(err);
+ }
+
+ /* Only report GSO partial support if it will enable us to
+ * support segmentation on this frame without needing additional
+ * work.
+ */
+ if (features & NETIF_F_GSO_PARTIAL) {
+ netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
+ struct net_device *dev = skb->dev;
+
+ partial_features |= dev->features & dev->gso_partial_features;
+ if (!skb_gso_ok(skb, features | partial_features))
+ features &= ~NETIF_F_GSO_PARTIAL;
+ }
+
+ BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
+ sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
+
+ SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
+ SKB_GSO_CB(skb)->encap_level = 0;
+
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ segs = skb_mac_gso_segment(skb, features);
+
+ if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
+ skb_warn_bad_offload(skb);
+
+ return segs;
+}
+EXPORT_SYMBOL(__skb_gso_segment);
+
+/* Take action when hardware reception checksum errors are detected. */
+#ifdef CONFIG_BUG
+void netdev_rx_csum_fault(struct net_device *dev)
+{
+ if (net_ratelimit()) {
+ pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
+ dump_stack();
+ }
+}
+EXPORT_SYMBOL(netdev_rx_csum_fault);
+#endif
+
+/* XXX: check that highmem exists at all on the given machine. */
+static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_HIGHMEM
+ int i;
+
+ if (!(dev->features & NETIF_F_HIGHDMA)) {
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ if (PageHighMem(skb_frag_page(frag)))
+ return 1;
+ }
+ }
+#endif
+ return 0;
+}
+
+/* If MPLS offload request, verify we are testing hardware MPLS features
+ * instead of standard features for the netdev.
+ */
+#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
+static netdev_features_t net_mpls_features(struct sk_buff *skb,
+ netdev_features_t features,
+ __be16 type)
+{
+ if (eth_p_mpls(type))
+ features &= skb->dev->mpls_features;
+
+ return features;
+}
+#else
+static netdev_features_t net_mpls_features(struct sk_buff *skb,
+ netdev_features_t features,
+ __be16 type)
+{
+ return features;
+}
+#endif
+
+static netdev_features_t harmonize_features(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ int tmp;
+ __be16 type;
+
+ type = skb_network_protocol(skb, &tmp);
+ features = net_mpls_features(skb, features, type);
+
+ if (skb->ip_summed != CHECKSUM_NONE &&
+ !can_checksum_protocol(features, type)) {
+ features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
+ }
+ if (illegal_highdma(skb->dev, skb))
+ features &= ~NETIF_F_SG;
+
+ return features;
+}
+
+netdev_features_t passthru_features_check(struct sk_buff *skb,
+ struct net_device *dev,
+ netdev_features_t features)
+{
+ return features;
+}
+EXPORT_SYMBOL(passthru_features_check);
+
+static netdev_features_t dflt_features_check(struct sk_buff *skb,
+ struct net_device *dev,
+ netdev_features_t features)
+{
+ return vlan_features_check(skb, features);
+}
+
+static netdev_features_t gso_features_check(const struct sk_buff *skb,
+ struct net_device *dev,
+ netdev_features_t features)
+{
+ u16 gso_segs = skb_shinfo(skb)->gso_segs;
+
+ if (gso_segs > dev->gso_max_segs)
+ return features & ~NETIF_F_GSO_MASK;
+
+ /* Support for GSO partial features requires software
+ * intervention before we can actually process the packets
+ * so we need to strip support for any partial features now
+ * and we can pull them back in after we have partially
+ * segmented the frame.
+ */
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
+ features &= ~dev->gso_partial_features;
+
+ /* Make sure to clear the IPv4 ID mangling feature if the
+ * IPv4 header has the potential to be fragmented.
+ */
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
+ struct iphdr *iph = skb->encapsulation ?
+ inner_ip_hdr(skb) : ip_hdr(skb);
+
+ if (!(iph->frag_off & htons(IP_DF)))
+ features &= ~NETIF_F_TSO_MANGLEID;
+ }
+
+ return features;
+}
+
+netdev_features_t netif_skb_features(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ netdev_features_t features = dev->features;
+
+ if (skb_is_gso(skb))
+ features = gso_features_check(skb, dev, features);
+
+ /* If encapsulation offload request, verify we are testing
+ * hardware encapsulation features instead of standard
+ * features for the netdev
+ */
+ if (skb->encapsulation)
+ features &= dev->hw_enc_features;
+
+ if (skb_vlan_tagged(skb))
+ features = netdev_intersect_features(features,
+ dev->vlan_features |
+ NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX);
+
+ if (dev->netdev_ops->ndo_features_check)
+ features &= dev->netdev_ops->ndo_features_check(skb, dev,
+ features);
+ else
+ features &= dflt_features_check(skb, dev, features);
+
+ return harmonize_features(skb, features);
+}
+EXPORT_SYMBOL(netif_skb_features);
+
+static int xmit_one(struct sk_buff *skb, struct net_device *dev,
+ struct netdev_queue *txq, bool more)
+{
+ unsigned int len;
+ int rc;
+
+ if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
+ dev_queue_xmit_nit(skb, dev);
+
+ len = skb->len;
+ trace_net_dev_start_xmit(skb, dev);
+ rc = netdev_start_xmit(skb, dev, txq, more);
+ trace_net_dev_xmit(skb, rc, dev, len);
+
+ return rc;
+}
+
+struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
+ struct netdev_queue *txq, int *ret)
+{
+ struct sk_buff *skb = first;
+ int rc = NETDEV_TX_OK;
+
+ while (skb) {
+ struct sk_buff *next = skb->next;
+
+ skb->next = NULL;
+ rc = xmit_one(skb, dev, txq, next != NULL);
+ if (unlikely(!dev_xmit_complete(rc))) {
+ skb->next = next;
+ goto out;
+ }
+
+ skb = next;
+ if (netif_tx_queue_stopped(txq) && skb) {
+ rc = NETDEV_TX_BUSY;
+ break;
+ }
+ }
+
+out:
+ *ret = rc;
+ return skb;
+}
+
+static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ if (skb_vlan_tag_present(skb) &&
+ !vlan_hw_offload_capable(features, skb->vlan_proto))
+ skb = __vlan_hwaccel_push_inside(skb);
+ return skb;
+}
+
+int skb_csum_hwoffload_help(struct sk_buff *skb,
+ const netdev_features_t features)
+{
+ if (unlikely(skb->csum_not_inet))
+ return !!(features & NETIF_F_SCTP_CRC) ? 0 :
+ skb_crc32c_csum_help(skb);
+
+ return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
+}
+EXPORT_SYMBOL(skb_csum_hwoffload_help);
+
+static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
+{
+ netdev_features_t features;
+
+ features = netif_skb_features(skb);
+ skb = validate_xmit_vlan(skb, features);
+ if (unlikely(!skb))
+ goto out_null;
+
+ skb = sk_validate_xmit_skb(skb, dev);
+ if (unlikely(!skb))
+ goto out_null;
+
+ if (netif_needs_gso(skb, features)) {
+ struct sk_buff *segs;
+
+ segs = skb_gso_segment(skb, features);
+ if (IS_ERR(segs)) {
+ goto out_kfree_skb;
+ } else if (segs) {
+ consume_skb(skb);
+ skb = segs;
+ }
+ } else {
+ if (skb_needs_linearize(skb, features) &&
+ __skb_linearize(skb))
+ goto out_kfree_skb;
+
+ /* If packet is not checksummed and device does not
+ * support checksumming for this protocol, complete
+ * checksumming here.
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ if (skb->encapsulation)
+ skb_set_inner_transport_header(skb,
+ skb_checksum_start_offset(skb));
+ else
+ skb_set_transport_header(skb,
+ skb_checksum_start_offset(skb));
+ if (skb_csum_hwoffload_help(skb, features))
+ goto out_kfree_skb;
+ }
+ }
+
+ skb = validate_xmit_xfrm(skb, features, again);
+
+ return skb;
+
+out_kfree_skb:
+ kfree_skb(skb);
+out_null:
+ atomic_long_inc(&dev->tx_dropped);
+ return NULL;
+}
+
+struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
+{
+ struct sk_buff *next, *head = NULL, *tail;
+
+ for (; skb != NULL; skb = next) {
+ next = skb->next;
+ skb->next = NULL;
+
+ /* in case the skb won't be segmented, point it at itself */
+ skb->prev = skb;
+
+ skb = validate_xmit_skb(skb, dev, again);
+ if (!skb)
+ continue;
+
+ if (!head)
+ head = skb;
+ else
+ tail->next = skb;
+ /* If skb was segmented, skb->prev points to
+ * the last segment. If not, it still contains skb.
+ */
+ tail = skb->prev;
+ }
+ return head;
+}
+EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
+
+static void qdisc_pkt_len_init(struct sk_buff *skb)
+{
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ qdisc_skb_cb(skb)->pkt_len = skb->len;
+
+ /* To get a more precise estimate of the bytes sent on the wire,
+ * we add the header size of every segment to pkt_len.
+ */
+ if (shinfo->gso_size) {
+ unsigned int hdr_len;
+ u16 gso_segs = shinfo->gso_segs;
+
+ /* mac layer + network layer */
+ hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+
+ /* + transport layer */
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
+ const struct tcphdr *th;
+ struct tcphdr _tcphdr;
+
+ th = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_tcphdr), &_tcphdr);
+ if (likely(th))
+ hdr_len += __tcp_hdrlen(th);
+ } else {
+ struct udphdr _udphdr;
+
+ if (skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_udphdr), &_udphdr))
+ hdr_len += sizeof(struct udphdr);
+ }
+
+ if (shinfo->gso_type & SKB_GSO_DODGY)
+ gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
+ shinfo->gso_size);
+
+ qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
+ }
+}
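/*
 * Editorial worked example for the calculation above: a TSO skb with
 * skb->len = 65226, 66 bytes of Ethernet + IPv4 + TCP headers and an
 * MSS of 1448 carries gso_segs = 45, so
 *
 *	pkt_len = 65226 + (45 - 1) * 66 = 68130
 *
 * i.e. the headers that the 44 extra segments will carry on the wire
 * are charged to the qdisc up front.
 */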
+
+static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
+ struct net_device *dev,
+ struct netdev_queue *txq)
+{
+ spinlock_t *root_lock = qdisc_lock(q);
+ struct sk_buff *to_free = NULL;
+ bool contended;
+ int rc;
+
+ qdisc_calculate_pkt_len(skb, q);
+
+ if (q->flags & TCQ_F_NOLOCK) {
+ if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
+ __qdisc_drop(skb, &to_free);
+ rc = NET_XMIT_DROP;
+ } else {
+ rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+ qdisc_run(q);
+ }
+
+ if (unlikely(to_free))
+ kfree_skb_list(to_free);
+ return rc;
+ }
+
+ /*
+ * Heuristic to force contended enqueues to serialize on a
+ * separate lock before trying to get the qdisc main lock.
+ * This permits the qdisc->running owner to get the lock more
+ * often and dequeue packets faster.
+ */
+ contended = qdisc_is_running(q);
+ if (unlikely(contended))
+ spin_lock(&q->busylock);
+
+ spin_lock(root_lock);
+ if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
+ __qdisc_drop(skb, &to_free);
+ rc = NET_XMIT_DROP;
+ } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
+ qdisc_run_begin(q)) {
+ /*
+ * This is a work-conserving queue; there are no old skbs
+ * waiting to be sent out; and the qdisc is not running -
+ * xmit the skb directly.
+ */
+
+ qdisc_bstats_update(q, skb);
+
+ if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
+ if (unlikely(contended)) {
+ spin_unlock(&q->busylock);
+ contended = false;
+ }
+ __qdisc_run(q);
+ }
+
+ qdisc_run_end(q);
+ rc = NET_XMIT_SUCCESS;
+ } else {
+ rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+ if (qdisc_run_begin(q)) {
+ if (unlikely(contended)) {
+ spin_unlock(&q->busylock);
+ contended = false;
+ }
+ __qdisc_run(q);
+ qdisc_run_end(q);
+ }
+ }
+ spin_unlock(root_lock);
+ if (unlikely(to_free))
+ kfree_skb_list(to_free);
+ if (unlikely(contended))
+ spin_unlock(&q->busylock);
+ return rc;
+}
+
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
+static void skb_update_prio(struct sk_buff *skb)
+{
+ const struct netprio_map *map;
+ const struct sock *sk;
+ unsigned int prioidx;
+
+ if (skb->priority)
+ return;
+ map = rcu_dereference_bh(skb->dev->priomap);
+ if (!map)
+ return;
+ sk = skb_to_full_sk(skb);
+ if (!sk)
+ return;
+
+ prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
+
+ if (prioidx < map->priomap_len)
+ skb->priority = map->priomap[prioidx];
+}
+#else
+#define skb_update_prio(skb)
+#endif
+
+/**
+ * dev_loopback_xmit - loop back @skb
+ * @net: network namespace this loopback is happening in
+ * @sk: the socket; present so this function can be used as a netfilter okfn
+ * @skb: buffer to transmit
+ */
+int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ skb_reset_mac_header(skb);
+ __skb_pull(skb, skb_network_offset(skb));
+ skb->pkt_type = PACKET_LOOPBACK;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ WARN_ON(!skb_dst(skb));
+ skb_dst_force(skb);
+ netif_rx_ni(skb);
+ return 0;
+}
+EXPORT_SYMBOL(dev_loopback_xmit);
+
+#ifdef CONFIG_NET_EGRESS
+static struct sk_buff *
+sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
+{
+ struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
+ struct tcf_result cl_res;
+
+ if (!miniq)
+ return skb;
+
+ /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
+ mini_qdisc_bstats_cpu_update(miniq, skb);
+
+ switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
+ case TC_ACT_OK:
+ case TC_ACT_RECLASSIFY:
+ skb->tc_index = TC_H_MIN(cl_res.classid);
+ break;
+ case TC_ACT_SHOT:
+ mini_qdisc_qstats_cpu_drop(miniq);
+ *ret = NET_XMIT_DROP;
+ kfree_skb(skb);
+ return NULL;
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ *ret = NET_XMIT_SUCCESS;
+ consume_skb(skb);
+ return NULL;
+ case TC_ACT_REDIRECT:
+ /* No need to push/pop skb's mac_header here on egress! */
+ skb_do_redirect(skb);
+ *ret = NET_XMIT_SUCCESS;
+ return NULL;
+ default:
+ break;
+ }
+
+ return skb;
+}
+#endif /* CONFIG_NET_EGRESS */
+
+#ifdef CONFIG_XPS
+static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
+ struct xps_dev_maps *dev_maps, unsigned int tci)
+{
+ struct xps_map *map;
+ int queue_index = -1;
+
+ if (dev->num_tc) {
+ tci *= dev->num_tc;
+ tci += netdev_get_prio_tc_map(dev, skb->priority);
+ }
+
+ map = rcu_dereference(dev_maps->attr_map[tci]);
+ if (map) {
+ if (map->len == 1)
+ queue_index = map->queues[0];
+ else
+ queue_index = map->queues[reciprocal_scale(
+ skb_get_hash(skb), map->len)];
+ if (unlikely(queue_index >= dev->real_num_tx_queues))
+ queue_index = -1;
+ }
+ return queue_index;
+}
+#endif
+
+static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
+ struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+ struct xps_dev_maps *dev_maps;
+ struct sock *sk = skb->sk;
+ int queue_index = -1;
+
+ if (!static_key_false(&xps_needed))
+ return -1;
+
+ rcu_read_lock();
+ if (!static_key_false(&xps_rxqs_needed))
+ goto get_cpus_map;
+
+ dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
+ if (dev_maps) {
+ int tci = sk_rx_queue_get(sk);
+
+ if (tci >= 0 && tci < dev->num_rx_queues)
+ queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
+ tci);
+ }
+
+get_cpus_map:
+ if (queue_index < 0) {
+ dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
+ if (dev_maps) {
+ unsigned int tci = skb->sender_cpu - 1;
+
+ queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
+ tci);
+ }
+ }
+ rcu_read_unlock();
+
+ return queue_index;
+#else
+ return -1;
+#endif
+}
+
+u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
+ struct net_device *sb_dev,
+ select_queue_fallback_t fallback)
+{
+ return 0;
+}
+EXPORT_SYMBOL(dev_pick_tx_zero);
+
+u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
+ struct net_device *sb_dev,
+ select_queue_fallback_t fallback)
+{
+ return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
+}
+EXPORT_SYMBOL(dev_pick_tx_cpu_id);
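/*
 * Editorial sketch, not part of this file: the two helpers above exist
 * so that simple multi-queue drivers can plug them straight into
 * .ndo_select_queue instead of writing their own policy.  The ops table
 * and demo_start_xmit() are hypothetical.
 */
static const struct net_device_ops demo_netdev_ops = {
	.ndo_start_xmit		= demo_start_xmit,
	/* spread transmits across queues by the submitting CPU's id */
	.ndo_select_queue	= dev_pick_tx_cpu_id,
};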
+
+static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
+ struct net_device *sb_dev)
+{
+ struct sock *sk = skb->sk;
+ int queue_index = sk_tx_queue_get(sk);
+
+ sb_dev = sb_dev ? : dev;
+
+ if (queue_index < 0 || skb->ooo_okay ||
+ queue_index >= dev->real_num_tx_queues) {
+ int new_index = get_xps_queue(dev, sb_dev, skb);
+
+ if (new_index < 0)
+ new_index = skb_tx_hash(dev, sb_dev, skb);
+
+ if (queue_index != new_index && sk &&
+ sk_fullsock(sk) &&
+ rcu_access_pointer(sk->sk_dst_cache))
+ sk_tx_queue_set(sk, new_index);
+
+ queue_index = new_index;
+ }
+
+ return queue_index;
+}
+
+struct netdev_queue *netdev_pick_tx(struct net_device *dev,
+ struct sk_buff *skb,
+ struct net_device *sb_dev)
+{
+ int queue_index = 0;
+
+#ifdef CONFIG_XPS
+ u32 sender_cpu = skb->sender_cpu - 1;
+
+ if (sender_cpu >= (u32)NR_CPUS)
+ skb->sender_cpu = raw_smp_processor_id() + 1;
+#endif
+
+ if (dev->real_num_tx_queues != 1) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_select_queue)
+ queue_index = ops->ndo_select_queue(dev, skb, sb_dev,
+ __netdev_pick_tx);
+ else
+ queue_index = __netdev_pick_tx(dev, skb, sb_dev);
+
+ queue_index = netdev_cap_txqueue(dev, queue_index);
+ }
+
+ skb_set_queue_mapping(skb, queue_index);
+ return netdev_get_tx_queue(dev, queue_index);
+}
+
+/**
+ * __dev_queue_xmit - transmit a buffer
+ * @skb: buffer to transmit
+ * @sb_dev: subordinate device used for L2 forwarding offload
+ *
+ * Queue a buffer for transmission to a network device. The caller must
+ * have set the device and priority and built the buffer before calling
+ * this function. The function can be called from an interrupt.
+ *
+ * A negative errno code is returned on a failure. A success does not
+ * guarantee the frame will be transmitted as it may be dropped due
+ * to congestion or traffic shaping.
+ *
+ * Note that this function can also return error codes from the queue
+ * disciplines, including NET_XMIT_DROP, which is a positive value; so
+ * errors can be positive as well as negative.
+ *
+ * Regardless of the return value, the skb is consumed, so it is currently
+ * difficult to retry a send to this method. (You can bump the ref count
+ * before sending to hold a reference for retry if you are careful.)
+ *
+ * When calling this method, interrupts MUST be enabled. This is because
+ * the BH enable code must have IRQs enabled so that it will not deadlock.
+ * --BLG
+ */
+static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
+{
+ struct net_device *dev = skb->dev;
+ struct netdev_queue *txq;
+ struct Qdisc *q;
+ int rc = -ENOMEM;
+ bool again = false;
+
+ skb_reset_mac_header(skb);
+
+ if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
+ __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
+
+ /* Disable soft irqs for various locks below. Also
+ * stops preemption for RCU.
+ */
+ rcu_read_lock_bh();
+
+ skb_update_prio(skb);
+
+ qdisc_pkt_len_init(skb);
+#ifdef CONFIG_NET_CLS_ACT
+ skb->tc_at_ingress = 0;
+# ifdef CONFIG_NET_EGRESS
+ if (static_branch_unlikely(&egress_needed_key)) {
+ skb = sch_handle_egress(skb, &rc, dev);
+ if (!skb)
+ goto out;
+ }
+# endif
+#endif
+ /* If the device/qdisc doesn't need skb->dst, release it right now while
+ * it's hot in this CPU's cache.
+ */
+ if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
+ skb_dst_drop(skb);
+ else
+ skb_dst_force(skb);
+
+ txq = netdev_pick_tx(dev, skb, sb_dev);
+ q = rcu_dereference_bh(txq->qdisc);
+
+ trace_net_dev_queue(skb);
+ if (q->enqueue) {
+ rc = __dev_xmit_skb(skb, q, dev, txq);
+ goto out;
+ }
+
+ /* The device has no queue. Common case for software devices:
+ * loopback, all sorts of tunnels...
+ *
+ * Really, it is unlikely that netif_tx_lock protection is necessary
+ * here. (f.e. loopback and IP tunnels are clean ignoring statistics
+ * counters.)
+ * However, it is possible that they rely on the protection
+ * made by us here.
+ *
+ * Check this and take the lock; it is not prone to deadlocks.
+ * Or just use the noqueue qdisc, which is even simpler 8)
+ */
+ if (dev->flags & IFF_UP) {
+ int cpu = smp_processor_id(); /* ok because BHs are off */
+
+ /* Other cpus might concurrently change txq->xmit_lock_owner
+ * to -1 or to their cpu id, but not to our id.
+ */
+ if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
+ if (dev_xmit_recursion())
+ goto recursion_alert;
+
+ skb = validate_xmit_skb(skb, dev, &again);
+ if (!skb)
+ goto out;
+
+ HARD_TX_LOCK(dev, txq, cpu);
+
+ if (!netif_xmit_stopped(txq)) {
+ dev_xmit_recursion_inc();
+ skb = dev_hard_start_xmit(skb, dev, txq, &rc);
+ dev_xmit_recursion_dec();
+ if (dev_xmit_complete(rc)) {
+ HARD_TX_UNLOCK(dev, txq);
+ goto out;
+ }
+ }
+ HARD_TX_UNLOCK(dev, txq);
+ net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
+ dev->name);
+ } else {
+ /* Recursion is detected! It is possible,
+ * unfortunately
+ */
+recursion_alert:
+ net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
+ dev->name);
+ }
+ }
+
+ rc = -ENETDOWN;
+ rcu_read_unlock_bh();
+
+ atomic_long_inc(&dev->tx_dropped);
+ kfree_skb_list(skb);
+ return rc;
+out:
+ rcu_read_unlock_bh();
+ return rc;
+}
+
+int dev_queue_xmit(struct sk_buff *skb)
+{
+ return __dev_queue_xmit(skb, NULL);
+}
+EXPORT_SYMBOL(dev_queue_xmit);
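/*
 * Editorial sketch, not part of this file: transmitting a raw frame
 * from a hypothetical kernel module.  As noted in the comment above
 * __dev_queue_xmit(), the skb is consumed by dev_queue_xmit() whatever
 * the outcome, so it is only freed here on the one error path that
 * never reaches it.  ETH_P_802_EX1 (local experimental ethertype) is
 * used purely as a placeholder protocol.
 */
static int demo_send_frame(struct net_device *dev, const void *payload,
			   unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb(dev, LL_RESERVED_SPACE(dev) + len);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);
	skb_put_data(skb, payload, len);
	skb->dev = dev;
	skb->protocol = htons(ETH_P_802_EX1);

	/* Build the L2 header (broadcast destination for the example). */
	if (dev_hard_header(skb, dev, ETH_P_802_EX1, dev->broadcast,
			    dev->dev_addr, skb->len) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return dev_queue_xmit(skb);
}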
+
+int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
+{
+ return __dev_queue_xmit(skb, sb_dev);
+}
+EXPORT_SYMBOL(dev_queue_xmit_accel);
+
+int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
+{
+ struct net_device *dev = skb->dev;
+ struct sk_buff *orig_skb = skb;
+ struct netdev_queue *txq;
+ int ret = NETDEV_TX_BUSY;
+ bool again = false;
+
+ if (unlikely(!netif_running(dev) ||
+ !netif_carrier_ok(dev)))
+ goto drop;
+
+ skb = validate_xmit_skb_list(skb, dev, &again);
+ if (skb != orig_skb)
+ goto drop;
+
+ skb_set_queue_mapping(skb, queue_id);
+ txq = skb_get_tx_queue(dev, skb);
+
+ local_bh_disable();
+
+ dev_xmit_recursion_inc();
+ HARD_TX_LOCK(dev, txq, smp_processor_id());
+ if (!netif_xmit_frozen_or_drv_stopped(txq))
+ ret = netdev_start_xmit(skb, dev, txq, false);
+ HARD_TX_UNLOCK(dev, txq);
+ dev_xmit_recursion_dec();
+
+ local_bh_enable();
+
+ if (!dev_xmit_complete(ret))
+ kfree_skb(skb);
+
+ return ret;
+drop:
+ atomic_long_inc(&dev->tx_dropped);
+ kfree_skb_list(skb);
+ return NET_XMIT_DROP;
+}
+EXPORT_SYMBOL(dev_direct_xmit);
+
+/*************************************************************************
+ * Receiver routines
+ *************************************************************************/
+
+int netdev_max_backlog __read_mostly = 1000;
+EXPORT_SYMBOL(netdev_max_backlog);
+
+int netdev_tstamp_prequeue __read_mostly = 1;
+int netdev_budget __read_mostly = 300;
+ /* Must be at least 2 jiffies to guarantee a 1 jiffy timeout */
+unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
+int weight_p __read_mostly = 64; /* old backlog weight */
+int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
+int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
+int dev_rx_weight __read_mostly = 64;
+int dev_tx_weight __read_mostly = 64;
+
+/* Called with irq disabled */
+static inline void ____napi_schedule(struct softnet_data *sd,
+ struct napi_struct *napi)
+{
+ list_add_tail(&napi->poll_list, &sd->poll_list);
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+}
+
+#ifdef CONFIG_RPS
+
+/* One global table that all flow-based protocols share. */
+struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
+EXPORT_SYMBOL(rps_sock_flow_table);
+u32 rps_cpu_mask __read_mostly;
+EXPORT_SYMBOL(rps_cpu_mask);
+
+struct static_key rps_needed __read_mostly;
+EXPORT_SYMBOL(rps_needed);
+struct static_key rfs_needed __read_mostly;
+EXPORT_SYMBOL(rfs_needed);
+
+static struct rps_dev_flow *
+set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+ struct rps_dev_flow *rflow, u16 next_cpu)
+{
+ if (next_cpu < nr_cpu_ids) {
+#ifdef CONFIG_RFS_ACCEL
+ struct netdev_rx_queue *rxqueue;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_dev_flow *old_rflow;
+ u32 flow_id;
+ u16 rxq_index;
+ int rc;
+
+ /* Should we steer this flow to a different hardware queue? */
+ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
+ !(dev->features & NETIF_F_NTUPLE))
+ goto out;
+ rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
+ if (rxq_index == skb_get_rx_queue(skb))
+ goto out;
+
+ rxqueue = dev->_rx + rxq_index;
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ if (!flow_table)
+ goto out;
+ flow_id = skb_get_hash(skb) & flow_table->mask;
+ rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
+ rxq_index, flow_id);
+ if (rc < 0)
+ goto out;
+ old_rflow = rflow;
+ rflow = &flow_table->flows[flow_id];
+ rflow->filter = rc;
+ if (old_rflow->filter == rflow->filter)
+ old_rflow->filter = RPS_NO_FILTER;
+ out:
+#endif
+ rflow->last_qtail =
+ per_cpu(softnet_data, next_cpu).input_queue_head;
+ }
+
+ rflow->cpu = next_cpu;
+ return rflow;
+}
+
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving queue for a given skb.
+ * rcu_read_lock must be held on entry.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+ struct rps_dev_flow **rflowp)
+{
+ const struct rps_sock_flow_table *sock_flow_table;
+ struct netdev_rx_queue *rxqueue = dev->_rx;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_map *map;
+ int cpu = -1;
+ u32 tcpu;
+ u32 hash;
+
+ if (skb_rx_queue_recorded(skb)) {
+ u16 index = skb_get_rx_queue(skb);
+
+ if (unlikely(index >= dev->real_num_rx_queues)) {
+ WARN_ONCE(dev->real_num_rx_queues > 1,
+ "%s received packet on queue %u, but number "
+ "of RX queues is %u\n",
+ dev->name, index, dev->real_num_rx_queues);
+ goto done;
+ }
+ rxqueue += index;
+ }
+
+ /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
+
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ map = rcu_dereference(rxqueue->rps_map);
+ if (!flow_table && !map)
+ goto done;
+
+ skb_reset_network_header(skb);
+ hash = skb_get_hash(skb);
+ if (!hash)
+ goto done;
+
+ sock_flow_table = rcu_dereference(rps_sock_flow_table);
+ if (flow_table && sock_flow_table) {
+ struct rps_dev_flow *rflow;
+ u32 next_cpu;
+ u32 ident;
+
+ /* First check into global flow table if there is a match */
+ ident = sock_flow_table->ents[hash & sock_flow_table->mask];
+ if ((ident ^ hash) & ~rps_cpu_mask)
+ goto try_rps;
+
+ next_cpu = ident & rps_cpu_mask;
+
+ /* OK, now we know there is a match,
+ * we can look at the local (per receive queue) flow table
+ */
+ rflow = &flow_table->flows[hash & flow_table->mask];
+ tcpu = rflow->cpu;
+
+ /*
+ * If the desired CPU (where last recvmsg was done) is
+ * different from current CPU (one in the rx-queue flow
+ * table entry), switch if one of the following holds:
+ * - Current CPU is unset (>= nr_cpu_ids).
+ * - Current CPU is offline.
+ * - The current CPU's queue tail has advanced beyond the
+ * last packet that was enqueued using this table entry.
+ * This guarantees that all previous packets for the flow
+ * have been dequeued, thus preserving in order delivery.
+ */
+ if (unlikely(tcpu != next_cpu) &&
+ (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
+ ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
+ rflow->last_qtail)) >= 0)) {
+ tcpu = next_cpu;
+ rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
+ }
+
+ if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
+ *rflowp = rflow;
+ cpu = tcpu;
+ goto done;
+ }
+ }
+
+try_rps:
+
+ if (map) {
+ tcpu = map->cpus[reciprocal_scale(hash, map->len)];
+ if (cpu_online(tcpu)) {
+ cpu = tcpu;
+ goto done;
+ }
+ }
+
+done:
+ return cpu;
+}
+
+#ifdef CONFIG_RFS_ACCEL
+
+/**
+ * rps_may_expire_flow - check whether an RFS hardware filter may be removed
+ * @dev: Device on which the filter was set
+ * @rxq_index: RX queue index
+ * @flow_id: Flow ID passed to ndo_rx_flow_steer()
+ * @filter_id: Filter ID returned by ndo_rx_flow_steer()
+ *
+ * Drivers that implement ndo_rx_flow_steer() should periodically call
+ * this function for each installed filter and remove the filters for
+ * which it returns %true.
+ */
+bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
+ u32 flow_id, u16 filter_id)
+{
+ struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_dev_flow *rflow;
+ bool expire = true;
+ unsigned int cpu;
+
+ rcu_read_lock();
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ if (flow_table && flow_id <= flow_table->mask) {
+ rflow = &flow_table->flows[flow_id];
+ cpu = READ_ONCE(rflow->cpu);
+ if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
+ ((int)(per_cpu(softnet_data, cpu).input_queue_head -
+ rflow->last_qtail) <
+ (int)(10 * flow_table->mask)))
+ expire = false;
+ }
+ rcu_read_unlock();
+ return expire;
+}
+EXPORT_SYMBOL(rps_may_expire_flow);
+
+#endif /* CONFIG_RFS_ACCEL */
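/*
 * Editorial sketch, not part of this file: the periodic expiry scan a
 * hypothetical aRFS-capable driver might run over its installed
 * hardware filters (only meaningful under CONFIG_RFS_ACCEL).  The
 * demo_* structures and demo_remove_hw_filter() are assumed
 * driver-private pieces; rps_may_expire_flow() is the real API.
 */
struct demo_filter {
	bool		installed;
	u16		rxq_index;
	u32		flow_id;
	u16		filter_id;
};

struct demo_adapter {
	struct net_device	*netdev;
	unsigned int		nr_filters;
	struct demo_filter	*filters;
};

static void demo_expire_rfs_filters(struct demo_adapter *ad)
{
	unsigned int i;

	for (i = 0; i < ad->nr_filters; i++) {
		struct demo_filter *f = &ad->filters[i];

		if (!f->installed)
			continue;
		if (rps_may_expire_flow(ad->netdev, f->rxq_index,
					f->flow_id, f->filter_id)) {
			demo_remove_hw_filter(ad, f);	/* assumed helper */
			f->installed = false;
		}
	}
}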
+
+/* Called from hardirq (IPI) context */
+static void rps_trigger_softirq(void *data)
+{
+ struct softnet_data *sd = data;
+
+ ____napi_schedule(sd, &sd->backlog);
+ sd->received_rps++;
+}
+
+#endif /* CONFIG_RPS */
+
+/*
+ * Check if this softnet_data structure belongs to another CPU.
+ * If so, queue it on our IPI list and return 1;
+ * otherwise return 0.
+ */
+static int rps_ipi_queued(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
+
+ if (sd != mysd) {
+ sd->rps_ipi_next = mysd->rps_ipi_list;
+ mysd->rps_ipi_list = sd;
+
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ return 1;
+ }
+#endif /* CONFIG_RPS */
+ return 0;
+}
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+int netdev_flow_limit_table_len __read_mostly = (1 << 12);
+#endif
+
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
+{
+#ifdef CONFIG_NET_FLOW_LIMIT
+ struct sd_flow_limit *fl;
+ struct softnet_data *sd;
+ unsigned int old_flow, new_flow;
+
+ if (qlen < (netdev_max_backlog >> 1))
+ return false;
+
+ sd = this_cpu_ptr(&softnet_data);
+
+ rcu_read_lock();
+ fl = rcu_dereference(sd->flow_limit);
+ if (fl) {
+ new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
+ old_flow = fl->history[fl->history_head];
+ fl->history[fl->history_head] = new_flow;
+
+ fl->history_head++;
+ fl->history_head &= FLOW_LIMIT_HISTORY - 1;
+
+ if (likely(fl->buckets[old_flow]))
+ fl->buckets[old_flow]--;
+
+ if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
+ fl->count++;
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+#endif
+ return false;
+}
+
+/*
+ * enqueue_to_backlog is called to queue an skb to a per CPU backlog
+ * queue (may be a remote CPU queue).
+ */
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
+ unsigned int *qtail)
+{
+ struct softnet_data *sd;
+ unsigned long flags;
+ unsigned int qlen;
+
+ sd = &per_cpu(softnet_data, cpu);
+
+ local_irq_save(flags);
+
+ rps_lock(sd);
+ if (!netif_running(skb->dev))
+ goto drop;
+ qlen = skb_queue_len(&sd->input_pkt_queue);
+ if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
+ if (qlen) {
+enqueue:
+ __skb_queue_tail(&sd->input_pkt_queue, skb);
+ input_queue_tail_incr_save(sd, qtail);
+ rps_unlock(sd);
+ local_irq_restore(flags);
+ return NET_RX_SUCCESS;
+ }
+
+ /* Schedule NAPI for the backlog device.
+ * We can use a non-atomic operation since we own the queue lock.
+ */
+ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
+ if (!rps_ipi_queued(sd))
+ ____napi_schedule(sd, &sd->backlog);
+ }
+ goto enqueue;
+ }
+
+drop:
+ sd->dropped++;
+ rps_unlock(sd);
+
+ local_irq_restore(flags);
+
+ atomic_long_inc(&skb->dev->rx_dropped);
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct netdev_rx_queue *rxqueue;
+
+ rxqueue = dev->_rx;
+
+ if (skb_rx_queue_recorded(skb)) {
+ u16 index = skb_get_rx_queue(skb);
+
+ if (unlikely(index >= dev->real_num_rx_queues)) {
+ WARN_ONCE(dev->real_num_rx_queues > 1,
+ "%s received packet on queue %u, but number "
+ "of RX queues is %u\n",
+ dev->name, index, dev->real_num_rx_queues);
+
+ return rxqueue; /* Return first rxqueue */
+ }
+ rxqueue += index;
+ }
+ return rxqueue;
+}
+
+static u32 netif_receive_generic_xdp(struct sk_buff *skb,
+ struct xdp_buff *xdp,
+ struct bpf_prog *xdp_prog)
+{
+ struct netdev_rx_queue *rxqueue;
+ void *orig_data, *orig_data_end;
+ u32 metalen, act = XDP_DROP;
+ __be16 orig_eth_type;
+ struct ethhdr *eth;
+ bool orig_bcast;
+ int hlen, off;
+ u32 mac_len;
+
+ /* Reinjected packets coming from act_mirred or similar should
+ * not get XDP generic processing.
+ */
+ if (skb_is_tc_redirected(skb))
+ return XDP_PASS;
+
+ /* XDP packets must be linear and must have sufficient headroom
+ * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
+ * native XDP provides, thus we need to do it here as well.
+ */
+ if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
+ skb_headroom(skb) < XDP_PACKET_HEADROOM) {
+ int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
+ int troom = skb->tail + skb->data_len - skb->end;
+
+ /* In case we have to go down this path and also linearize,
+ * let's do the pskb_expand_head() work just once here.
+ */
+ if (pskb_expand_head(skb,
+ hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
+ troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
+ goto do_drop;
+ if (skb_linearize(skb))
+ goto do_drop;
+ }
+
+ /* The XDP program wants to see the packet starting at the MAC
+ * header.
+ */
+ mac_len = skb->data - skb_mac_header(skb);
+ hlen = skb_headlen(skb) + mac_len;
+ xdp->data = skb->data - mac_len;
+ xdp->data_meta = xdp->data;
+ xdp->data_end = xdp->data + hlen;
+ xdp->data_hard_start = skb->data - skb_headroom(skb);
+ orig_data_end = xdp->data_end;
+ orig_data = xdp->data;
+ eth = (struct ethhdr *)xdp->data;
+ orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
+ orig_eth_type = eth->h_proto;
+
+ rxqueue = netif_get_rxqueue(skb);
+ xdp->rxq = &rxqueue->xdp_rxq;
+
+ act = bpf_prog_run_xdp(xdp_prog, xdp);
+
+ /* check if bpf_xdp_adjust_head was used */
+ off = xdp->data - orig_data;
+ if (off) {
+ if (off > 0)
+ __skb_pull(skb, off);
+ else if (off < 0)
+ __skb_push(skb, -off);
+
+ skb->mac_header += off;
+ skb_reset_network_header(skb);
+ }
+
+ /* Check if bpf_xdp_adjust_tail was used; it can only "shrink"
+ * the packet.
+ */
+ off = orig_data_end - xdp->data_end;
+ if (off != 0) {
+ skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
+ skb->len -= off;
+
+ }
+
+ /* Check if XDP changed the Ethernet header such that the SKB needs an update */
+ eth = (struct ethhdr *)xdp->data;
+ if ((orig_eth_type != eth->h_proto) ||
+ (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
+ __skb_push(skb, ETH_HLEN);
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ }
+
+ switch (act) {
+ case XDP_REDIRECT:
+ case XDP_TX:
+ __skb_push(skb, mac_len);
+ break;
+ case XDP_PASS:
+ metalen = xdp->data - xdp->data_meta;
+ if (metalen)
+ skb_metadata_set(skb, metalen);
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ /* fall through */
+ case XDP_ABORTED:
+ trace_xdp_exception(skb->dev, xdp_prog, act);
+ /* fall through */
+ case XDP_DROP:
+ do_drop:
+ kfree_skb(skb);
+ break;
+ }
+
+ return act;
+}
+
+/* When doing generic XDP we have to bypass the qdisc layer and the
+ * network taps in order to match in-driver-XDP behavior.
+ */
+void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
+{
+ struct net_device *dev = skb->dev;
+ struct netdev_queue *txq;
+ bool free_skb = true;
+ int cpu, rc;
+
+ txq = netdev_pick_tx(dev, skb, NULL);
+ cpu = smp_processor_id();
+ HARD_TX_LOCK(dev, txq, cpu);
+ if (!netif_xmit_stopped(txq)) {
+ rc = netdev_start_xmit(skb, dev, txq, 0);
+ if (dev_xmit_complete(rc))
+ free_skb = false;
+ }
+ HARD_TX_UNLOCK(dev, txq);
+ if (free_skb) {
+ trace_xdp_exception(dev, xdp_prog, XDP_TX);
+ kfree_skb(skb);
+ }
+}
+EXPORT_SYMBOL_GPL(generic_xdp_tx);
+
+static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
+
+int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
+{
+ if (xdp_prog) {
+ struct xdp_buff xdp;
+ u32 act;
+ int err;
+
+ act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
+ if (act != XDP_PASS) {
+ switch (act) {
+ case XDP_REDIRECT:
+ err = xdp_do_generic_redirect(skb->dev, skb,
+ &xdp, xdp_prog);
+ if (err)
+ goto out_redir;
+ break;
+ case XDP_TX:
+ generic_xdp_tx(skb, xdp_prog);
+ break;
+ }
+ return XDP_DROP;
+ }
+ }
+ return XDP_PASS;
+out_redir:
+ kfree_skb(skb);
+ return XDP_DROP;
+}
+EXPORT_SYMBOL_GPL(do_xdp_generic);
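/*
 * Editorial sketch, not part of this file: roughly the smallest XDP
 * program the generic hook above can execute.  It is ordinary BPF C,
 * built with clang -O2 -target bpf; SEC() is open-coded so the snippet
 * does not depend on libbpf headers.  It would typically be attached in
 * generic (skb) mode, e.g. via iproute2's "xdpgeneric" option.
 */
#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

SEC("xdp")
int xdp_demo_pass(struct xdp_md *ctx)
{
	/* Let every packet continue into the normal stack. */
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";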
+
+static int netif_rx_internal(struct sk_buff *skb)
+{
+ int ret;
+
+ net_timestamp_check(netdev_tstamp_prequeue, skb);
+
+ trace_netif_rx(skb);
+
+#ifdef CONFIG_RPS
+ if (static_key_false(&rps_needed)) {
+ struct rps_dev_flow voidflow, *rflow = &voidflow;
+ int cpu;
+
+ preempt_disable();
+ rcu_read_lock();
+
+ cpu = get_rps_cpu(skb->dev, skb, &rflow);
+ if (cpu < 0)
+ cpu = smp_processor_id();
+
+ ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+
+ rcu_read_unlock();
+ preempt_enable();
+ } else
+#endif
+ {
+ unsigned int qtail;
+
+ ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
+ put_cpu();
+ }
+ return ret;
+}
+
+/**
+ * netif_rx - post buffer to the network code
+ * @skb: buffer to post
+ *
+ * This function receives a packet from a device driver and queues it for
+ * the upper (protocol) levels to process. It always succeeds. The buffer
+ * may be dropped during processing for congestion control or by the
+ * protocol layers.
+ *
+ * return values:
+ * NET_RX_SUCCESS (no congestion)
+ * NET_RX_DROP (packet was dropped)
+ *
+ */
+
+int netif_rx(struct sk_buff *skb)
+{
+ trace_netif_rx_entry(skb);
+
+ return netif_rx_internal(skb);
+}
+EXPORT_SYMBOL(netif_rx);
+
+int netif_rx_ni(struct sk_buff *skb)
+{
+ int err;
+
+ trace_netif_rx_ni_entry(skb);
+
+ preempt_disable();
+ err = netif_rx_internal(skb);
+ if (local_softirq_pending())
+ do_softirq();
+ preempt_enable();
+
+ return err;
+}
+EXPORT_SYMBOL(netif_rx_ni);
+
+static __latent_entropy void net_tx_action(struct softirq_action *h)
+{
+ struct softnet_data *sd = this_cpu_ptr(&softnet_data);
+
+ if (sd->completion_queue) {
+ struct sk_buff *clist;
+
+ local_irq_disable();
+ clist = sd->completion_queue;
+ sd->completion_queue = NULL;
+ local_irq_enable();
+
+ while (clist) {
+ struct sk_buff *skb = clist;
+
+ clist = clist->next;
+
+ WARN_ON(refcount_read(&skb->users));
+ if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
+ trace_consume_skb(skb);
+ else
+ trace_kfree_skb(skb, net_tx_action);
+
+ if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
+ __kfree_skb(skb);
+ else
+ __kfree_skb_defer(skb);
+ }
+
+ __kfree_skb_flush();
+ }
+
+ if (sd->output_queue) {
+ struct Qdisc *head;
+
+ local_irq_disable();
+ head = sd->output_queue;
+ sd->output_queue = NULL;
+ sd->output_queue_tailp = &sd->output_queue;
+ local_irq_enable();
+
+ while (head) {
+ struct Qdisc *q = head;
+ spinlock_t *root_lock = NULL;
+
+ head = head->next_sched;
+
+ if (!(q->flags & TCQ_F_NOLOCK)) {
+ root_lock = qdisc_lock(q);
+ spin_lock(root_lock);
+ }
+ /* We need to make sure head->next_sched is read
+ * before clearing __QDISC_STATE_SCHED
+ */
+ smp_mb__before_atomic();
+ clear_bit(__QDISC_STATE_SCHED, &q->state);
+ qdisc_run(q);
+ if (root_lock)
+ spin_unlock(root_lock);
+ }
+ }
+
+ xfrm_dev_backlog(sd);
+}
+
+#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
+/* This hook is defined here for ATM LANE */
+int (*br_fdb_test_addr_hook)(struct net_device *dev,
+ unsigned char *addr) __read_mostly;
+EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
+#endif
+
+static inline struct sk_buff *
+sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
+ struct net_device *orig_dev)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
+ struct tcf_result cl_res;
+
+ /* If there's at least one ingress present somewhere (so
+ * we get here via enabled static key), remaining devices
+ * that are not configured with an ingress qdisc will bail
+ * out here.
+ */
+ if (!miniq)
+ return skb;
+
+ if (*pt_prev) {
+ *ret = deliver_skb(skb, *pt_prev, orig_dev);
+ *pt_prev = NULL;
+ }
+
+ qdisc_skb_cb(skb)->pkt_len = skb->len;
+ skb->tc_at_ingress = 1;
+ mini_qdisc_bstats_cpu_update(miniq, skb);
+
+ switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
+ case TC_ACT_OK:
+ case TC_ACT_RECLASSIFY:
+ skb->tc_index = TC_H_MIN(cl_res.classid);
+ break;
+ case TC_ACT_SHOT:
+ mini_qdisc_qstats_cpu_drop(miniq);
+ kfree_skb(skb);
+ return NULL;
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ consume_skb(skb);
+ return NULL;
+ case TC_ACT_REDIRECT:
+ /* skb_mac_header check was done by cls/act_bpf, so
+ * we can safely push the L2 header back before
+ * redirecting to another netdev
+ */
+ __skb_push(skb, skb->mac_len);
+ skb_do_redirect(skb);
+ return NULL;
+ case TC_ACT_REINSERT:
+ /* this does not scrub the packet, and updates stats on error */
+ skb_tc_reinsert(skb, &cl_res);
+ return NULL;
+ default:
+ break;
+ }
+#endif /* CONFIG_NET_CLS_ACT */
+ return skb;
+}
+
+/**
+ * netdev_is_rx_handler_busy - check if receive handler is registered
+ * @dev: device to check
+ *
+ * Check if a receive handler is already registered for a given device.
+ * Return true if there is one.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+bool netdev_is_rx_handler_busy(struct net_device *dev)
+{
+ ASSERT_RTNL();
+ return dev && rtnl_dereference(dev->rx_handler);
+}
+EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
+
+/**
+ * netdev_rx_handler_register - register receive handler
+ * @dev: device to register a handler for
+ * @rx_handler: receive handler to register
+ * @rx_handler_data: data pointer that is used by rx handler
+ *
+ * Register a receive handler for a device. This handler will then be
+ * called from __netif_receive_skb. A negative errno code is returned
+ * on a failure.
+ *
+ * The caller must hold the rtnl_mutex.
+ *
+ * For a general description of rx_handler, see enum rx_handler_result.
+ */
+int netdev_rx_handler_register(struct net_device *dev,
+ rx_handler_func_t *rx_handler,
+ void *rx_handler_data)
+{
+ if (netdev_is_rx_handler_busy(dev))
+ return -EBUSY;
+
+ if (dev->priv_flags & IFF_NO_RX_HANDLER)
+ return -EINVAL;
+
+ /* Note: rx_handler_data must be set before rx_handler */
+ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
+ rcu_assign_pointer(dev->rx_handler, rx_handler);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
+
+/**
+ * netdev_rx_handler_unregister - unregister receive handler
+ * @dev: device to unregister a handler from
+ *
+ * Unregister a receive handler from a device.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+void netdev_rx_handler_unregister(struct net_device *dev)
+{
+
+ ASSERT_RTNL();
+ RCU_INIT_POINTER(dev->rx_handler, NULL);
+ /* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
+ * section is guaranteed to see a non-NULL rx_handler_data
+ * as well.
+ */
+ synchronize_net();
+ RCU_INIT_POINTER(dev->rx_handler_data, NULL);
+}
+EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
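/*
 * Editorial sketch, not part of this file: how an upper device in the
 * style of bridge/macvlan/bonding might use the two functions above.
 * The handler body and demo_attach()/demo_detach() are hypothetical;
 * the registration API and RX_HANDLER_* return codes are real.
 */
static rx_handler_result_t demo_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;

	/* A real handler might retarget skb->dev to its upper device or
	 * consume the frame; this sketch only peeks and lets normal
	 * processing continue in __netif_receive_skb_core().
	 */
	pr_debug("demo: %u byte frame on %s\n", skb->len, skb->dev->name);
	return RX_HANDLER_PASS;
}

static int demo_attach(struct net_device *lower)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(lower, demo_handle_frame, NULL);
	rtnl_unlock();
	return err;
}

static void demo_detach(struct net_device *lower)
{
	rtnl_lock();
	netdev_rx_handler_unregister(lower);
	rtnl_unlock();
}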
+
+/*
+ * Limit the use of PFMEMALLOC reserves to those protocols that implement
+ * the special handling of PFMEMALLOC skbs.
+ */
+static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
+{
+ switch (skb->protocol) {
+ case htons(ETH_P_ARP):
+ case htons(ETH_P_IP):
+ case htons(ETH_P_IPV6):
+ case htons(ETH_P_8021Q):
+ case htons(ETH_P_8021AD):
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
+ int *ret, struct net_device *orig_dev)
+{
+#ifdef CONFIG_NETFILTER_INGRESS
+ if (nf_hook_ingress_active(skb)) {
+ int ingress_retval;
+
+ if (*pt_prev) {
+ *ret = deliver_skb(skb, *pt_prev, orig_dev);
+ *pt_prev = NULL;
+ }
+
+ rcu_read_lock();
+ ingress_retval = nf_hook_ingress(skb);
+ rcu_read_unlock();
+ return ingress_retval;
+ }
+#endif /* CONFIG_NETFILTER_INGRESS */
+ return 0;
+}
+
+static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
+ struct packet_type **ppt_prev)
+{
+ struct packet_type *ptype, *pt_prev;
+ rx_handler_func_t *rx_handler;
+ struct sk_buff *skb = *pskb;
+ struct net_device *orig_dev;
+ bool deliver_exact = false;
+ int ret = NET_RX_DROP;
+ __be16 type;
+
+ net_timestamp_check(!netdev_tstamp_prequeue, skb);
+
+ trace_netif_receive_skb(skb);
+
+ orig_dev = skb->dev;
+
+ skb_reset_network_header(skb);
+ if (!skb_transport_header_was_set(skb))
+ skb_reset_transport_header(skb);
+ skb_reset_mac_len(skb);
+
+ pt_prev = NULL;
+
+another_round:
+ skb->skb_iif = skb->dev->ifindex;
+
+ __this_cpu_inc(softnet_data.processed);
+
+ if (static_branch_unlikely(&generic_xdp_needed_key)) {
+ int ret2;
+
+ preempt_disable();
+ ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
+ preempt_enable();
+
+ if (ret2 != XDP_PASS) {
+ ret = NET_RX_DROP;
+ goto out;
+ }
+ skb_reset_mac_len(skb);
+ }
+
+ if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
+ skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
+ skb = skb_vlan_untag(skb);
+ if (unlikely(!skb))
+ goto out;
+ }
+
+ if (skb_skip_tc_classify(skb))
+ goto skip_classify;
+
+ if (pfmemalloc)
+ goto skip_taps;
+
+ list_for_each_entry_rcu(ptype, &ptype_all, list) {
+ if (pt_prev)
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = ptype;
+ }
+
+ list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
+ if (pt_prev)
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = ptype;
+ }
+
+skip_taps:
+#ifdef CONFIG_NET_INGRESS
+ if (static_branch_unlikely(&ingress_needed_key)) {
+ skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
+ if (!skb)
+ goto out;
+
+ if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
+ goto out;
+ }
+#endif
+ skb_reset_tc(skb);
+skip_classify:
+ if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
+ goto drop;
+
+ if (skb_vlan_tag_present(skb)) {
+ if (pt_prev) {
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = NULL;
+ }
+ if (vlan_do_receive(&skb))
+ goto another_round;
+ else if (unlikely(!skb))
+ goto out;
+ }
+
+ rx_handler = rcu_dereference(skb->dev->rx_handler);
+ if (rx_handler) {
+ if (pt_prev) {
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = NULL;
+ }
+ switch (rx_handler(&skb)) {
+ case RX_HANDLER_CONSUMED:
+ ret = NET_RX_SUCCESS;
+ goto out;
+ case RX_HANDLER_ANOTHER:
+ goto another_round;
+ case RX_HANDLER_EXACT:
+ deliver_exact = true;
+ case RX_HANDLER_PASS:
+ break;
+ default:
+ BUG();
+ }
+ }
+
+ if (unlikely(skb_vlan_tag_present(skb))) {
+ if (skb_vlan_tag_get_id(skb))
+ skb->pkt_type = PACKET_OTHERHOST;
+ /* Note: we might in the future use prio bits
+ * and set skb->priority like in vlan_do_receive().
+ * For the time being, just ignore the Priority Code Point.
+ */
+ skb->vlan_tci = 0;
+ }
+
+ type = skb->protocol;
+
+ /* deliver only exact match when indicated */
+ if (likely(!deliver_exact)) {
+ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ &ptype_base[ntohs(type) &
+ PTYPE_HASH_MASK]);
+ }
+
+ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ &orig_dev->ptype_specific);
+
+ if (unlikely(skb->dev != orig_dev)) {
+ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ &skb->dev->ptype_specific);
+ }
+
+ if (pt_prev) {
+ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
+ goto drop;
+ *ppt_prev = pt_prev;
+ } else {
+drop:
+ if (!deliver_exact)
+ atomic_long_inc(&skb->dev->rx_dropped);
+ else
+ atomic_long_inc(&skb->dev->rx_nohandler);
+ kfree_skb(skb);
+ /* Jamal, now you will not be able to escape explaining
+ * to me how you were going to use this. :-)
+ */
+ ret = NET_RX_DROP;
+ }
+
+out:
+ /* The invariant here is that if *ppt_prev is not NULL
+ * then skb should also be non-NULL.
+ *
+ * The *ppt_prev assignment above apparently upholds this invariant,
+ * since skb is dereferenced right next to it.
+ */
+ *pskb = skb;
+ return ret;
+}
+
+static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
+{
+ struct net_device *orig_dev = skb->dev;
+ struct packet_type *pt_prev = NULL;
+ int ret;
+
+ ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
+ if (pt_prev)
+ ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+ return ret;
+}
+
+/**
+ * netif_receive_skb_core - special purpose version of netif_receive_skb
+ * @skb: buffer to process
+ *
+ * More direct receive version of netif_receive_skb(). It should
+ * only be used by callers that have a need to skip RPS and Generic XDP.
+ * Caller must also take care of handling if (page_is_)pfmemalloc.
+ *
+ * This function may only be called from softirq context and interrupts
+ * should be enabled.
+ *
+ * Return values (usually ignored):
+ * NET_RX_SUCCESS: no congestion
+ * NET_RX_DROP: packet was dropped
+ */
+int netif_receive_skb_core(struct sk_buff *skb)
+{
+ int ret;
+
+ rcu_read_lock();
+ ret = __netif_receive_skb_one_core(skb, false);
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(netif_receive_skb_core);
+
+static inline void __netif_receive_skb_list_ptype(struct list_head *head,
+ struct packet_type *pt_prev,
+ struct net_device *orig_dev)
+{
+ struct sk_buff *skb, *next;
+
+ if (!pt_prev)
+ return;
+ if (list_empty(head))
+ return;
+ if (pt_prev->list_func != NULL)
+ pt_prev->list_func(head, pt_prev, orig_dev);
+ else
+ list_for_each_entry_safe(skb, next, head, list) {
+ skb_list_del_init(skb);
+ pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+ }
+}
+
+static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
+{
+ /* Fast-path assumptions:
+ * - There is no RX handler.
+ * - Only one packet_type matches.
+ * If either of these fails, we will end up doing some per-packet
+ * processing in-line, then handling the 'last ptype' for the whole
+ * sublist. This can't cause out-of-order delivery to any single ptype,
+ * because the 'last ptype' must be constant across the sublist, and all
+ * other ptypes are handled per-packet.
+ */
+ /* Current (common) ptype of sublist */
+ struct packet_type *pt_curr = NULL;
+ /* Current (common) orig_dev of sublist */
+ struct net_device *od_curr = NULL;
+ struct list_head sublist;
+ struct sk_buff *skb, *next;
+
+ INIT_LIST_HEAD(&sublist);
+ list_for_each_entry_safe(skb, next, head, list) {
+ struct net_device *orig_dev = skb->dev;
+ struct packet_type *pt_prev = NULL;
+
+ skb_list_del_init(skb);
+ __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
+ if (!pt_prev)
+ continue;
+ if (pt_curr != pt_prev || od_curr != orig_dev) {
+ /* dispatch old sublist */
+ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
+ /* start new sublist */
+ INIT_LIST_HEAD(&sublist);
+ pt_curr = pt_prev;
+ od_curr = orig_dev;
+ }
+ list_add_tail(&skb->list, &sublist);
+ }
+
+ /* dispatch final sublist */
+ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
+}
+
+static int __netif_receive_skb(struct sk_buff *skb)
+{
+ int ret;
+
+ if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
+ unsigned int noreclaim_flag;
+
+ /*
+ * PFMEMALLOC skbs are special, they should
+ * - be delivered to SOCK_MEMALLOC sockets only
+ * - stay away from userspace
+ * - have bounded memory usage
+ *
+ * Use PF_MEMALLOC as this saves us from propagating the allocation
+ * context down to all allocation sites.
+ */
+ noreclaim_flag = memalloc_noreclaim_save();
+ ret = __netif_receive_skb_one_core(skb, true);
+ memalloc_noreclaim_restore(noreclaim_flag);
+ } else
+ ret = __netif_receive_skb_one_core(skb, false);
+
+ return ret;
+}
+
+static void __netif_receive_skb_list(struct list_head *head)
+{
+ unsigned long noreclaim_flag = 0;
+ struct sk_buff *skb, *next;
+ bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
+
+ list_for_each_entry_safe(skb, next, head, list) {
+ if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
+ struct list_head sublist;
+
+ /* Handle the previous sublist */
+ list_cut_before(&sublist, head, &skb->list);
+ if (!list_empty(&sublist))
+ __netif_receive_skb_list_core(&sublist, pfmemalloc);
+ pfmemalloc = !pfmemalloc;
+ /* See comments in __netif_receive_skb */
+ if (pfmemalloc)
+ noreclaim_flag = memalloc_noreclaim_save();
+ else
+ memalloc_noreclaim_restore(noreclaim_flag);
+ }
+ }
+ /* Handle the remaining sublist */
+ if (!list_empty(head))
+ __netif_receive_skb_list_core(head, pfmemalloc);
+ /* Restore pflags */
+ if (pfmemalloc)
+ memalloc_noreclaim_restore(noreclaim_flag);
+}
+
+static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
+{
+ struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
+ struct bpf_prog *new = xdp->prog;
+ int ret = 0;
+
+ switch (xdp->command) {
+ case XDP_SETUP_PROG:
+ rcu_assign_pointer(dev->xdp_prog, new);
+ if (old)
+ bpf_prog_put(old);
+
+ if (old && !new) {
+ static_branch_dec(&generic_xdp_needed_key);
+ } else if (new && !old) {
+ static_branch_inc(&generic_xdp_needed_key);
+ dev_disable_lro(dev);
+ dev_disable_gro_hw(dev);
+ }
+ break;
+
+ case XDP_QUERY_PROG:
+ xdp->prog_id = old ? old->aux->id : 0;
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static int netif_receive_skb_internal(struct sk_buff *skb)
+{
+ int ret;
+
+ net_timestamp_check(netdev_tstamp_prequeue, skb);
+
+ if (skb_defer_rx_timestamp(skb))
+ return NET_RX_SUCCESS;
+
+ rcu_read_lock();
+#ifdef CONFIG_RPS
+ if (static_key_false(&rps_needed)) {
+ struct rps_dev_flow voidflow, *rflow = &voidflow;
+ int cpu = get_rps_cpu(skb->dev, skb, &rflow);
+
+ if (cpu >= 0) {
+ ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+ rcu_read_unlock();
+ return ret;
+ }
+ }
+#endif
+ ret = __netif_receive_skb(skb);
+ rcu_read_unlock();
+ return ret;
+}
+
+static void netif_receive_skb_list_internal(struct list_head *head)
+{
+ struct sk_buff *skb, *next;
+ struct list_head sublist;
+
+ INIT_LIST_HEAD(&sublist);
+ list_for_each_entry_safe(skb, next, head, list) {
+ net_timestamp_check(netdev_tstamp_prequeue, skb);
+ skb_list_del_init(skb);
+ if (!skb_defer_rx_timestamp(skb))
+ list_add_tail(&skb->list, &sublist);
+ }
+ list_splice_init(&sublist, head);
+
+ rcu_read_lock();
+#ifdef CONFIG_RPS
+ if (static_key_false(&rps_needed)) {
+ list_for_each_entry_safe(skb, next, head, list) {
+ struct rps_dev_flow voidflow, *rflow = &voidflow;
+ int cpu = get_rps_cpu(skb->dev, skb, &rflow);
+
+ if (cpu >= 0) {
+ /* Will be handled, remove from list */
+ skb_list_del_init(skb);
+ enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+ }
+ }
+ }
+#endif
+ __netif_receive_skb_list(head);
+ rcu_read_unlock();
+}
+
+/**
+ * netif_receive_skb - process receive buffer from network
+ * @skb: buffer to process
+ *
+ * netif_receive_skb() is the main receive data processing function.
+ * It always succeeds. The buffer may be dropped during processing
+ * for congestion control or by the protocol layers.
+ *
+ * This function may only be called from softirq context and interrupts
+ * should be enabled.
+ *
+ * Return values (usually ignored):
+ * NET_RX_SUCCESS: no congestion
+ * NET_RX_DROP: packet was dropped
+ */
+int netif_receive_skb(struct sk_buff *skb)
+{
+ trace_netif_receive_skb_entry(skb);
+
+ return netif_receive_skb_internal(skb);
+}
+EXPORT_SYMBOL(netif_receive_skb);
+
+/**
+ * netif_receive_skb_list - process many receive buffers from network
+ * @head: list of skbs to process.
+ *
+ * Since the return value of netif_receive_skb() is normally ignored, and
+ * wouldn't be meaningful for a list, this function returns void.
+ *
+ * This function may only be called from softirq context and interrupts
+ * should be enabled.
+ */
+void netif_receive_skb_list(struct list_head *head)
+{
+ struct sk_buff *skb;
+
+ if (list_empty(head))
+ return;
+ list_for_each_entry(skb, head, list)
+ trace_netif_receive_skb_list_entry(skb);
+ netif_receive_skb_list_internal(head);
+}
+EXPORT_SYMBOL(netif_receive_skb_list);
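/*
 * Editorial sketch, not part of this file: how a driver's receive
 * completion path might batch skbs and hand them over in one go, so the
 * sublist batching in __netif_receive_skb_list_core() above can take
 * effect.  struct demo_ring and demo_ring_next_skb() are assumed driver
 * internals; the list handling and netif_receive_skb_list() are real.
 */
static void demo_rx_deliver(struct demo_ring *ring)
{
	LIST_HEAD(rx_list);
	struct sk_buff *skb;

	while ((skb = demo_ring_next_skb(ring)) != NULL) {
		skb->protocol = eth_type_trans(skb, ring->netdev);
		list_add_tail(&skb->list, &rx_list);
	}

	if (!list_empty(&rx_list))
		netif_receive_skb_list(&rx_list);
}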
+
+DEFINE_PER_CPU(struct work_struct, flush_works);
+
+/* Network device is going away, flush any packets still pending */
+static void flush_backlog(struct work_struct *work)
+{
+ struct sk_buff *skb, *tmp;
+ struct softnet_data *sd;
+
+ local_bh_disable();
+ sd = this_cpu_ptr(&softnet_data);
+
+ local_irq_disable();
+ rps_lock(sd);
+ skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
+ if (skb->dev->reg_state == NETREG_UNREGISTERING) {
+ __skb_unlink(skb, &sd->input_pkt_queue);
+ dev_kfree_skb_irq(skb);
+ input_queue_head_incr(sd);
+ }
+ }
+ rps_unlock(sd);
+ local_irq_enable();
+
+ skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
+ if (skb->dev->reg_state == NETREG_UNREGISTERING) {
+ __skb_unlink(skb, &sd->process_queue);
+ kfree_skb(skb);
+ input_queue_head_incr(sd);
+ }
+ }
+ local_bh_enable();
+}
+
+static void flush_all_backlogs(void)
+{
+ unsigned int cpu;
+
+ get_online_cpus();
+
+ for_each_online_cpu(cpu)
+ queue_work_on(cpu, system_highpri_wq,
+ per_cpu_ptr(&flush_works, cpu));
+
+ for_each_online_cpu(cpu)
+ flush_work(per_cpu_ptr(&flush_works, cpu));
+
+ put_online_cpus();
+}
+
+static int napi_gro_complete(struct sk_buff *skb)
+{
+ struct packet_offload *ptype;
+ __be16 type = skb->protocol;
+ struct list_head *head = &offload_base;
+ int err = -ENOENT;
+
+ BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
+
+ if (NAPI_GRO_CB(skb)->count == 1) {
+ skb_shinfo(skb)->gso_size = 0;
+ goto out;
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_complete)
+ continue;
+
+ err = ptype->callbacks.gro_complete(skb, 0);
+ break;
+ }
+ rcu_read_unlock();
+
+ if (err) {
+ WARN_ON(&ptype->list == head);
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+ }
+
+out:
+ return netif_receive_skb_internal(skb);
+}
+
+static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
+ bool flush_old)
+{
+ struct list_head *head = &napi->gro_hash[index].list;
+ struct sk_buff *skb, *p;
+
+ list_for_each_entry_safe_reverse(skb, p, head, list) {
+ if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
+ return;
+ list_del(&skb->list);
+ skb->next = NULL;
+ napi_gro_complete(skb);
+ napi->gro_hash[index].count--;
+ }
+
+ if (!napi->gro_hash[index].count)
+ __clear_bit(index, &napi->gro_bitmask);
+}
+
+/* napi->gro_hash[].list contains packets ordered by age, with the
+ * youngest packets at the head.
+ * Complete skbs in reverse order to reduce latencies.
+ */
+void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+{
+ u32 i;
+
+ for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+ if (test_bit(i, &napi->gro_bitmask))
+ __napi_gro_flush_chain(napi, i, flush_old);
+ }
+}
+EXPORT_SYMBOL(napi_gro_flush);
+
+static struct list_head *gro_list_prepare(struct napi_struct *napi,
+ struct sk_buff *skb)
+{
+ unsigned int maclen = skb->dev->hard_header_len;
+ u32 hash = skb_get_hash_raw(skb);
+ struct list_head *head;
+ struct sk_buff *p;
+
+ head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
+ list_for_each_entry(p, head, list) {
+ unsigned long diffs;
+
+ NAPI_GRO_CB(p)->flush = 0;
+
+ if (hash != skb_get_hash_raw(p)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
+ diffs |= p->vlan_tci ^ skb->vlan_tci;
+ diffs |= skb_metadata_dst_cmp(p, skb);
+ diffs |= skb_metadata_differs(p, skb);
+ if (maclen == ETH_HLEN)
+ diffs |= compare_ether_header(skb_mac_header(p),
+ skb_mac_header(skb));
+ else if (!diffs)
+ diffs = memcmp(skb_mac_header(p),
+ skb_mac_header(skb),
+ maclen);
+ NAPI_GRO_CB(p)->same_flow = !diffs;
+ }
+
+ return head;
+}
+
+static void skb_gro_reset_offset(struct sk_buff *skb)
+{
+ const struct skb_shared_info *pinfo = skb_shinfo(skb);
+ const skb_frag_t *frag0 = &pinfo->frags[0];
+
+ NAPI_GRO_CB(skb)->data_offset = 0;
+ NAPI_GRO_CB(skb)->frag0 = NULL;
+ NAPI_GRO_CB(skb)->frag0_len = 0;
+
+ if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
+ pinfo->nr_frags &&
+ !PageHighMem(skb_frag_page(frag0)) &&
+ (!NET_IP_ALIGN || !(skb_frag_off(frag0) & 3))) {
+ NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
+ NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
+ skb_frag_size(frag0),
+ skb->end - skb->tail);
+ }
+}
+
+static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
+{
+ struct skb_shared_info *pinfo = skb_shinfo(skb);
+
+ BUG_ON(skb->end - skb->tail < grow);
+
+ memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
+
+ skb->data_len -= grow;
+ skb->tail += grow;
+
+ pinfo->frags[0].page_offset += grow;
+ skb_frag_size_sub(&pinfo->frags[0], grow);
+
+ if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
+ skb_frag_unref(skb, 0);
+ memmove(pinfo->frags, pinfo->frags + 1,
+ --pinfo->nr_frags * sizeof(pinfo->frags[0]));
+ }
+}
+
+static void gro_flush_oldest(struct list_head *head)
+{
+ struct sk_buff *oldest;
+
+ oldest = list_last_entry(head, struct sk_buff, list);
+
+ /* We are called with a chain of length >= MAX_GRO_SKBS, so the
+ * oldest entry cannot be NULL.
+ */
+ if (WARN_ON_ONCE(!oldest))
+ return;
+
+ /* Do not adjust napi->gro_hash[].count, caller is adding a new
+ * SKB to the chain.
+ */
+ list_del(&oldest->list);
+ oldest->next = NULL;
+ napi_gro_complete(oldest);
+}
+
+static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+{
+ u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
+ struct list_head *head = &offload_base;
+ struct packet_offload *ptype;
+ __be16 type = skb->protocol;
+ struct list_head *gro_head;
+ struct sk_buff *pp = NULL;
+ enum gro_result ret;
+ int same_flow;
+ int grow;
+
+ if (netif_elide_gro(skb->dev))
+ goto normal;
+
+ gro_head = gro_list_prepare(napi, skb);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_receive)
+ continue;
+
+ skb_set_network_header(skb, skb_gro_offset(skb));
+ skb_reset_mac_len(skb);
+ NAPI_GRO_CB(skb)->same_flow = 0;
+ NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
+ NAPI_GRO_CB(skb)->free = 0;
+ NAPI_GRO_CB(skb)->encap_mark = 0;
+ NAPI_GRO_CB(skb)->recursion_counter = 0;
+ NAPI_GRO_CB(skb)->is_fou = 0;
+ NAPI_GRO_CB(skb)->is_atomic = 1;
+ NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
+
+ /* Setup for GRO checksum validation */
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ NAPI_GRO_CB(skb)->csum = skb->csum;
+ NAPI_GRO_CB(skb)->csum_valid = 1;
+ NAPI_GRO_CB(skb)->csum_cnt = 0;
+ break;
+ case CHECKSUM_UNNECESSARY:
+ NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
+ NAPI_GRO_CB(skb)->csum_valid = 0;
+ break;
+ default:
+ NAPI_GRO_CB(skb)->csum_cnt = 0;
+ NAPI_GRO_CB(skb)->csum_valid = 0;
+ }
+
+ pp = ptype->callbacks.gro_receive(gro_head, skb);
+ break;
+ }
+ rcu_read_unlock();
+
+ if (&ptype->list == head)
+ goto normal;
+
+ if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
+ ret = GRO_CONSUMED;
+ goto ok;
+ }
+
+ same_flow = NAPI_GRO_CB(skb)->same_flow;
+ ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
+
+ if (pp) {
+ list_del(&pp->list);
+ pp->next = NULL;
+ napi_gro_complete(pp);
+ napi->gro_hash[hash].count--;
+ }
+
+ if (same_flow)
+ goto ok;
+
+ if (NAPI_GRO_CB(skb)->flush)
+ goto normal;
+
+ if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
+ gro_flush_oldest(gro_head);
+ } else {
+ napi->gro_hash[hash].count++;
+ }
+ NAPI_GRO_CB(skb)->count = 1;
+ NAPI_GRO_CB(skb)->age = jiffies;
+ NAPI_GRO_CB(skb)->last = skb;
+ skb_shinfo(skb)->gso_size = skb_gro_len(skb);
+ list_add(&skb->list, gro_head);
+ ret = GRO_HELD;
+
+pull:
+ grow = skb_gro_offset(skb) - skb_headlen(skb);
+ if (grow > 0)
+ gro_pull_from_frag0(skb, grow);
+ok:
+ if (napi->gro_hash[hash].count) {
+ if (!test_bit(hash, &napi->gro_bitmask))
+ __set_bit(hash, &napi->gro_bitmask);
+ } else if (test_bit(hash, &napi->gro_bitmask)) {
+ __clear_bit(hash, &napi->gro_bitmask);
+ }
+
+ return ret;
+
+normal:
+ ret = GRO_NORMAL;
+ goto pull;
+}
+
+struct packet_offload *gro_find_receive_by_type(__be16 type)
+{
+ struct list_head *offload_head = &offload_base;
+ struct packet_offload *ptype;
+
+ list_for_each_entry_rcu(ptype, offload_head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_receive)
+ continue;
+ return ptype;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(gro_find_receive_by_type);
+
+struct packet_offload *gro_find_complete_by_type(__be16 type)
+{
+ struct list_head *offload_head = &offload_base;
+ struct packet_offload *ptype;
+
+ list_for_each_entry_rcu(ptype, offload_head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_complete)
+ continue;
+ return ptype;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(gro_find_complete_by_type);
+
+static void napi_skb_free_stolen_head(struct sk_buff *skb)
+{
+ skb_dst_drop(skb);
+ secpath_reset(skb);
+ kmem_cache_free(skbuff_head_cache, skb);
+}
+
+static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
+{
+ switch (ret) {
+ case GRO_NORMAL:
+ if (netif_receive_skb_internal(skb))
+ ret = GRO_DROP;
+ break;
+
+ case GRO_DROP:
+ kfree_skb(skb);
+ break;
+
+ case GRO_MERGED_FREE:
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ napi_skb_free_stolen_head(skb);
+ else
+ __kfree_skb(skb);
+ break;
+
+ case GRO_HELD:
+ case GRO_MERGED:
+ case GRO_CONSUMED:
+ break;
+ }
+
+ return ret;
+}
+
+gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+{
+ skb_mark_napi_id(skb, napi);
+ trace_napi_gro_receive_entry(skb);
+
+ skb_gro_reset_offset(skb);
+
+ return napi_skb_finish(dev_gro_receive(napi, skb), skb);
+}
+EXPORT_SYMBOL(napi_gro_receive);
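
For context, here is a minimal sketch of how a NIC driver's receive path hands frames to GRO through napi_gro_receive(). The mydrv_/my_ names and the descriptor layout are hypothetical; only napi_alloc_skb(), skb_put_data(), eth_type_trans() and napi_gro_receive() are the real APIs exercised.

/* Hypothetical copy-break style RX handler feeding one frame into GRO. */
static int mydrv_poll_rx_one(struct mydrv_ring *ring, struct napi_struct *napi)
{
	struct my_rx_desc *desc = mydrv_next_completed_desc(ring); /* hypothetical */
	struct sk_buff *skb;

	if (!desc)
		return 0;

	skb = napi_alloc_skb(napi, desc->len);
	if (unlikely(!skb))
		return 0;

	skb_put_data(skb, desc->buf, desc->len);	/* copy payload into the skb */
	skb->protocol = eth_type_trans(skb, napi->dev);

	/* hand the frame to GRO; it may merge, hold or pass it up immediately */
	napi_gro_receive(napi, skb);
	return 1;
}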
+
+static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
+{
+ if (unlikely(skb->pfmemalloc)) {
+ consume_skb(skb);
+ return;
+ }
+ __skb_pull(skb, skb_headlen(skb));
+ /* restore the reserve we had after netdev_alloc_skb_ip_align() */
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
+ skb->vlan_tci = 0;
+ skb->dev = napi->dev;
+ skb->skb_iif = 0;
+
+ /* eth_type_trans() assumes pkt_type is PACKET_HOST */
+ skb->pkt_type = PACKET_HOST;
+
+ skb->encapsulation = 0;
+ skb_shinfo(skb)->gso_type = 0;
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+ secpath_reset(skb);
+
+ napi->skb = skb;
+}
+
+struct sk_buff *napi_get_frags(struct napi_struct *napi)
+{
+ struct sk_buff *skb = napi->skb;
+
+ if (!skb) {
+ skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
+ if (skb) {
+ napi->skb = skb;
+ skb_mark_napi_id(skb, napi);
+ }
+ }
+ return skb;
+}
+EXPORT_SYMBOL(napi_get_frags);
+
+static gro_result_t napi_frags_finish(struct napi_struct *napi,
+ struct sk_buff *skb,
+ gro_result_t ret)
+{
+ switch (ret) {
+ case GRO_NORMAL:
+ case GRO_HELD:
+ __skb_push(skb, ETH_HLEN);
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
+ ret = GRO_DROP;
+ break;
+
+ case GRO_DROP:
+ napi_reuse_skb(napi, skb);
+ break;
+
+ case GRO_MERGED_FREE:
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ napi_skb_free_stolen_head(skb);
+ else
+ napi_reuse_skb(napi, skb);
+ break;
+
+ case GRO_MERGED:
+ case GRO_CONSUMED:
+ break;
+ }
+
+ return ret;
+}
+
+/* The upper GRO stack assumes the network header starts at gro_offset=0.
+ * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
+ * we copy the ethernet header into skb->data to have a common layout.
+ */
+static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
+{
+ struct sk_buff *skb = napi->skb;
+ const struct ethhdr *eth;
+ unsigned int hlen = sizeof(*eth);
+
+ napi->skb = NULL;
+
+ skb_reset_mac_header(skb);
+ skb_gro_reset_offset(skb);
+
+ if (unlikely(skb_gro_header_hard(skb, hlen))) {
+ eth = skb_gro_header_slow(skb, hlen, 0);
+ if (unlikely(!eth)) {
+ net_warn_ratelimited("%s: dropping impossible skb from %s\n",
+ __func__, napi->dev->name);
+ napi_reuse_skb(napi, skb);
+ return NULL;
+ }
+ } else {
+ eth = (const struct ethhdr *)skb->data;
+ gro_pull_from_frag0(skb, hlen);
+ NAPI_GRO_CB(skb)->frag0 += hlen;
+ NAPI_GRO_CB(skb)->frag0_len -= hlen;
+ }
+ __skb_pull(skb, hlen);
+
+ /*
+ * This works because the only protocols we care about don't require
+ * special handling.
+ * We'll fix it up properly in napi_frags_finish()
+ */
+ skb->protocol = eth->h_proto;
+
+ return skb;
+}
+
+gro_result_t napi_gro_frags(struct napi_struct *napi)
+{
+ struct sk_buff *skb = napi_frags_skb(napi);
+
+ if (!skb)
+ return GRO_DROP;
+
+ trace_napi_gro_frags_entry(skb);
+
+ return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
+}
+EXPORT_SYMBOL(napi_gro_frags);
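
As a rough illustration of the frag-based path, a page-flipping driver attaches its pages to the skb returned by napi_get_frags() and then calls napi_gro_frags(), which pulls the ethernet header out of frag0 itself (see napi_frags_skb() above). The mydrv_rx_frags() helper and its parameters are hypothetical.

/* Sketch: hand a page-based RX buffer to GRO without copying the payload. */
static void mydrv_rx_frags(struct napi_struct *napi, struct page *page,
			   unsigned int offset, unsigned int frag_len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (unlikely(!skb))
		return;		/* allocation failure; drop or recycle the page */

	skb_fill_page_desc(skb, 0, page, offset, frag_len);
	skb->len += frag_len;
	skb->data_len += frag_len;
	skb->truesize += PAGE_SIZE;

	/* no eth_type_trans() here: napi_frags_finish() fixes up the protocol */
	napi_gro_frags(napi);
}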
+
+/* Compute the checksum from gro_offset and return the folded value
+ * after adding in any pseudo checksum.
+ */
+__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
+{
+ __wsum wsum;
+ __sum16 sum;
+
+ wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
+
+ /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
+ sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev);
+ }
+
+ NAPI_GRO_CB(skb)->csum = wsum;
+ NAPI_GRO_CB(skb)->csum_valid = 1;
+
+ return sum;
+}
+EXPORT_SYMBOL(__skb_gro_checksum_complete);
+
+static void net_rps_send_ipi(struct softnet_data *remsd)
+{
+#ifdef CONFIG_RPS
+ while (remsd) {
+ struct softnet_data *next = remsd->rps_ipi_next;
+
+ if (cpu_online(remsd->cpu))
+ smp_call_function_single_async(remsd->cpu, &remsd->csd);
+ remsd = next;
+ }
+#endif
+}
+
+/*
+ * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
+ * Note: called with local irq disabled, but exits with local irq enabled.
+ */
+static void net_rps_action_and_irq_enable(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ struct softnet_data *remsd = sd->rps_ipi_list;
+
+ if (remsd) {
+ sd->rps_ipi_list = NULL;
+
+ local_irq_enable();
+
+ /* Send pending IPIs to kick RPS processing on remote cpus. */
+ net_rps_send_ipi(remsd);
+ } else
+#endif
+ local_irq_enable();
+}
+
+static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ return sd->rps_ipi_list != NULL;
+#else
+ return false;
+#endif
+}
+
+static int process_backlog(struct napi_struct *napi, int quota)
+{
+ struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
+ bool again = true;
+ int work = 0;
+
+ /* Check if we have pending IPIs; it's better to send them now
+ * than to wait for net_rx_action() to end.
+ */
+ if (sd_has_rps_ipi_waiting(sd)) {
+ local_irq_disable();
+ net_rps_action_and_irq_enable(sd);
+ }
+
+ napi->weight = dev_rx_weight;
+ while (again) {
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue(&sd->process_queue))) {
+ rcu_read_lock();
+ __netif_receive_skb(skb);
+ rcu_read_unlock();
+ input_queue_head_incr(sd);
+ if (++work >= quota)
+ return work;
+
+ }
+
+ local_irq_disable();
+ rps_lock(sd);
+ if (skb_queue_empty(&sd->input_pkt_queue)) {
+ /*
+ * Inline a custom version of __napi_complete().
+ * Only the current cpu owns and manipulates this napi,
+ * and NAPI_STATE_SCHED is the only possible flag set
+ * on the backlog.
+ * We can use a plain write instead of clear_bit(),
+ * and we don't need an smp_mb() memory barrier.
+ */
+ napi->state = 0;
+ again = false;
+ } else {
+ skb_queue_splice_tail_init(&sd->input_pkt_queue,
+ &sd->process_queue);
+ }
+ rps_unlock(sd);
+ local_irq_enable();
+ }
+
+ return work;
+}
+
+/**
+ * __napi_schedule - schedule for receive
+ * @n: entry to schedule
+ *
+ * The entry's receive function will be scheduled to run.
+ * Consider using __napi_schedule_irqoff() if hard irqs are masked.
+ */
+void __napi_schedule(struct napi_struct *n)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ____napi_schedule(this_cpu_ptr(&softnet_data), n);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(__napi_schedule);
+
+/**
+ * napi_schedule_prep - check if napi can be scheduled
+ * @n: napi context
+ *
+ * Test if the NAPI routine is already running, and if not mark
+ * it as running. This is used as a condition variable to
+ * ensure only one NAPI poll instance runs. We also make
+ * sure there is no pending NAPI disable.
+ */
+bool napi_schedule_prep(struct napi_struct *n)
+{
+ unsigned long val, new;
+
+ do {
+ val = READ_ONCE(n->state);
+ if (unlikely(val & NAPIF_STATE_DISABLE))
+ return false;
+ new = val | NAPIF_STATE_SCHED;
+
+ /* Sets the STATE_MISSED bit if STATE_SCHED was already set.
+ * This was suggested by Alexander Duyck, as the compiler
+ * emits better code than:
+ * if (val & NAPIF_STATE_SCHED)
+ * new |= NAPIF_STATE_MISSED;
+ */
+ new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
+ NAPIF_STATE_MISSED;
+ } while (cmpxchg(&n->state, val, new) != val);
+
+ return !(val & NAPIF_STATE_SCHED);
+}
+EXPORT_SYMBOL(napi_schedule_prep);
+
+/**
+ * __napi_schedule_irqoff - schedule for receive
+ * @n: entry to schedule
+ *
+ * Variant of __napi_schedule() assuming hard irqs are masked.
+ *
+ * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
+ * because the interrupt disabled assumption might not be true
+ * due to force-threaded interrupts and spinlock substitution.
+ */
+void __napi_schedule_irqoff(struct napi_struct *n)
+{
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ ____napi_schedule(this_cpu_ptr(&softnet_data), n);
+ else
+ __napi_schedule(n);
+}
+EXPORT_SYMBOL(__napi_schedule_irqoff);
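
To show how the two halves fit together, here is a sketch of a driver hard interrupt handler deferring work to NAPI with napi_schedule_prep() and __napi_schedule_irqoff(). The mydrv_ structures and register accessors are made up; the NAPI calls are the functions defined above.

/* Hypothetical MSI-X RX interrupt: mask further RX interrupts on the device
 * and let the NAPI poll loop do the real work.
 */
static irqreturn_t mydrv_rx_irq(int irq, void *data)
{
	struct mydrv_queue *q = data;		/* hypothetical per-queue state */

	if (napi_schedule_prep(&q->napi)) {
		mydrv_disable_rx_irq(q);	/* hypothetical register write */
		/* hard irqs are masked in an irq handler, so the irqoff variant is fine */
		__napi_schedule_irqoff(&q->napi);
	}
	return IRQ_HANDLED;
}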
+
+bool napi_complete_done(struct napi_struct *n, int work_done)
+{
+ unsigned long flags, val, new;
+
+ /*
+ * 1) Don't let napi dequeue from the cpu poll list
+ *    just in case it's running on a different cpu.
+ * 2) If we are busy polling, do nothing here; we have
+ *    the guarantee we will be called later.
+ */
+ if (unlikely(n->state & (NAPIF_STATE_NPSVC |
+ NAPIF_STATE_IN_BUSY_POLL)))
+ return false;
+
+ if (n->gro_bitmask) {
+ unsigned long timeout = 0;
+
+ if (work_done)
+ timeout = n->dev->gro_flush_timeout;
+
+ /* When the NAPI instance uses a timeout and keeps postponing
+ * it, we need to somehow bound the time packets are kept in
+ * the GRO layer.
+ */
+ napi_gro_flush(n, !!timeout);
+ if (timeout)
+ hrtimer_start(&n->timer, ns_to_ktime(timeout),
+ HRTIMER_MODE_REL_PINNED);
+ }
+ if (unlikely(!list_empty(&n->poll_list))) {
+ /* If n->poll_list is not empty, we need to mask irqs */
+ local_irq_save(flags);
+ list_del_init(&n->poll_list);
+ local_irq_restore(flags);
+ }
+
+ do {
+ val = READ_ONCE(n->state);
+
+ WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
+
+ new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
+
+ /* If STATE_MISSED was set, leave STATE_SCHED set,
+ * because we will call napi->poll() one more time.
+ * This C code was suggested by Alexander Duyck to help gcc.
+ */
+ new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
+ NAPIF_STATE_SCHED;
+ } while (cmpxchg(&n->state, val, new) != val);
+
+ if (unlikely(val & NAPIF_STATE_MISSED)) {
+ __napi_schedule(n);
+ return false;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL(napi_complete_done);
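
The canonical shape of a poll callback that cooperates with napi_complete_done() looks roughly like the sketch below (mydrv_ names hypothetical): process at most @budget packets, and only when the ring is drained call napi_complete_done() and re-enable device interrupts. If it returns false (for instance because NAPI_STATE_MISSED was set), the instance stays scheduled and interrupts must stay off.

static int mydrv_poll(struct napi_struct *napi, int budget)
{
	struct mydrv_queue *q = container_of(napi, struct mydrv_queue, napi);
	int work_done = mydrv_clean_rx_ring(q, budget);	/* hypothetical */

	if (work_done < budget &&
	    napi_complete_done(napi, work_done))
		mydrv_enable_rx_irq(q);			/* hypothetical */

	return work_done;
}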
+
+/* must be called under rcu_read_lock(), as we don't take a reference */
+static struct napi_struct *napi_by_id(unsigned int napi_id)
+{
+ unsigned int hash = napi_id % HASH_SIZE(napi_hash);
+ struct napi_struct *napi;
+
+ hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
+ if (napi->napi_id == napi_id)
+ return napi;
+
+ return NULL;
+}
+
+#if defined(CONFIG_NET_RX_BUSY_POLL)
+
+#define BUSY_POLL_BUDGET 8
+
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
+{
+ int rc;
+
+ /* Busy polling means there is a high chance device driver hard irq
+ * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
+ * set in napi_schedule_prep().
+ * Since we are about to call napi->poll() once more, we can safely
+ * clear NAPI_STATE_MISSED.
+ *
+ * Note: x86 could use a single "lock and ..." instruction
+ * to perform these two clear_bit() calls.
+ */
+ clear_bit(NAPI_STATE_MISSED, &napi->state);
+ clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
+
+ local_bh_disable();
+
+ /* All we really want here is to re-enable device interrupts.
+ * Ideally, a new ndo_busy_poll_stop() could avoid another round.
+ */
+ rc = napi->poll(napi, BUSY_POLL_BUDGET);
+ trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
+ netpoll_poll_unlock(have_poll_lock);
+ if (rc == BUSY_POLL_BUDGET)
+ __napi_schedule(napi);
+ local_bh_enable();
+}
+
+void napi_busy_loop(unsigned int napi_id,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg)
+{
+ unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
+ int (*napi_poll)(struct napi_struct *napi, int budget);
+ void *have_poll_lock = NULL;
+ struct napi_struct *napi;
+
+restart:
+ napi_poll = NULL;
+
+ rcu_read_lock();
+
+ napi = napi_by_id(napi_id);
+ if (!napi)
+ goto out;
+
+ preempt_disable();
+ for (;;) {
+ int work = 0;
+
+ local_bh_disable();
+ if (!napi_poll) {
+ unsigned long val = READ_ONCE(napi->state);
+
+ /* If multiple threads are competing for this napi,
+ * we avoid dirtying napi->state as much as we can.
+ */
+ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
+ NAPIF_STATE_IN_BUSY_POLL))
+ goto count;
+ if (cmpxchg(&napi->state, val,
+ val | NAPIF_STATE_IN_BUSY_POLL |
+ NAPIF_STATE_SCHED) != val)
+ goto count;
+ have_poll_lock = netpoll_poll_lock(napi);
+ napi_poll = napi->poll;
+ }
+ work = napi_poll(napi, BUSY_POLL_BUDGET);
+ trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
+count:
+ if (work > 0)
+ __NET_ADD_STATS(dev_net(napi->dev),
+ LINUX_MIB_BUSYPOLLRXPACKETS, work);
+ local_bh_enable();
+
+ if (!loop_end || loop_end(loop_end_arg, start_time))
+ break;
+
+ if (unlikely(need_resched())) {
+ if (napi_poll)
+ busy_poll_stop(napi, have_poll_lock);
+ preempt_enable();
+ rcu_read_unlock();
+ cond_resched();
+ if (loop_end(loop_end_arg, start_time))
+ return;
+ goto restart;
+ }
+ cpu_relax();
+ }
+ if (napi_poll)
+ busy_poll_stop(napi, have_poll_lock);
+ preempt_enable();
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(napi_busy_loop);
+
+#endif /* CONFIG_NET_RX_BUSY_POLL */
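
A loop_end callback for napi_busy_loop() conventionally returns true when the caller should stop spinning. A minimal sketch, assuming a hypothetical struct my_waiter holding the caller's receive queue, and using busy_loop_timeout() from include/net/busy_poll.h for the sysctl-controlled time bound:

struct my_waiter {
	struct sk_buff_head rxq;	/* hypothetical per-caller queue */
};

static bool my_busy_loop_end(void *arg, unsigned long start_time)
{
	struct my_waiter *w = arg;

	/* stop once we have data to process or the busy-poll window expired */
	return !skb_queue_empty(&w->rxq) || busy_loop_timeout(start_time);
}

/* usage: napi_busy_loop(napi_id, my_busy_loop_end, &waiter); */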
+
+static void napi_hash_add(struct napi_struct *napi)
+{
+ if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
+ test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
+ return;
+
+ spin_lock(&napi_hash_lock);
+
+ /* 0..NR_CPUS range is reserved for sender_cpu use */
+ do {
+ if (unlikely(++napi_gen_id < MIN_NAPI_ID))
+ napi_gen_id = MIN_NAPI_ID;
+ } while (napi_by_id(napi_gen_id));
+ napi->napi_id = napi_gen_id;
+
+ hlist_add_head_rcu(&napi->napi_hash_node,
+ &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+
+ spin_unlock(&napi_hash_lock);
+}
+
+/* Warning: the caller is responsible for making sure an RCU grace period
+ * elapses before freeing the memory containing @napi.
+ */
+bool napi_hash_del(struct napi_struct *napi)
+{
+ bool rcu_sync_needed = false;
+
+ spin_lock(&napi_hash_lock);
+
+ if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
+ rcu_sync_needed = true;
+ hlist_del_rcu(&napi->napi_hash_node);
+ }
+ spin_unlock(&napi_hash_lock);
+ return rcu_sync_needed;
+}
+EXPORT_SYMBOL_GPL(napi_hash_del);
+
+static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
+{
+ struct napi_struct *napi;
+
+ napi = container_of(timer, struct napi_struct, timer);
+
+ /* Note: we use a relaxed variant of napi_schedule_prep(), not setting
+ * NAPI_STATE_MISSED, since we do not react to a device IRQ.
+ */
+ if (napi->gro_bitmask && !napi_disable_pending(napi) &&
+ !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
+ __napi_schedule_irqoff(napi);
+
+ return HRTIMER_NORESTART;
+}
+
+static void init_gro_hash(struct napi_struct *napi)
+{
+ int i;
+
+ for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+ INIT_LIST_HEAD(&napi->gro_hash[i].list);
+ napi->gro_hash[i].count = 0;
+ }
+ napi->gro_bitmask = 0;
+}
+
+void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int), int weight)
+{
+ INIT_LIST_HEAD(&napi->poll_list);
+ hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ napi->timer.function = napi_watchdog;
+ init_gro_hash(napi);
+ napi->skb = NULL;
+ napi->poll = poll;
+ if (weight > NAPI_POLL_WEIGHT)
+ pr_err_once("netif_napi_add() called with weight %d on device %s\n",
+ weight, dev->name);
+ napi->weight = weight;
+ napi->dev = dev;
+#ifdef CONFIG_NETPOLL
+ napi->poll_owner = -1;
+#endif
+ set_bit(NAPI_STATE_SCHED, &napi->state);
+ set_bit(NAPI_STATE_NPSVC, &napi->state);
+ list_add_rcu(&napi->dev_list, &dev->napi_list);
+ napi_hash_add(napi);
+}
+EXPORT_SYMBOL(netif_napi_add);
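
From a driver's point of view the NAPI lifecycle is roughly as sketched below (mydrv_ names hypothetical). netif_napi_add() leaves the instance with SCHED and NPSVC set, so the driver has to call napi_enable() before an interrupt can schedule it, and napi_disable()/netif_napi_del() on teardown.

static int mydrv_setup_queue(struct net_device *netdev, struct mydrv_queue *q)
{
	netif_napi_add(netdev, &q->napi, mydrv_poll, NAPI_POLL_WEIGHT);
	napi_enable(&q->napi);
	return 0;
}

static void mydrv_teardown_queue(struct mydrv_queue *q)
{
	napi_disable(&q->napi);
	netif_napi_del(&q->napi);
}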
+
+void napi_disable(struct napi_struct *n)
+{
+ might_sleep();
+ set_bit(NAPI_STATE_DISABLE, &n->state);
+
+ while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
+ msleep(1);
+ while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
+ msleep(1);
+
+ hrtimer_cancel(&n->timer);
+
+ clear_bit(NAPI_STATE_DISABLE, &n->state);
+}
+EXPORT_SYMBOL(napi_disable);
+
+static void flush_gro_hash(struct napi_struct *napi)
+{
+ int i;
+
+ for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+ struct sk_buff *skb, *n;
+
+ list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
+ kfree_skb(skb);
+ napi->gro_hash[i].count = 0;
+ }
+}
+
+/* Must be called in process context */
+void netif_napi_del(struct napi_struct *napi)
+{
+ might_sleep();
+ if (napi_hash_del(napi))
+ synchronize_net();
+ list_del_init(&napi->dev_list);
+ napi_free_frags(napi);
+
+ flush_gro_hash(napi);
+ napi->gro_bitmask = 0;
+}
+EXPORT_SYMBOL(netif_napi_del);
+
+static int napi_poll(struct napi_struct *n, struct list_head *repoll)
+{
+ void *have;
+ int work, weight;
+
+ list_del_init(&n->poll_list);
+
+ have = netpoll_poll_lock(n);
+
+ weight = n->weight;
+
+ /* This NAPI_STATE_SCHED test is for avoiding a race
+ * with netpoll's poll_napi(). Only the entity which
+ * obtains the lock and sees NAPI_STATE_SCHED set will
+ * actually make the ->poll() call. Therefore we avoid
+ * accidentally calling ->poll() when NAPI is not scheduled.
+ */
+ work = 0;
+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+ work = n->poll(n, weight);
+ trace_napi_poll(n, work, weight);
+ }
+
+ WARN_ON_ONCE(work > weight);
+
+ if (likely(work < weight))
+ goto out_unlock;
+
+ /* Drivers must not modify the NAPI state if they
+ * consume the entire weight. In such cases this code
+ * still "owns" the NAPI instance and therefore can
+ * move the instance around on the list at-will.
+ */
+ if (unlikely(napi_disable_pending(n))) {
+ napi_complete(n);
+ goto out_unlock;
+ }
+
+ if (n->gro_bitmask) {
+ /* flush too old packets
+ * If HZ < 1000, flush all packets.
+ */
+ napi_gro_flush(n, HZ >= 1000);
+ }
+
+ /* Some drivers may have called napi_schedule
+ * prior to exhausting their budget.
+ */
+ if (unlikely(!list_empty(&n->poll_list))) {
+ pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
+ n->dev ? n->dev->name : "backlog");
+ goto out_unlock;
+ }
+
+ list_add_tail(&n->poll_list, repoll);
+
+out_unlock:
+ netpoll_poll_unlock(have);
+
+ return work;
+}
+
+static __latent_entropy void net_rx_action(struct softirq_action *h)
+{
+ struct softnet_data *sd = this_cpu_ptr(&softnet_data);
+ unsigned long time_limit = jiffies +
+ usecs_to_jiffies(netdev_budget_usecs);
+ int budget = netdev_budget;
+ LIST_HEAD(list);
+ LIST_HEAD(repoll);
+
+ local_irq_disable();
+ list_splice_init(&sd->poll_list, &list);
+ local_irq_enable();
+
+ for (;;) {
+ struct napi_struct *n;
+
+ if (list_empty(&list)) {
+ if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
+ goto out;
+ break;
+ }
+
+ n = list_first_entry(&list, struct napi_struct, poll_list);
+ budget -= napi_poll(n, &repoll);
+
+ /* If softirq window is exhausted then punt.
+ * Allow this to run for 2 jiffies, since that allows
+ * an average latency of 1.5/HZ.
+ */
+ if (unlikely(budget <= 0 ||
+ time_after_eq(jiffies, time_limit))) {
+ sd->time_squeeze++;
+ break;
+ }
+ }
+
+ local_irq_disable();
+
+ list_splice_tail_init(&sd->poll_list, &list);
+ list_splice_tail(&repoll, &list);
+ list_splice(&list, &sd->poll_list);
+ if (!list_empty(&sd->poll_list))
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+
+ net_rps_action_and_irq_enable(sd);
+out:
+ __kfree_skb_flush();
+}
+
+struct netdev_adjacent {
+ struct net_device *dev;
+
+ /* upper master flag, there can only be one master device per list */
+ bool master;
+
+ /* counter for the number of times this device was added to us */
+ u16 ref_nr;
+
+ /* private field for the users */
+ void *private;
+
+ struct list_head list;
+ struct rcu_head rcu;
+};
+
+static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
+ struct list_head *adj_list)
+{
+ struct netdev_adjacent *adj;
+
+ list_for_each_entry(adj, adj_list, list) {
+ if (adj->dev == adj_dev)
+ return adj;
+ }
+ return NULL;
+}
+
+static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
+{
+ struct net_device *dev = data;
+
+ return upper_dev == dev;
+}
+
+/**
+ * netdev_has_upper_dev - Check if device is linked to an upper device
+ * @dev: device
+ * @upper_dev: upper device to check
+ *
+ * Find out if a device is linked to the specified upper device and return true
+ * in case it is. Note that this walks the entire chain of upper devices,
+ * not only the immediate upper device. The caller must hold the RTNL lock.
+ */
+bool netdev_has_upper_dev(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ ASSERT_RTNL();
+
+ return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
+ upper_dev);
+}
+EXPORT_SYMBOL(netdev_has_upper_dev);
+
+/**
+ * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
+ * @dev: device
+ * @upper_dev: upper device to check
+ *
+ * Find out if a device is linked to the specified upper device and return true
+ * in case it is. Note that this checks the entire upper device chain.
+ * The caller must hold the RCU read lock.
+ */
+
+bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
+ upper_dev);
+}
+EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
+
+/**
+ * netdev_has_any_upper_dev - Check if device is linked to some device
+ * @dev: device
+ *
+ * Find out if a device is linked to an upper device and return true in case
+ * it is. The caller must hold the RTNL lock.
+ */
+bool netdev_has_any_upper_dev(struct net_device *dev)
+{
+ ASSERT_RTNL();
+
+ return !list_empty(&dev->adj_list.upper);
+}
+EXPORT_SYMBOL(netdev_has_any_upper_dev);
+
+/**
+ * netdev_master_upper_dev_get - Get master upper device
+ * @dev: device
+ *
+ * Find a master upper device and return pointer to it or NULL in case
+ * it's not there. The caller must hold the RTNL lock.
+ */
+struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
+{
+ struct netdev_adjacent *upper;
+
+ ASSERT_RTNL();
+
+ if (list_empty(&dev->adj_list.upper))
+ return NULL;
+
+ upper = list_first_entry(&dev->adj_list.upper,
+ struct netdev_adjacent, list);
+ if (likely(upper->master))
+ return upper->dev;
+ return NULL;
+}
+EXPORT_SYMBOL(netdev_master_upper_dev_get);
+
+/**
+ * netdev_has_any_lower_dev - Check if device is linked to some device
+ * @dev: device
+ *
+ * Find out if a device is linked to a lower device and return true in case
+ * it is. The caller must hold the RTNL lock.
+ */
+static bool netdev_has_any_lower_dev(struct net_device *dev)
+{
+ ASSERT_RTNL();
+
+ return !list_empty(&dev->adj_list.lower);
+}
+
+void *netdev_adjacent_get_private(struct list_head *adj_list)
+{
+ struct netdev_adjacent *adj;
+
+ adj = list_entry(adj_list, struct netdev_adjacent, list);
+
+ return adj->private;
+}
+EXPORT_SYMBOL(netdev_adjacent_get_private);
+
+/**
+ * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next device from the dev's upper list, starting from iter
+ * position. The caller must hold RCU read lock.
+ */
+struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *upper;
+
+ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
+
+ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+ if (&upper->list == &dev->adj_list.upper)
+ return NULL;
+
+ *iter = &upper->list;
+
+ return upper->dev;
+}
+EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
+
+static struct net_device *netdev_next_upper_dev(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *upper;
+
+ upper = list_entry((*iter)->next, struct netdev_adjacent, list);
+
+ if (&upper->list == &dev->adj_list.upper)
+ return NULL;
+
+ *iter = &upper->list;
+
+ return upper->dev;
+}
+
+static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *upper;
+
+ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
+
+ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+ if (&upper->list == &dev->adj_list.upper)
+ return NULL;
+
+ *iter = &upper->list;
+
+ return upper->dev;
+}
+
+static int netdev_walk_all_upper_dev(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
+
+ now = dev;
+ iter = &dev->adj_list.upper;
+
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, data);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ udev = netdev_next_upper_dev(now, &iter);
+ if (!udev)
+ break;
+
+ next = udev;
+ niter = &udev->adj_list.upper;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
+ }
+
+ return 0;
+}
+
+int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
+
+ now = dev;
+ iter = &dev->adj_list.upper;
+
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, data);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ udev = netdev_next_upper_dev_rcu(now, &iter);
+ if (!udev)
+ break;
+
+ next = udev;
+ niter = &udev->adj_list.upper;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
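
A small sketch of a walker user: count every device stacked above @dev (vlan, bridge, bond, ...). In this kernel version the callback receives the raw void *data pointer; returning 0 keeps walking, while a non-zero return stops the walk and is propagated to the caller.

static int count_one_upper(struct net_device *upper, void *data)
{
	unsigned int *count = data;

	(*count)++;
	return 0;
}

static unsigned int count_all_uppers(struct net_device *dev)
{
	unsigned int count = 0;

	rcu_read_lock();
	netdev_walk_all_upper_dev_rcu(dev, count_one_upper, &count);
	rcu_read_unlock();

	return count;
}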
+
+/**
+ * netdev_lower_get_next_private - Get the next ->private from the
+ * lower neighbour list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent->private from the dev's lower neighbour
+ * list, starting from iter position. The caller must either hold the
+ * RTNL lock or its own locking that guarantees that the neighbour lower
+ * list will remain unchanged.
+ */
+void *netdev_lower_get_next_private(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_entry(*iter, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = lower->list.next;
+
+ return lower->private;
+}
+EXPORT_SYMBOL(netdev_lower_get_next_private);
+
+/**
+ * netdev_lower_get_next_private_rcu - Get the next ->private from the
+ * lower neighbour list, RCU
+ * variant
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent->private from the dev's lower neighbour
+ * list, starting from iter position. The caller must hold RCU read lock.
+ */
+void *netdev_lower_get_next_private_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = &lower->list;
+
+ return lower->private;
+}
+EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
+
+/**
+ * netdev_lower_get_next - Get the next device from the lower neighbour
+ * list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent from the dev's lower neighbour
+ * list, starting from iter position. The caller must hold the RTNL lock or
+ * its own locking that guarantees that the neighbour lower
+ * list will remain unchanged.
+ */
+void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_entry(*iter, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = lower->list.next;
+
+ return lower->dev;
+}
+EXPORT_SYMBOL(netdev_lower_get_next);
+
+static struct net_device *netdev_next_lower_dev(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_entry((*iter)->next, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = &lower->list;
+
+ return lower->dev;
+}
+
+int netdev_walk_all_lower_dev(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
+
+ now = dev;
+ iter = &dev->adj_list.lower;
+
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, data);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ ldev = netdev_next_lower_dev(now, &iter);
+ if (!ldev)
+ break;
+
+ next = ldev;
+ niter = &ldev->adj_list.lower;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
+
+static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = &lower->list;
+
+ return lower->dev;
+}
+
+static u8 __netdev_upper_depth(struct net_device *dev)
+{
+ struct net_device *udev;
+ struct list_head *iter;
+ u8 max_depth = 0;
+
+ for (iter = &dev->adj_list.upper,
+ udev = netdev_next_upper_dev(dev, &iter);
+ udev;
+ udev = netdev_next_upper_dev(dev, &iter)) {
+ if (max_depth < udev->upper_level)
+ max_depth = udev->upper_level;
+ }
+
+ return max_depth;
+}
+
+static u8 __netdev_lower_depth(struct net_device *dev)
+{
+ struct net_device *ldev;
+ struct list_head *iter;
+ u8 max_depth = 0;
+
+ for (iter = &dev->adj_list.lower,
+ ldev = netdev_next_lower_dev(dev, &iter);
+ ldev;
+ ldev = netdev_next_lower_dev(dev, &iter)) {
+ if (max_depth < ldev->lower_level)
+ max_depth = ldev->lower_level;
+ }
+
+ return max_depth;
+}
+
+static int __netdev_update_upper_level(struct net_device *dev, void *data)
+{
+ dev->upper_level = __netdev_upper_depth(dev) + 1;
+ return 0;
+}
+
+static int __netdev_update_lower_level(struct net_device *dev, void *data)
+{
+ dev->lower_level = __netdev_lower_depth(dev) + 1;
+ return 0;
+}
+
+int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
+
+ now = dev;
+ iter = &dev->adj_list.lower;
+
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, data);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ ldev = netdev_next_lower_dev_rcu(now, &iter);
+ if (!ldev)
+ break;
+
+ next = ldev;
+ niter = &ldev->adj_list.lower;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
+
+/**
+ * netdev_lower_get_first_private_rcu - Get the first ->private from the
+ * lower neighbour list, RCU
+ * variant
+ * @dev: device
+ *
+ * Gets the first netdev_adjacent->private from the dev's lower neighbour
+ * list. The caller must hold RCU read lock.
+ */
+void *netdev_lower_get_first_private_rcu(struct net_device *dev)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_first_or_null_rcu(&dev->adj_list.lower,
+ struct netdev_adjacent, list);
+ if (lower)
+ return lower->private;
+ return NULL;
+}
+EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
+
+/**
+ * netdev_master_upper_dev_get_rcu - Get master upper device
+ * @dev: device
+ *
+ * Find a master upper device and return pointer to it or NULL in case
+ * it's not there. The caller must hold the RCU read lock.
+ */
+struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
+{
+ struct netdev_adjacent *upper;
+
+ upper = list_first_or_null_rcu(&dev->adj_list.upper,
+ struct netdev_adjacent, list);
+ if (upper && likely(upper->master))
+ return upper->dev;
+ return NULL;
+}
+EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
+
+static int netdev_adjacent_sysfs_add(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list)
+{
+ char linkname[IFNAMSIZ+7];
+
+ sprintf(linkname, dev_list == &dev->adj_list.upper ?
+ "upper_%s" : "lower_%s", adj_dev->name);
+ return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
+ linkname);
+}
+static void netdev_adjacent_sysfs_del(struct net_device *dev,
+ char *name,
+ struct list_head *dev_list)
+{
+ char linkname[IFNAMSIZ+7];
+
+ sprintf(linkname, dev_list == &dev->adj_list.upper ?
+ "upper_%s" : "lower_%s", name);
+ sysfs_remove_link(&(dev->dev.kobj), linkname);
+}
+
+static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list)
+{
+ return (dev_list == &dev->adj_list.upper ||
+ dev_list == &dev->adj_list.lower) &&
+ net_eq(dev_net(dev), dev_net(adj_dev));
+}
+
+static int __netdev_adjacent_dev_insert(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list,
+ void *private, bool master)
+{
+ struct netdev_adjacent *adj;
+ int ret;
+
+ adj = __netdev_find_adj(adj_dev, dev_list);
+
+ if (adj) {
+ adj->ref_nr += 1;
+ pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
+ dev->name, adj_dev->name, adj->ref_nr);
+
+ return 0;
+ }
+
+ adj = kmalloc(sizeof(*adj), GFP_KERNEL);
+ if (!adj)
+ return -ENOMEM;
+
+ adj->dev = adj_dev;
+ adj->master = master;
+ adj->ref_nr = 1;
+ adj->private = private;
+ dev_hold(adj_dev);
+
+ pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
+ dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
+
+ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
+ ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
+ if (ret)
+ goto free_adj;
+ }
+
+ /* Ensure that master link is always the first item in list. */
+ if (master) {
+ ret = sysfs_create_link(&(dev->dev.kobj),
+ &(adj_dev->dev.kobj), "master");
+ if (ret)
+ goto remove_symlinks;
+
+ list_add_rcu(&adj->list, dev_list);
+ } else {
+ list_add_tail_rcu(&adj->list, dev_list);
+ }
+
+ return 0;
+
+remove_symlinks:
+ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
+ netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
+free_adj:
+ kfree(adj);
+ dev_put(adj_dev);
+
+ return ret;
+}
+
+static void __netdev_adjacent_dev_remove(struct net_device *dev,
+ struct net_device *adj_dev,
+ u16 ref_nr,
+ struct list_head *dev_list)
+{
+ struct netdev_adjacent *adj;
+
+ pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
+ dev->name, adj_dev->name, ref_nr);
+
+ adj = __netdev_find_adj(adj_dev, dev_list);
+
+ if (!adj) {
+ pr_err("Adjacency does not exist for device %s from %s\n",
+ dev->name, adj_dev->name);
+ WARN_ON(1);
+ return;
+ }
+
+ if (adj->ref_nr > ref_nr) {
+ pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
+ dev->name, adj_dev->name, ref_nr,
+ adj->ref_nr - ref_nr);
+ adj->ref_nr -= ref_nr;
+ return;
+ }
+
+ if (adj->master)
+ sysfs_remove_link(&(dev->dev.kobj), "master");
+
+ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
+ netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
+
+ list_del_rcu(&adj->list);
+ pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
+ adj_dev->name, dev->name, adj_dev->name);
+ dev_put(adj_dev);
+ kfree_rcu(adj, rcu);
+}
+
+static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
+ struct net_device *upper_dev,
+ struct list_head *up_list,
+ struct list_head *down_list,
+ void *private, bool master)
+{
+ int ret;
+
+ ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
+ private, master);
+ if (ret)
+ return ret;
+
+ ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
+ private, false);
+ if (ret) {
+ __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
+ struct net_device *upper_dev,
+ u16 ref_nr,
+ struct list_head *up_list,
+ struct list_head *down_list)
+{
+ __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
+ __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
+}
+
+static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
+ struct net_device *upper_dev,
+ void *private, bool master)
+{
+ return __netdev_adjacent_dev_link_lists(dev, upper_dev,
+ &dev->adj_list.upper,
+ &upper_dev->adj_list.lower,
+ private, master);
+}
+
+static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
+ &dev->adj_list.upper,
+ &upper_dev->adj_list.lower);
+}
+
+static int __netdev_upper_dev_link(struct net_device *dev,
+ struct net_device *upper_dev, bool master,
+ void *upper_priv, void *upper_info,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_notifier_changeupper_info changeupper_info = {
+ .info = {
+ .dev = dev,
+ .extack = extack,
+ },
+ .upper_dev = upper_dev,
+ .master = master,
+ .linking = true,
+ .upper_info = upper_info,
+ };
+ struct net_device *master_dev;
+ int ret = 0;
+
+ ASSERT_RTNL();
+
+ if (dev == upper_dev)
+ return -EBUSY;
+
+ /* To prevent loops, check that dev is not an upper device of upper_dev. */
+ if (netdev_has_upper_dev(upper_dev, dev))
+ return -EBUSY;
+
+ if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
+ return -EMLINK;
+
+ if (!master) {
+ if (netdev_has_upper_dev(dev, upper_dev))
+ return -EEXIST;
+ } else {
+ master_dev = netdev_master_upper_dev_get(dev);
+ if (master_dev)
+ return master_dev == upper_dev ? -EEXIST : -EBUSY;
+ }
+
+ ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
+ &changeupper_info.info);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ return ret;
+
+ ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
+ master);
+ if (ret)
+ return ret;
+
+ ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
+ &changeupper_info.info);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto rollback;
+
+ __netdev_update_upper_level(dev, NULL);
+ netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
+
+ __netdev_update_lower_level(upper_dev, NULL);
+ netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, NULL);
+
+ return 0;
+
+rollback:
+ __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
+
+ return ret;
+}
+
+/**
+ * netdev_upper_dev_link - Add a link to the upper device
+ * @dev: device
+ * @upper_dev: new upper device
+ * @extack: netlink extended ack
+ *
+ * Adds a link to a device which is upper to this one. The caller must hold
+ * the RTNL lock. On a failure a negative errno code is returned.
+ * On success the reference counts are adjusted and the function
+ * returns zero.
+ */
+int netdev_upper_dev_link(struct net_device *dev,
+ struct net_device *upper_dev,
+ struct netlink_ext_ack *extack)
+{
+ return __netdev_upper_dev_link(dev, upper_dev, false,
+ NULL, NULL, extack);
+}
+EXPORT_SYMBOL(netdev_upper_dev_link);
+
+/**
+ * netdev_master_upper_dev_link - Add a master link to the upper device
+ * @dev: device
+ * @upper_dev: new upper device
+ * @upper_priv: upper device private
+ * @upper_info: upper info to be passed down via notifier
+ * @extack: netlink extended ack
+ *
+ * Adds a link to a device which is upper to this one. In this case, only
+ * one master upper device can be linked, although other non-master devices
+ * might be linked as well. The caller must hold the RTNL lock.
+ * On a failure a negative errno code is returned. On success the reference
+ * counts are adjusted and the function returns zero.
+ */
+int netdev_master_upper_dev_link(struct net_device *dev,
+ struct net_device *upper_dev,
+ void *upper_priv, void *upper_info,
+ struct netlink_ext_ack *extack)
+{
+ return __netdev_upper_dev_link(dev, upper_dev, true,
+ upper_priv, upper_info, extack);
+}
+EXPORT_SYMBOL(netdev_master_upper_dev_link);
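
As a hedged illustration of the intended caller, a bonding-/team-style master would enslave a lower device roughly as follows. The my_slave_info structure and the surrounding driver logic are hypothetical; the link call is the one defined above, and the matching teardown is netdev_upper_dev_unlink().

static int my_enslave(struct net_device *master, struct net_device *slave,
		      struct netlink_ext_ack *extack)
{
	struct my_slave_info *info;	/* hypothetical per-slave state */
	int err;

	ASSERT_RTNL();

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return -ENOMEM;

	/* slave gains master as its (only) master upper device */
	err = netdev_master_upper_dev_link(slave, master, info, NULL, extack);
	if (err)
		kfree(info);
	return err;
}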
+
+/**
+ * netdev_upper_dev_unlink - Removes a link to upper device
+ * @dev: device
+ * @upper_dev: upper device to unlink
+ *
+ * Removes a link to device which is upper to this one. The caller must hold
+ * the RTNL lock.
+ */
+void netdev_upper_dev_unlink(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ struct netdev_notifier_changeupper_info changeupper_info = {
+ .info = {
+ .dev = dev,
+ },
+ .upper_dev = upper_dev,
+ .linking = false,
+ };
+
+ ASSERT_RTNL();
+
+ changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
+
+ call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
+ &changeupper_info.info);
+
+ __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
+
+ call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
+ &changeupper_info.info);
+
+ __netdev_update_upper_level(dev, NULL);
+ netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
+
+ __netdev_update_lower_level(upper_dev, NULL);
+ netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, NULL);
+}
+EXPORT_SYMBOL(netdev_upper_dev_unlink);
+
+/**
+ * netdev_bonding_info_change - Dispatch event about slave change
+ * @dev: device
+ * @bonding_info: info to dispatch
+ *
+ * Send NETDEV_BONDING_INFO to netdev notifiers with info.
+ * The caller must hold the RTNL lock.
+ */
+void netdev_bonding_info_change(struct net_device *dev,
+ struct netdev_bonding_info *bonding_info)
+{
+ struct netdev_notifier_bonding_info info = {
+ .info.dev = dev,
+ };
+
+ memcpy(&info.bonding_info, bonding_info,
+ sizeof(struct netdev_bonding_info));
+ call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
+ &info.info);
+}
+EXPORT_SYMBOL(netdev_bonding_info_change);
+
+static void netdev_adjacent_add_links(struct net_device *dev)
+{
+ struct netdev_adjacent *iter;
+
+ struct net *net = dev_net(dev);
+
+ list_for_each_entry(iter, &dev->adj_list.upper, list) {
+ if (!net_eq(net, dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.lower);
+ netdev_adjacent_sysfs_add(dev, iter->dev,
+ &dev->adj_list.upper);
+ }
+
+ list_for_each_entry(iter, &dev->adj_list.lower, list) {
+ if (!net_eq(net, dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.upper);
+ netdev_adjacent_sysfs_add(dev, iter->dev,
+ &dev->adj_list.lower);
+ }
+}
+
+static void netdev_adjacent_del_links(struct net_device *dev)
+{
+ struct netdev_adjacent *iter;
+
+ struct net *net = dev_net(dev);
+
+ list_for_each_entry(iter, &dev->adj_list.upper, list) {
+ if (!net_eq(net, dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_del(iter->dev, dev->name,
+ &iter->dev->adj_list.lower);
+ netdev_adjacent_sysfs_del(dev, iter->dev->name,
+ &dev->adj_list.upper);
+ }
+
+ list_for_each_entry(iter, &dev->adj_list.lower, list) {
+ if (!net_eq(net, dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_del(iter->dev, dev->name,
+ &iter->dev->adj_list.upper);
+ netdev_adjacent_sysfs_del(dev, iter->dev->name,
+ &dev->adj_list.lower);
+ }
+}
+
+void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
+{
+ struct netdev_adjacent *iter;
+
+ struct net *net = dev_net(dev);
+
+ list_for_each_entry(iter, &dev->adj_list.upper, list) {
+ if (!net_eq(net, dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_del(iter->dev, oldname,
+ &iter->dev->adj_list.lower);
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.lower);
+ }
+
+ list_for_each_entry(iter, &dev->adj_list.lower, list) {
+ if (!net_eq(net, dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_del(iter->dev, oldname,
+ &iter->dev->adj_list.upper);
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.upper);
+ }
+}
+
+void *netdev_lower_dev_get_private(struct net_device *dev,
+ struct net_device *lower_dev)
+{
+ struct netdev_adjacent *lower;
+
+ if (!lower_dev)
+ return NULL;
+ lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
+ if (!lower)
+ return NULL;
+
+ return lower->private;
+}
+EXPORT_SYMBOL(netdev_lower_dev_get_private);
+
+
+int dev_get_nest_level(struct net_device *dev)
+{
+ struct net_device *lower = NULL;
+ struct list_head *iter;
+ int max_nest = -1;
+ int nest;
+
+ ASSERT_RTNL();
+
+ netdev_for_each_lower_dev(dev, lower, iter) {
+ nest = dev_get_nest_level(lower);
+ if (max_nest < nest)
+ max_nest = nest;
+ }
+
+ return max_nest + 1;
+}
+EXPORT_SYMBOL(dev_get_nest_level);
+
+/**
+ * netdev_lower_state_changed - Dispatch event about lower device state change
+ * @lower_dev: device
+ * @lower_state_info: state to dispatch
+ *
+ * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
+ * The caller must hold the RTNL lock.
+ */
+void netdev_lower_state_changed(struct net_device *lower_dev,
+ void *lower_state_info)
+{
+ struct netdev_notifier_changelowerstate_info changelowerstate_info = {
+ .info.dev = lower_dev,
+ };
+
+ ASSERT_RTNL();
+ changelowerstate_info.lower_state_info = lower_state_info;
+ call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
+ &changelowerstate_info.info);
+}
+EXPORT_SYMBOL(netdev_lower_state_changed);
+
+static void dev_change_rx_flags(struct net_device *dev, int flags)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_change_rx_flags)
+ ops->ndo_change_rx_flags(dev, flags);
+}
+
+static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
+{
+ unsigned int old_flags = dev->flags;
+ kuid_t uid;
+ kgid_t gid;
+
+ ASSERT_RTNL();
+
+ dev->flags |= IFF_PROMISC;
+ dev->promiscuity += inc;
+ if (dev->promiscuity == 0) {
+ /*
+ * Avoid overflow.
+ * If inc causes an overflow, leave promiscuity untouched and return an error.
+ */
+ if (inc < 0)
+ dev->flags &= ~IFF_PROMISC;
+ else {
+ dev->promiscuity -= inc;
+ pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
+ dev->name);
+ return -EOVERFLOW;
+ }
+ }
+ if (dev->flags != old_flags) {
+ pr_info("device %s %s promiscuous mode\n",
+ dev->name,
+ dev->flags & IFF_PROMISC ? "entered" : "left");
+ if (audit_enabled) {
+ current_uid_gid(&uid, &gid);
+ audit_log(audit_context(), GFP_ATOMIC,
+ AUDIT_ANOM_PROMISCUOUS,
+ "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
+ dev->name, (dev->flags & IFF_PROMISC),
+ (old_flags & IFF_PROMISC),
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
+ from_kuid(&init_user_ns, uid),
+ from_kgid(&init_user_ns, gid),
+ audit_get_sessionid(current));
+ }
+
+ dev_change_rx_flags(dev, IFF_PROMISC);
+ }
+ if (notify)
+ __dev_notify_flags(dev, old_flags, IFF_PROMISC);
+ return 0;
+}
+
+/**
+ * dev_set_promiscuity - update promiscuity count on a device
+ * @dev: device
+ * @inc: modifier
+ *
+ * Add or remove promiscuity from a device. While the count in the device
+ * remains above zero the interface remains promiscuous. Once it hits zero
+ * the device reverts back to normal filtering operation. A negative inc
+ * value is used to drop promiscuity on the device.
+ * Return 0 if successful or a negative errno code on error.
+ */
+int dev_set_promiscuity(struct net_device *dev, int inc)
+{
+ unsigned int old_flags = dev->flags;
+ int err;
+
+ err = __dev_set_promiscuity(dev, inc, true);
+ if (err < 0)
+ return err;
+ if (dev->flags != old_flags)
+ dev_set_rx_mode(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_set_promiscuity);
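
The count-based interface is meant to be used symmetrically: each user takes a reference while it needs promiscuous mode and drops it when done, and the device only leaves promiscuous mode once every reference is released. A minimal sketch for a capture-like feature (the my_capture_* names are hypothetical); the calls must run under RTNL, hence the explicit rtnl_lock():

static int my_capture_start(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);
	rtnl_unlock();
	return err;
}

static void my_capture_stop(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);
	rtnl_unlock();
}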
+
+static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
+{
+ unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
+
+ ASSERT_RTNL();
+
+ dev->flags |= IFF_ALLMULTI;
+ dev->allmulti += inc;
+ if (dev->allmulti == 0) {
+ /*
+ * Avoid overflow.
+ * If inc causes an overflow, leave allmulti untouched and return an error.
+ */
+ if (inc < 0)
+ dev->flags &= ~IFF_ALLMULTI;
+ else {
+ dev->allmulti -= inc;
+ pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
+ dev->name);
+ return -EOVERFLOW;
+ }
+ }
+ if (dev->flags ^ old_flags) {
+ dev_change_rx_flags(dev, IFF_ALLMULTI);
+ dev_set_rx_mode(dev);
+ if (notify)
+ __dev_notify_flags(dev, old_flags,
+ dev->gflags ^ old_gflags);
+ }
+ return 0;
+}
+
+/**
+ * dev_set_allmulti - update allmulti count on a device
+ * @dev: device
+ * @inc: modifier
+ *
+ * Add or remove reception of all multicast frames to a device. While the
+ * count in the device remains above zero the interface remains listening
+ * to all multicast frames. Once it hits zero the device reverts to normal
+ * filtering operation. A negative @inc value is used to drop the counter
+ * when releasing a resource needing all multicasts.
+ * Return 0 if successful or a negative errno code on error.
+ */
+
+int dev_set_allmulti(struct net_device *dev, int inc)
+{
+ return __dev_set_allmulti(dev, inc, true);
+}
+EXPORT_SYMBOL(dev_set_allmulti);
+
+/*
+ * Upload unicast and multicast address lists to device and
+ * configure RX filtering. When the device doesn't support unicast
+ * filtering it is put in promiscuous mode while unicast addresses
+ * are present.
+ */
+void __dev_set_rx_mode(struct net_device *dev)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ /* dev_open will call this function so the list will stay sane. */
+ if (!(dev->flags&IFF_UP))
+ return;
+
+ if (!netif_device_present(dev))
+ return;
+
+ if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
+ /* Unicast address changes may only happen under the rtnl,
+ * therefore calling __dev_set_promiscuity here is safe.
+ */
+ if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
+ __dev_set_promiscuity(dev, 1, false);
+ dev->uc_promisc = true;
+ } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
+ __dev_set_promiscuity(dev, -1, false);
+ dev->uc_promisc = false;
+ }
+ }
+
+ if (ops->ndo_set_rx_mode)
+ ops->ndo_set_rx_mode(dev);
+}
+
+void dev_set_rx_mode(struct net_device *dev)
+{
+ netif_addr_lock_bh(dev);
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+}
+
+/**
+ * dev_get_flags - get flags reported to userspace
+ * @dev: device
+ *
+ * Get the combination of flag bits exported through APIs to userspace.
+ */
+unsigned int dev_get_flags(const struct net_device *dev)
+{
+ unsigned int flags;
+
+ flags = (dev->flags & ~(IFF_PROMISC |
+ IFF_ALLMULTI |
+ IFF_RUNNING |
+ IFF_LOWER_UP |
+ IFF_DORMANT)) |
+ (dev->gflags & (IFF_PROMISC |
+ IFF_ALLMULTI));
+
+ if (netif_running(dev)) {
+ if (netif_oper_up(dev))
+ flags |= IFF_RUNNING;
+ if (netif_carrier_ok(dev))
+ flags |= IFF_LOWER_UP;
+ if (netif_dormant(dev))
+ flags |= IFF_DORMANT;
+ }
+
+ return flags;
+}
+EXPORT_SYMBOL(dev_get_flags);
+
+int __dev_change_flags(struct net_device *dev, unsigned int flags)
+{
+ unsigned int old_flags = dev->flags;
+ int ret;
+
+ ASSERT_RTNL();
+
+ /*
+ * Set the flags on our device.
+ */
+
+ dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
+ IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
+ IFF_AUTOMEDIA)) |
+ (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
+ IFF_ALLMULTI));
+
+ /*
+ * Load in the correct multicast list now that the flags have changed.
+ */
+
+ if ((old_flags ^ flags) & IFF_MULTICAST)
+ dev_change_rx_flags(dev, IFF_MULTICAST);
+
+ dev_set_rx_mode(dev);
+
+ /*
+ * Have we downed the interface? We handle IFF_UP ourselves
+ * according to user attempts to set it, rather than blindly
+ * setting it.
+ */
+
+ ret = 0;
+ if ((old_flags ^ flags) & IFF_UP) {
+ if (old_flags & IFF_UP)
+ __dev_close(dev);
+ else
+ ret = __dev_open(dev);
+ }
+
+ if ((flags ^ dev->gflags) & IFF_PROMISC) {
+ int inc = (flags & IFF_PROMISC) ? 1 : -1;
+ unsigned int old_flags = dev->flags;
+
+ dev->gflags ^= IFF_PROMISC;
+
+ if (__dev_set_promiscuity(dev, inc, false) >= 0)
+ if (dev->flags != old_flags)
+ dev_set_rx_mode(dev);
+ }
+
+ /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
+ * is important. Some (broken) drivers set IFF_PROMISC when
+ * IFF_ALLMULTI is requested, without asking us and without reporting it.
+ */
+ if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
+ int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
+
+ dev->gflags ^= IFF_ALLMULTI;
+ __dev_set_allmulti(dev, inc, false);
+ }
+
+ return ret;
+}
+
+void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
+ unsigned int gchanges)
+{
+ unsigned int changes = dev->flags ^ old_flags;
+
+ if (gchanges)
+ rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
+
+ if (changes & IFF_UP) {
+ if (dev->flags & IFF_UP)
+ call_netdevice_notifiers(NETDEV_UP, dev);
+ else
+ call_netdevice_notifiers(NETDEV_DOWN, dev);
+ }
+
+ if (dev->flags & IFF_UP &&
+ (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
+ struct netdev_notifier_change_info change_info = {
+ .info = {
+ .dev = dev,
+ },
+ .flags_changed = changes,
+ };
+
+ call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
+ }
+}
+
+/**
+ * dev_change_flags - change device settings
+ * @dev: device
+ * @flags: device state flags
+ *
+ * Change settings on a device based on the state flags. The flags are
+ * in the userspace exported format.
+ */
+int dev_change_flags(struct net_device *dev, unsigned int flags)
+{
+ int ret;
+ unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
+
+ ret = __dev_change_flags(dev, flags);
+ if (ret < 0)
+ return ret;
+
+ changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
+ __dev_notify_flags(dev, old_flags, changes);
+ return ret;
+}
+EXPORT_SYMBOL(dev_change_flags);
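
A sketch of an in-kernel caller flipping IFF_UP, roughly what `ip link set dev eth0 up` amounts to in this kernel version (two-argument dev_change_flags(), no extack yet). The lookup by name is only illustrative; dev_change_flags() takes care of __dev_open()/__dev_close() and of notifying rtnetlink.

static int my_bring_up(struct net *net, const char *ifname)
{
	struct net_device *dev;
	int err;

	rtnl_lock();
	dev = __dev_get_by_name(net, ifname);	/* RTNL protects the result */
	if (!dev)
		err = -ENODEV;
	else
		err = dev_change_flags(dev, dev->flags | IFF_UP);
	rtnl_unlock();
	return err;
}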
+
+int __dev_set_mtu(struct net_device *dev, int new_mtu)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_change_mtu)
+ return ops->ndo_change_mtu(dev, new_mtu);
+
+ /* Pairs with all the lockless reads of dev->mtu in the stack */
+ WRITE_ONCE(dev->mtu, new_mtu);
+ return 0;
+}
+EXPORT_SYMBOL(__dev_set_mtu);
+
+int dev_validate_mtu(struct net_device *dev, int new_mtu,
+ struct netlink_ext_ack *extack)
+{
+ /* MTU must be positive, and in range */
+ if (new_mtu < 0 || new_mtu < dev->min_mtu) {
+ NL_SET_ERR_MSG(extack, "mtu less than device minimum");
+ return -EINVAL;
+ }
+
+ if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
+ NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * dev_set_mtu_ext - Change maximum transfer unit
+ * @dev: device
+ * @new_mtu: new transfer unit
+ * @extack: netlink extended ack
+ *
+ * Change the maximum transfer size of the network device.
+ */
+int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
+ struct netlink_ext_ack *extack)
+{
+ int err, orig_mtu;
+
+ if (new_mtu == dev->mtu)
+ return 0;
+
+ err = dev_validate_mtu(dev, new_mtu, extack);
+ if (err)
+ return err;
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
+ err = notifier_to_errno(err);
+ if (err)
+ return err;
+
+ orig_mtu = dev->mtu;
+ err = __dev_set_mtu(dev, new_mtu);
+
+ if (!err) {
+ err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
+ orig_mtu);
+ err = notifier_to_errno(err);
+ if (err) {
+ /* setting mtu back and notifying everyone again,
+ * so that they have a chance to revert changes.
+ */
+ __dev_set_mtu(dev, orig_mtu);
+ call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
+ new_mtu);
+ }
+ }
+ return err;
+}
+
+int dev_set_mtu(struct net_device *dev, int new_mtu)
+{
+ struct netlink_ext_ack extack;
+ int err;
+
+ memset(&extack, 0, sizeof(extack));
+ err = dev_set_mtu_ext(dev, new_mtu, &extack);
+ if (err && extack._msg)
+ net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
+ return err;
+}
+EXPORT_SYMBOL(dev_set_mtu);
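
For illustration, a tunnel-like driver might grow the MTU of its lower device so that encapsulated frames still fit. The 50-byte overhead and helper name are hypothetical; dev_set_mtu() performs the range validation and NETDEV_CHANGEMTU notification shown above, and must be called under RTNL.

static int my_grow_lower_mtu(struct net_device *lower, int payload_mtu)
{
	int needed = payload_mtu + 50;	/* hypothetical encap overhead */
	int err = 0;

	rtnl_lock();
	if (lower->mtu < needed)
		err = dev_set_mtu(lower, needed);
	rtnl_unlock();

	return err;
}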
+
+/**
+ * dev_change_tx_queue_len - Change TX queue length of a netdevice
+ * @dev: device
+ * @new_len: new tx queue length
+ */
+int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
+{
+ unsigned int orig_len = dev->tx_queue_len;
+ int res;
+
+ if (new_len != (unsigned int)new_len)
+ return -ERANGE;
+
+ if (new_len != orig_len) {
+ dev->tx_queue_len = new_len;
+ res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
+ res = notifier_to_errno(res);
+ if (res)
+ goto err_rollback;
+ res = dev_qdisc_change_tx_queue_len(dev);
+ if (res)
+ goto err_rollback;
+ }
+
+ return 0;
+
+err_rollback:
+ netdev_err(dev, "refused to change device tx_queue_len\n");
+ dev->tx_queue_len = orig_len;
+ return res;
+}
+
+/**
+ * dev_set_group - Change group this device belongs to
+ * @dev: device
+ * @new_group: group this device should belong to
+ */
+void dev_set_group(struct net_device *dev, int new_group)
+{
+ dev->group = new_group;
+}
+EXPORT_SYMBOL(dev_set_group);
+
+/**
+ * dev_set_mac_address - Change Media Access Control Address
+ * @dev: device
+ * @sa: new address
+ *
+ * Change the hardware (MAC) address of the device
+ */
+int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int err;
+
+ if (!ops->ndo_set_mac_address)
+ return -EOPNOTSUPP;
+ if (sa->sa_family != dev->type)
+ return -EINVAL;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ err = ops->ndo_set_mac_address(dev, sa);
+ if (err)
+ return err;
+ dev->addr_assign_type = NET_ADDR_SET;
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ add_device_randomness(dev->dev_addr, dev->addr_len);
+ return 0;
+}
+EXPORT_SYMBOL(dev_set_mac_address);
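+
+/* Illustrative sketch, not part of the original file: building the sockaddr
+ * that dev_set_mac_address() expects. foo_set_mac() is an assumption for the
+ * example; sa_family must match dev->type (ARPHRD_ETHER for Ethernet) and
+ * the caller is expected to hold RTNL since NETDEV_CHANGEADDR is notified.
+ */
+#if 0	/* example only, not compiled */
+static int foo_set_mac(struct net_device *dev, const u8 *mac)
+{
+	struct sockaddr sa;
+
+	sa.sa_family = dev->type;
+	memcpy(sa.sa_data, mac, dev->addr_len);
+
+	ASSERT_RTNL();
+	return dev_set_mac_address(dev, &sa);
+}
+#endif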
+
+/**
+ * dev_change_carrier - Change device carrier
+ * @dev: device
+ * @new_carrier: new value
+ *
+ * Change device carrier
+ */
+int dev_change_carrier(struct net_device *dev, bool new_carrier)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_change_carrier)
+ return -EOPNOTSUPP;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return ops->ndo_change_carrier(dev, new_carrier);
+}
+EXPORT_SYMBOL(dev_change_carrier);
+
+/**
+ * dev_get_phys_port_id - Get device physical port ID
+ * @dev: device
+ * @ppid: port ID
+ *
+ * Get device physical port ID
+ */
+int dev_get_phys_port_id(struct net_device *dev,
+ struct netdev_phys_item_id *ppid)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_get_phys_port_id)
+ return -EOPNOTSUPP;
+ return ops->ndo_get_phys_port_id(dev, ppid);
+}
+EXPORT_SYMBOL(dev_get_phys_port_id);
+
+/**
+ * dev_get_phys_port_name - Get device physical port name
+ * @dev: device
+ * @name: port name
+ * @len: limit of bytes to copy to name
+ *
+ * Get device physical port name
+ */
+int dev_get_phys_port_name(struct net_device *dev,
+ char *name, size_t len)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_get_phys_port_name)
+ return -EOPNOTSUPP;
+ return ops->ndo_get_phys_port_name(dev, name, len);
+}
+EXPORT_SYMBOL(dev_get_phys_port_name);
+
+/**
+ * dev_change_proto_down - update protocol port state information
+ * @dev: device
+ * @proto_down: new value
+ *
+ * This info can be used by switch drivers to set the phys state of the
+ * port.
+ */
+int dev_change_proto_down(struct net_device *dev, bool proto_down)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_change_proto_down)
+ return -EOPNOTSUPP;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return ops->ndo_change_proto_down(dev, proto_down);
+}
+EXPORT_SYMBOL(dev_change_proto_down);
+
+u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
+ enum bpf_netdev_command cmd)
+{
+ struct netdev_bpf xdp;
+
+ if (!bpf_op)
+ return 0;
+
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = cmd;
+
+ /* Query must always succeed. */
+ WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
+
+ return xdp.prog_id;
+}
+
+static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
+ struct netlink_ext_ack *extack, u32 flags,
+ struct bpf_prog *prog)
+{
+ struct netdev_bpf xdp;
+
+ memset(&xdp, 0, sizeof(xdp));
+ if (flags & XDP_FLAGS_HW_MODE)
+ xdp.command = XDP_SETUP_PROG_HW;
+ else
+ xdp.command = XDP_SETUP_PROG;
+ xdp.extack = extack;
+ xdp.flags = flags;
+ xdp.prog = prog;
+
+ return bpf_op(dev, &xdp);
+}
+
+static void dev_xdp_uninstall(struct net_device *dev)
+{
+ struct netdev_bpf xdp;
+ bpf_op_t ndo_bpf;
+
+ /* Remove generic XDP */
+ WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
+
+ /* Remove from the driver */
+ ndo_bpf = dev->netdev_ops->ndo_bpf;
+ if (!ndo_bpf)
+ return;
+
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = XDP_QUERY_PROG;
+ WARN_ON(ndo_bpf(dev, &xdp));
+ if (xdp.prog_id)
+ WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
+ NULL));
+
+ /* Remove HW offload */
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = XDP_QUERY_PROG_HW;
+ if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
+ WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
+ NULL));
+}
+
+/**
+ * dev_change_xdp_fd - set or clear a bpf program for a device rx path
+ * @dev: device
+ * @extack: netlink extended ack
+ * @fd: new program fd or negative value to clear
+ * @flags: xdp-related flags
+ *
+ * Set or clear a bpf program for a device
+ */
+int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
+ int fd, u32 flags)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ enum bpf_netdev_command query;
+ struct bpf_prog *prog = NULL;
+ bpf_op_t bpf_op, bpf_chk;
+ int err;
+
+ ASSERT_RTNL();
+
+ query = flags & XDP_FLAGS_HW_MODE ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
+
+ bpf_op = bpf_chk = ops->ndo_bpf;
+ if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE)))
+ return -EOPNOTSUPP;
+ if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE))
+ bpf_op = generic_xdp_install;
+ if (bpf_op == bpf_chk)
+ bpf_chk = generic_xdp_install;
+
+ if (fd >= 0) {
+ if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) ||
+ __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW))
+ return -EEXIST;
+ if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
+ __dev_xdp_query(dev, bpf_op, query))
+ return -EBUSY;
+
+ prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
+ bpf_op == ops->ndo_bpf);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (!(flags & XDP_FLAGS_HW_MODE) &&
+ bpf_prog_is_dev_bound(prog->aux)) {
+ NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported");
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+ }
+
+ err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
+ if (err < 0 && prog)
+ bpf_prog_put(prog);
+
+ return err;
+}
+
+/**
+ * dev_new_index - allocate an ifindex
+ * @net: the applicable net namespace
+ *
+ * Returns a suitable unique value for a new device interface
+ * number. The caller must hold the rtnl semaphore or the
+ * dev_base_lock to be sure it remains unique.
+ */
+static int dev_new_index(struct net *net)
+{
+ int ifindex = net->ifindex;
+
+ for (;;) {
+ if (++ifindex <= 0)
+ ifindex = 1;
+ if (!__dev_get_by_index(net, ifindex))
+ return net->ifindex = ifindex;
+ }
+}
+
+/* Delayed registration/unregisteration */
+static LIST_HEAD(net_todo_list);
+DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
+
+static void net_set_todo(struct net_device *dev)
+{
+ list_add_tail(&dev->todo_list, &net_todo_list);
+ dev_net(dev)->dev_unreg_count++;
+}
+
+static void rollback_registered_many(struct list_head *head)
+{
+ struct net_device *dev, *tmp;
+ LIST_HEAD(close_head);
+
+ BUG_ON(dev_boot_phase);
+ ASSERT_RTNL();
+
+ list_for_each_entry_safe(dev, tmp, head, unreg_list) {
+		/* Some devices call this without ever having been
+		 * registered, as part of initialization unwind. Remove
+		 * those devices and proceed with the remaining ones.
+ */
+ if (dev->reg_state == NETREG_UNINITIALIZED) {
+ pr_debug("unregister_netdevice: device %s/%p never was registered\n",
+ dev->name, dev);
+
+ WARN_ON(1);
+ list_del(&dev->unreg_list);
+ continue;
+ }
+ dev->dismantle = true;
+ BUG_ON(dev->reg_state != NETREG_REGISTERED);
+ }
+
+ /* If device is running, close it first. */
+ list_for_each_entry(dev, head, unreg_list)
+ list_add_tail(&dev->close_list, &close_head);
+ dev_close_many(&close_head, true);
+
+ list_for_each_entry(dev, head, unreg_list) {
+ /* And unlink it from device chain. */
+ unlist_netdevice(dev);
+
+ dev->reg_state = NETREG_UNREGISTERING;
+ }
+ flush_all_backlogs();
+
+ synchronize_net();
+
+ list_for_each_entry(dev, head, unreg_list) {
+ struct sk_buff *skb = NULL;
+
+ /* Shutdown queueing discipline. */
+ dev_shutdown(dev);
+
+ dev_xdp_uninstall(dev);
+
+		/* Notify protocols that we are about to destroy
+		 * this device. They should clean up all of their state.
+ */
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+
+ if (!dev->rtnl_link_ops ||
+ dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
+ GFP_KERNEL, NULL, 0);
+
+ /*
+ * Flush the unicast and multicast chains
+ */
+ dev_uc_flush(dev);
+ dev_mc_flush(dev);
+
+ if (dev->netdev_ops->ndo_uninit)
+ dev->netdev_ops->ndo_uninit(dev);
+
+ if (skb)
+ rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
+
+ /* Notifier chain MUST detach us all upper devices. */
+ WARN_ON(netdev_has_any_upper_dev(dev));
+ WARN_ON(netdev_has_any_lower_dev(dev));
+
+ /* Remove entries from kobject tree */
+ netdev_unregister_kobject(dev);
+#ifdef CONFIG_XPS
+ /* Remove XPS queueing entries */
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
+ }
+
+ synchronize_net();
+
+ list_for_each_entry(dev, head, unreg_list)
+ dev_put(dev);
+}
+
+static void rollback_registered(struct net_device *dev)
+{
+ LIST_HEAD(single);
+
+ list_add(&dev->unreg_list, &single);
+ rollback_registered_many(&single);
+ list_del(&single);
+}
+
+static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
+ struct net_device *upper, netdev_features_t features)
+{
+ netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
+ netdev_features_t feature;
+ int feature_bit;
+
+ for_each_netdev_feature(upper_disables, feature_bit) {
+ feature = __NETIF_F_BIT(feature_bit);
+ if (!(upper->wanted_features & feature)
+ && (features & feature)) {
+ netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
+ &feature, upper->name);
+ features &= ~feature;
+ }
+ }
+
+ return features;
+}
+
+static void netdev_sync_lower_features(struct net_device *upper,
+ struct net_device *lower, netdev_features_t features)
+{
+ netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
+ netdev_features_t feature;
+ int feature_bit;
+
+ for_each_netdev_feature(upper_disables, feature_bit) {
+ feature = __NETIF_F_BIT(feature_bit);
+ if (!(features & feature) && (lower->features & feature)) {
+ netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
+ &feature, lower->name);
+ lower->wanted_features &= ~feature;
+ __netdev_update_features(lower);
+
+ if (unlikely(lower->features & feature))
+ netdev_WARN(upper, "failed to disable %pNF on %s!\n",
+ &feature, lower->name);
+ else
+ netdev_features_change(lower);
+ }
+ }
+}
+
+static netdev_features_t netdev_fix_features(struct net_device *dev,
+ netdev_features_t features)
+{
+ /* Fix illegal checksum combinations */
+ if ((features & NETIF_F_HW_CSUM) &&
+ (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+ netdev_warn(dev, "mixed HW and IP checksum settings.\n");
+ features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
+ }
+
+ /* TSO requires that SG is present as well. */
+ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
+ netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
+ features &= ~NETIF_F_ALL_TSO;
+ }
+
+ if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
+ !(features & NETIF_F_IP_CSUM)) {
+ netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
+ features &= ~NETIF_F_TSO;
+ features &= ~NETIF_F_TSO_ECN;
+ }
+
+ if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
+ !(features & NETIF_F_IPV6_CSUM)) {
+ netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
+ features &= ~NETIF_F_TSO6;
+ }
+
+ /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
+ if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
+ features &= ~NETIF_F_TSO_MANGLEID;
+
+ /* TSO ECN requires that TSO is present as well. */
+ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
+ features &= ~NETIF_F_TSO_ECN;
+
+ /* Software GSO depends on SG. */
+ if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
+ netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
+ features &= ~NETIF_F_GSO;
+ }
+
+ /* GSO partial features require GSO partial be set */
+ if ((features & dev->gso_partial_features) &&
+ !(features & NETIF_F_GSO_PARTIAL)) {
+ netdev_dbg(dev,
+ "Dropping partially supported GSO features since no GSO partial.\n");
+ features &= ~dev->gso_partial_features;
+ }
+
+ if (!(features & NETIF_F_RXCSUM)) {
+ /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
+ * successfully merged by hardware must also have the
+ * checksum verified by hardware. If the user does not
+ * want to enable RXCSUM, logically, we should disable GRO_HW.
+ */
+ if (features & NETIF_F_GRO_HW) {
+ netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
+ features &= ~NETIF_F_GRO_HW;
+ }
+ }
+
+ /* LRO/HW-GRO features cannot be combined with RX-FCS */
+ if (features & NETIF_F_RXFCS) {
+ if (features & NETIF_F_LRO) {
+ netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
+ features &= ~NETIF_F_LRO;
+ }
+
+ if (features & NETIF_F_GRO_HW) {
+ netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
+ features &= ~NETIF_F_GRO_HW;
+ }
+ }
+
+ if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
+ netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
+ features &= ~NETIF_F_HW_TLS_RX;
+ }
+
+ return features;
+}
+
+int __netdev_update_features(struct net_device *dev)
+{
+ struct net_device *upper, *lower;
+ netdev_features_t features;
+ struct list_head *iter;
+ int err = -1;
+
+ ASSERT_RTNL();
+
+ features = netdev_get_wanted_features(dev);
+
+ if (dev->netdev_ops->ndo_fix_features)
+ features = dev->netdev_ops->ndo_fix_features(dev, features);
+
+ /* driver might be less strict about feature dependencies */
+ features = netdev_fix_features(dev, features);
+
+	/* some features can't be enabled if they're off on an upper device */
+ netdev_for_each_upper_dev_rcu(dev, upper, iter)
+ features = netdev_sync_upper_features(dev, upper, features);
+
+ if (dev->features == features)
+ goto sync_lower;
+
+ netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
+ &dev->features, &features);
+
+ if (dev->netdev_ops->ndo_set_features)
+ err = dev->netdev_ops->ndo_set_features(dev, features);
+ else
+ err = 0;
+
+ if (unlikely(err < 0)) {
+ netdev_err(dev,
+ "set_features() failed (%d); wanted %pNF, left %pNF\n",
+ err, &features, &dev->features);
+ /* return non-0 since some features might have changed and
+ * it's better to fire a spurious notification than miss it
+ */
+ return -1;
+ }
+
+sync_lower:
+ /* some features must be disabled on lower devices when disabled
+ * on an upper device (think: bonding master or bridge)
+ */
+ netdev_for_each_lower_dev(dev, lower, iter)
+ netdev_sync_lower_features(dev, lower, features);
+
+ if (!err) {
+ netdev_features_t diff = features ^ dev->features;
+
+ if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
+ /* udp_tunnel_{get,drop}_rx_info both need
+ * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
+ * device, or they won't do anything.
+ * Thus we need to update dev->features
+ * *before* calling udp_tunnel_get_rx_info,
+ * but *after* calling udp_tunnel_drop_rx_info.
+ */
+ if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
+ dev->features = features;
+ udp_tunnel_get_rx_info(dev);
+ } else {
+ udp_tunnel_drop_rx_info(dev);
+ }
+ }
+
+ if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
+ if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
+ dev->features = features;
+ err |= vlan_get_rx_ctag_filter_info(dev);
+ } else {
+ vlan_drop_rx_ctag_filter_info(dev);
+ }
+ }
+
+ if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
+ if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
+ dev->features = features;
+ err |= vlan_get_rx_stag_filter_info(dev);
+ } else {
+ vlan_drop_rx_stag_filter_info(dev);
+ }
+ }
+
+ dev->features = features;
+ }
+
+ return err < 0 ? 0 : 1;
+}
+
+/**
+ * netdev_update_features - recalculate device features
+ * @dev: the device to check
+ *
+ * Recalculate dev->features set and send notifications if it
+ * has changed. Should be called after driver or hardware dependent
+ * conditions might have changed that influence the features.
+ */
+void netdev_update_features(struct net_device *dev)
+{
+ if (__netdev_update_features(dev))
+ netdev_features_change(dev);
+}
+EXPORT_SYMBOL(netdev_update_features);
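+
+/* Illustrative sketch, not part of the original file: the usual pairing of a
+ * driver's ndo_fix_features() hook with netdev_update_features(). struct
+ * foo_priv, the SPEED_10 constraint and the foo_* names are assumptions made
+ * up for the example; the core re-runs the hook and the fixups above every
+ * time netdev_update_features() is called under RTNL.
+ */
+#if 0	/* example only, not compiled */
+struct foo_priv {
+	u32 link_speed;
+};
+
+static netdev_features_t foo_fix_features(struct net_device *dev,
+					   netdev_features_t features)
+{
+	struct foo_priv *priv = netdev_priv(dev);
+
+	/* assumed hardware limitation: no TSO at 10 Mb/s */
+	if (priv->link_speed == SPEED_10)
+		features &= ~NETIF_F_ALL_TSO;
+	return features;
+}
+
+static void foo_link_speed_changed(struct net_device *dev)
+{
+	rtnl_lock();
+	netdev_update_features(dev);	/* re-evaluates foo_fix_features() */
+	rtnl_unlock();
+}
+#endif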
+
+/**
+ * netdev_change_features - recalculate device features
+ * @dev: the device to check
+ *
+ * Recalculate dev->features set and send notifications even
+ * if they have not changed. Should be called instead of
+ * netdev_update_features() if also dev->vlan_features might
+ * have changed to allow the changes to be propagated to stacked
+ * VLAN devices.
+ */
+void netdev_change_features(struct net_device *dev)
+{
+ __netdev_update_features(dev);
+ netdev_features_change(dev);
+}
+EXPORT_SYMBOL(netdev_change_features);
+
+/**
+ * netif_stacked_transfer_operstate - transfer operstate
+ * @rootdev: the root or lower level device to transfer state from
+ * @dev: the device to transfer operstate to
+ *
+ * Transfer operational state from root to device. This is normally
+ * called when a stacking relationship exists between the root
+ * device and the device (a leaf device).
+ */
+void netif_stacked_transfer_operstate(const struct net_device *rootdev,
+ struct net_device *dev)
+{
+ if (rootdev->operstate == IF_OPER_DORMANT)
+ netif_dormant_on(dev);
+ else
+ netif_dormant_off(dev);
+
+ if (netif_carrier_ok(rootdev))
+ netif_carrier_on(dev);
+ else
+ netif_carrier_off(dev);
+}
+EXPORT_SYMBOL(netif_stacked_transfer_operstate);
+
+static int netif_alloc_rx_queues(struct net_device *dev)
+{
+ unsigned int i, count = dev->num_rx_queues;
+ struct netdev_rx_queue *rx;
+ size_t sz = count * sizeof(*rx);
+ int err = 0;
+
+ BUG_ON(count < 1);
+
+ rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ if (!rx)
+ return -ENOMEM;
+
+ dev->_rx = rx;
+
+ for (i = 0; i < count; i++) {
+ rx[i].dev = dev;
+
+ /* XDP RX-queue setup */
+ err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
+ if (err < 0)
+ goto err_rxq_info;
+ }
+ return 0;
+
+err_rxq_info:
+	/* Roll back successful registrations and free other resources */
+ while (i--)
+ xdp_rxq_info_unreg(&rx[i].xdp_rxq);
+ kvfree(dev->_rx);
+ dev->_rx = NULL;
+ return err;
+}
+
+static void netif_free_rx_queues(struct net_device *dev)
+{
+ unsigned int i, count = dev->num_rx_queues;
+
+ /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
+ if (!dev->_rx)
+ return;
+
+ for (i = 0; i < count; i++)
+ xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
+
+ kvfree(dev->_rx);
+}
+
+static void netdev_init_one_queue(struct net_device *dev,
+ struct netdev_queue *queue, void *_unused)
+{
+ /* Initialize queue lock */
+ spin_lock_init(&queue->_xmit_lock);
+ netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
+ queue->xmit_lock_owner = -1;
+ netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
+ queue->dev = dev;
+#ifdef CONFIG_BQL
+ dql_init(&queue->dql, HZ);
+#endif
+}
+
+static void netif_free_tx_queues(struct net_device *dev)
+{
+ kvfree(dev->_tx);
+}
+
+static int netif_alloc_netdev_queues(struct net_device *dev)
+{
+ unsigned int count = dev->num_tx_queues;
+ struct netdev_queue *tx;
+ size_t sz = count * sizeof(*tx);
+
+ if (count < 1 || count > 0xffff)
+ return -EINVAL;
+
+ tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ if (!tx)
+ return -ENOMEM;
+
+ dev->_tx = tx;
+
+ netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
+ spin_lock_init(&dev->tx_global_lock);
+
+ return 0;
+}
+
+void netif_tx_stop_all_queues(struct net_device *dev)
+{
+ unsigned int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+
+ netif_tx_stop_queue(txq);
+ }
+}
+EXPORT_SYMBOL(netif_tx_stop_all_queues);
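+
+/* Illustrative sketch, not part of the original file: drivers typically pair
+ * netif_tx_stop_all_queues() with carrier changes. The foo_* helpers are
+ * assumptions made up for the example.
+ */
+#if 0	/* example only, not compiled */
+static void foo_handle_link_down(struct net_device *dev)
+{
+	netif_carrier_off(dev);
+	netif_tx_stop_all_queues(dev);
+}
+
+static void foo_handle_link_up(struct net_device *dev)
+{
+	netif_carrier_on(dev);
+	netif_tx_wake_all_queues(dev);
+}
+#endif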
+
+/**
+ * register_netdevice - register a network device
+ * @dev: device to register
+ *
+ * Take a completed network device structure and add it to the kernel
+ * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
+ * chain. 0 is returned on success. A negative errno code is returned
+ * on a failure to set up the device, or if the name is a duplicate.
+ *
+ * Callers must hold the rtnl semaphore. You may want
+ * register_netdev() instead of this.
+ *
+ * BUGS:
+ * The locking appears insufficient to guarantee two parallel registers
+ * will not get the same name.
+ */
+
+int register_netdevice(struct net_device *dev)
+{
+ int ret;
+ struct net *net = dev_net(dev);
+
+ BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
+ NETDEV_FEATURE_COUNT);
+ BUG_ON(dev_boot_phase);
+ ASSERT_RTNL();
+
+ might_sleep();
+
+	/* When net_device structures are persistent, this will be fatal. */
+ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
+ BUG_ON(!net);
+
+ spin_lock_init(&dev->addr_list_lock);
+ netdev_set_addr_lockdep_class(dev);
+
+ ret = dev_get_valid_name(net, dev, dev->name);
+ if (ret < 0)
+ goto out;
+
+ /* Init, if this function is available */
+ if (dev->netdev_ops->ndo_init) {
+ ret = dev->netdev_ops->ndo_init(dev);
+ if (ret) {
+ if (ret > 0)
+ ret = -EIO;
+ goto out;
+ }
+ }
+
+ if (((dev->hw_features | dev->features) &
+ NETIF_F_HW_VLAN_CTAG_FILTER) &&
+ (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
+ !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
+ netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
+ ret = -EINVAL;
+ goto err_uninit;
+ }
+
+ ret = -EBUSY;
+ if (!dev->ifindex)
+ dev->ifindex = dev_new_index(net);
+ else if (__dev_get_by_index(net, dev->ifindex))
+ goto err_uninit;
+
+ /* Transfer changeable features to wanted_features and enable
+ * software offloads (GSO and GRO).
+ */
+ dev->hw_features |= NETIF_F_SOFT_FEATURES;
+ dev->features |= NETIF_F_SOFT_FEATURES;
+
+ if (dev->netdev_ops->ndo_udp_tunnel_add) {
+ dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
+ dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
+ }
+
+ dev->wanted_features = dev->features & dev->hw_features;
+
+ if (!(dev->flags & IFF_LOOPBACK))
+ dev->hw_features |= NETIF_F_NOCACHE_COPY;
+
+ /* If IPv4 TCP segmentation offload is supported we should also
+ * allow the device to enable segmenting the frame with the option
+ * of ignoring a static IP ID value. This doesn't enable the
+ * feature itself but allows the user to enable it later.
+ */
+ if (dev->hw_features & NETIF_F_TSO)
+ dev->hw_features |= NETIF_F_TSO_MANGLEID;
+ if (dev->vlan_features & NETIF_F_TSO)
+ dev->vlan_features |= NETIF_F_TSO_MANGLEID;
+ if (dev->mpls_features & NETIF_F_TSO)
+ dev->mpls_features |= NETIF_F_TSO_MANGLEID;
+ if (dev->hw_enc_features & NETIF_F_TSO)
+ dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
+
+ /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
+ */
+ dev->vlan_features |= NETIF_F_HIGHDMA;
+
+ /* Make NETIF_F_SG inheritable to tunnel devices.
+ */
+ dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
+
+ /* Make NETIF_F_SG inheritable to MPLS.
+ */
+ dev->mpls_features |= NETIF_F_SG;
+
+ ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto err_uninit;
+
+ ret = netdev_register_kobject(dev);
+ if (ret) {
+ dev->reg_state = NETREG_UNREGISTERED;
+ goto err_uninit;
+ }
+ dev->reg_state = NETREG_REGISTERED;
+
+ __netdev_update_features(dev);
+
+ /*
+	 *	Default initial state at registration is that the
+ * device is present.
+ */
+
+ set_bit(__LINK_STATE_PRESENT, &dev->state);
+
+ linkwatch_init_dev(dev);
+
+ dev_init_scheduler(dev);
+ dev_hold(dev);
+ list_netdevice(dev);
+ add_device_randomness(dev->dev_addr, dev->addr_len);
+
+	/* If the device has a permanent hardware address, the driver
+	 * should set dev_addr, and addr_assign_type should be left at
+	 * NET_ADDR_PERM (the default value).
+ */
+ if (dev->addr_assign_type == NET_ADDR_PERM)
+ memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
+
+ /* Notify protocols, that a new device appeared. */
+ ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
+ ret = notifier_to_errno(ret);
+ if (ret) {
+ rollback_registered(dev);
+ rcu_barrier();
+
+ dev->reg_state = NETREG_UNREGISTERED;
+		/* We should put the kobject that is still held after
+		 * netdev_unregister_kobject(); otherwise the net device
+		 * cannot be freed when the driver calls free_netdev(),
+		 * because the kobject is still being held.
+ */
+ kobject_put(&dev->dev.kobj);
+ }
+ /*
+ * Prevent userspace races by waiting until the network
+ * device is fully setup before sending notifications.
+ */
+ if (!dev->rtnl_link_ops ||
+ dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
+
+out:
+ return ret;
+
+err_uninit:
+ if (dev->netdev_ops->ndo_uninit)
+ dev->netdev_ops->ndo_uninit(dev);
+ if (dev->priv_destructor)
+ dev->priv_destructor(dev);
+ goto out;
+}
+EXPORT_SYMBOL(register_netdevice);
+
+/**
+ * init_dummy_netdev - init a dummy network device for NAPI
+ * @dev: device to init
+ *
+ * This takes a network device structure and initializes the minimum
+ * set of fields so it can be used to schedule NAPI polls without
+ * registering a full-blown interface. This is to be used by drivers
+ * that need to tie several hardware interfaces to a single NAPI
+ * poll scheduler due to HW limitations.
+ */
+int init_dummy_netdev(struct net_device *dev)
+{
+	/* Clear everything. Note we don't initialize spinlocks
+	 * as they aren't supposed to be taken by any of the
+	 * NAPI code and this dummy netdev is supposed to be
+	 * used only for NAPI polls.
+ */
+ memset(dev, 0, sizeof(struct net_device));
+
+ /* make sure we BUG if trying to hit standard
+ * register/unregister code path
+ */
+ dev->reg_state = NETREG_DUMMY;
+
+ /* NAPI wants this */
+ INIT_LIST_HEAD(&dev->napi_list);
+
+ /* a dummy interface is started by default */
+ set_bit(__LINK_STATE_PRESENT, &dev->state);
+ set_bit(__LINK_STATE_START, &dev->state);
+
+ /* napi_busy_loop stats accounting wants this */
+ dev_net_set(dev, &init_net);
+
+	/* Note: We don't allocate pcpu_refcnt for dummy devices,
+	 * because users of this 'device' don't need to change
+	 * its refcount.
+ */
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(init_dummy_netdev);
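+
+/* Illustrative sketch, not part of the original file: a driver with no real
+ * net_device of its own can anchor its NAPI context on a dummy netdev. The
+ * foo_hw layout and the foo_poll() stub are assumptions made up for the
+ * example.
+ */
+#if 0	/* example only, not compiled */
+struct foo_hw {
+	struct net_device napi_dev;	/* dummy, never registered */
+	struct napi_struct napi;
+};
+
+static int foo_poll(struct napi_struct *napi, int budget)
+{
+	/* a real driver would process up to @budget packets here */
+	napi_complete_done(napi, 0);
+	return 0;
+}
+
+static void foo_init_napi(struct foo_hw *hw)
+{
+	init_dummy_netdev(&hw->napi_dev);
+	netif_napi_add(&hw->napi_dev, &hw->napi, foo_poll, NAPI_POLL_WEIGHT);
+	napi_enable(&hw->napi);
+}
+#endif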
+
+
+/**
+ * register_netdev - register a network device
+ * @dev: device to register
+ *
+ * Take a completed network device structure and add it to the kernel
+ * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
+ * chain. 0 is returned on success. A negative errno code is returned
+ * on a failure to set up the device, or if the name is a duplicate.
+ *
+ * This is a wrapper around register_netdevice that takes the rtnl semaphore
+ * and expands the device name if you passed a format string to
+ * alloc_netdev.
+ */
+int register_netdev(struct net_device *dev)
+{
+ int err;
+
+ if (rtnl_lock_killable())
+ return -EINTR;
+ err = register_netdevice(dev);
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL(register_netdev);
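+
+/* Illustrative sketch, not part of the original file: the common probe-time
+ * pattern around register_netdev(). alloc_etherdev() and eth_hw_addr_random()
+ * come from etherdevice.h; struct foo_priv and foo_netdev_ops are assumptions
+ * made up for the example.
+ */
+#if 0	/* example only, not compiled */
+static int foo_probe(void)
+{
+	struct net_device *dev;
+	int err;
+
+	dev = alloc_etherdev(sizeof(struct foo_priv));
+	if (!dev)
+		return -ENOMEM;
+
+	dev->netdev_ops = &foo_netdev_ops;	/* assumed driver ops */
+	eth_hw_addr_random(dev);
+
+	err = register_netdev(dev);		/* takes RTNL internally */
+	if (err)
+		free_netdev(dev);
+	return err;
+}
+#endif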
+
+int netdev_refcnt_read(const struct net_device *dev)
+{
+ int i, refcnt = 0;
+
+ for_each_possible_cpu(i)
+ refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
+ return refcnt;
+}
+EXPORT_SYMBOL(netdev_refcnt_read);
+
+/**
+ * netdev_wait_allrefs - wait until all references are gone.
+ * @dev: target net_device
+ *
+ * This is called when unregistering network devices.
+ *
+ * Any protocol or device that holds a reference should register
+ * for netdevice notification, and clean up and put back the
+ * reference if they receive an UNREGISTER event.
+ * We can get stuck here if buggy protocols don't correctly
+ * call dev_put.
+ */
+static void netdev_wait_allrefs(struct net_device *dev)
+{
+ unsigned long rebroadcast_time, warning_time;
+ int refcnt;
+
+ linkwatch_forget_dev(dev);
+
+ rebroadcast_time = warning_time = jiffies;
+ refcnt = netdev_refcnt_read(dev);
+
+ while (refcnt != 0) {
+ if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
+ rtnl_lock();
+
+ /* Rebroadcast unregister notification */
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+
+ __rtnl_unlock();
+ rcu_barrier();
+ rtnl_lock();
+
+ if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
+ &dev->state)) {
+ /* We must not have linkwatch events
+ * pending on unregister. If this
+ * happens, we simply run the queue
+ * unscheduled, resulting in a noop
+ * for this device.
+ */
+ linkwatch_run_queue();
+ }
+
+ __rtnl_unlock();
+
+ rebroadcast_time = jiffies;
+ }
+
+ msleep(250);
+
+ refcnt = netdev_refcnt_read(dev);
+
+ if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
+ pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
+ dev->name, refcnt);
+ warning_time = jiffies;
+ }
+ }
+}
+
+/* The sequence is:
+ *
+ * rtnl_lock();
+ * ...
+ * register_netdevice(x1);
+ * register_netdevice(x2);
+ * ...
+ * unregister_netdevice(y1);
+ * unregister_netdevice(y2);
+ * ...
+ * rtnl_unlock();
+ * free_netdev(y1);
+ * free_netdev(y2);
+ *
+ * We are invoked by rtnl_unlock().
+ * This allows us to deal with problems:
+ * 1) We can delete sysfs objects which invoke hotplug
+ * without deadlocking with linkwatch via keventd.
+ * 2) Since we run with the RTNL semaphore not held, we can sleep
+ * safely in order to wait for the netdev refcnt to drop to zero.
+ *
+ * We must not return until all unregister events added during
+ * the interval the lock was held have been completed.
+ */
+void netdev_run_todo(void)
+{
+ struct list_head list;
+
+ /* Snapshot list, allow later requests */
+ list_replace_init(&net_todo_list, &list);
+
+ __rtnl_unlock();
+
+
+ /* Wait for rcu callbacks to finish before next phase */
+ if (!list_empty(&list))
+ rcu_barrier();
+
+ while (!list_empty(&list)) {
+ struct net_device *dev
+ = list_first_entry(&list, struct net_device, todo_list);
+ list_del(&dev->todo_list);
+
+ if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
+ pr_err("network todo '%s' but state %d\n",
+ dev->name, dev->reg_state);
+ dump_stack();
+ continue;
+ }
+
+ dev->reg_state = NETREG_UNREGISTERED;
+
+ netdev_wait_allrefs(dev);
+
+ /* paranoia */
+ BUG_ON(netdev_refcnt_read(dev));
+ BUG_ON(!list_empty(&dev->ptype_all));
+ BUG_ON(!list_empty(&dev->ptype_specific));
+ WARN_ON(rcu_access_pointer(dev->ip_ptr));
+ WARN_ON(rcu_access_pointer(dev->ip6_ptr));
+#if IS_ENABLED(CONFIG_DECNET)
+ WARN_ON(dev->dn_ptr);
+#endif
+ if (dev->priv_destructor)
+ dev->priv_destructor(dev);
+ if (dev->needs_free_netdev)
+ free_netdev(dev);
+
+ /* Report a network device has been unregistered */
+ rtnl_lock();
+ dev_net(dev)->dev_unreg_count--;
+ __rtnl_unlock();
+ wake_up(&netdev_unregistering_wq);
+
+ /* Free network device */
+ kobject_put(&dev->dev.kobj);
+ }
+}
+
+/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
+ * all the same fields in the same order as net_device_stats, with only
+ * the type differing, but rtnl_link_stats64 may have additional fields
+ * at the end for newer counters.
+ */
+void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
+ const struct net_device_stats *netdev_stats)
+{
+#if BITS_PER_LONG == 64
+ BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
+ memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
+ /* zero out counters that only exist in rtnl_link_stats64 */
+ memset((char *)stats64 + sizeof(*netdev_stats), 0,
+ sizeof(*stats64) - sizeof(*netdev_stats));
+#else
+ size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
+ const unsigned long *src = (const unsigned long *)netdev_stats;
+ u64 *dst = (u64 *)stats64;
+
+ BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
+ for (i = 0; i < n; i++)
+ dst[i] = src[i];
+ /* zero out counters that only exist in rtnl_link_stats64 */
+ memset((char *)stats64 + n * sizeof(u64), 0,
+ sizeof(*stats64) - n * sizeof(u64));
+#endif
+}
+EXPORT_SYMBOL(netdev_stats_to_stats64);
+
+/**
+ * dev_get_stats - get network device statistics
+ * @dev: device to get statistics from
+ * @storage: place to store stats
+ *
+ * Get network statistics from device. Return @storage.
+ * The device driver may provide its own method by setting
+ *	dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
+ * otherwise the internal statistics structure is used.
+ */
+struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
+ struct rtnl_link_stats64 *storage)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_get_stats64) {
+ memset(storage, 0, sizeof(*storage));
+ ops->ndo_get_stats64(dev, storage);
+ } else if (ops->ndo_get_stats) {
+ netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
+ } else {
+ netdev_stats_to_stats64(storage, &dev->stats);
+ }
+ storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
+ storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
+ storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
+ return storage;
+}
+EXPORT_SYMBOL(dev_get_stats);
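+
+/* Illustrative sketch, not part of the original file: a driver-side
+ * ndo_get_stats64() implementation of the kind dev_get_stats() prefers.
+ * The foo_priv counters are assumptions made up for the example; note that
+ * dev_get_stats() zeroes @storage first and adds dev->rx_dropped and
+ * friends on top afterwards.
+ */
+#if 0	/* example only, not compiled */
+struct foo_priv {
+	u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+};
+
+static void foo_get_stats64(struct net_device *dev,
+			    struct rtnl_link_stats64 *stats)
+{
+	struct foo_priv *priv = netdev_priv(dev);
+
+	stats->rx_packets = priv->rx_packets;
+	stats->rx_bytes   = priv->rx_bytes;
+	stats->tx_packets = priv->tx_packets;
+	stats->tx_bytes   = priv->tx_bytes;
+}
+#endif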
+
+struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
+{
+ struct netdev_queue *queue = dev_ingress_queue(dev);
+
+#ifdef CONFIG_NET_CLS_ACT
+ if (queue)
+ return queue;
+ queue = kzalloc(sizeof(*queue), GFP_KERNEL);
+ if (!queue)
+ return NULL;
+ netdev_init_one_queue(dev, queue, NULL);
+ RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
+ queue->qdisc_sleeping = &noop_qdisc;
+ rcu_assign_pointer(dev->ingress_queue, queue);
+#endif
+ return queue;
+}
+
+static const struct ethtool_ops default_ethtool_ops;
+
+void netdev_set_default_ethtool_ops(struct net_device *dev,
+ const struct ethtool_ops *ops)
+{
+ if (dev->ethtool_ops == &default_ethtool_ops)
+ dev->ethtool_ops = ops;
+}
+EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
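+
+/* Illustrative sketch, not part of the original file: a mid-layer library can
+ * install fallback ethtool ops for minidrivers that did not provide their
+ * own; the pointer is only taken while the device still uses the empty
+ * defaults. foo_lib_ethtool_ops is an assumption made up for the example.
+ */
+#if 0	/* example only, not compiled */
+static const struct ethtool_ops foo_lib_ethtool_ops = {
+	.get_link	= ethtool_op_get_link,
+};
+
+static void foo_lib_setup(struct net_device *dev)
+{
+	netdev_set_default_ethtool_ops(dev, &foo_lib_ethtool_ops);
+}
+#endif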
+
+void netdev_freemem(struct net_device *dev)
+{
+ char *addr = (char *)dev - dev->padded;
+
+ kvfree(addr);
+}
+
+/**
+ * alloc_netdev_mqs - allocate network device
+ * @sizeof_priv: size of private data to allocate space for
+ * @name: device name format string
+ * @name_assign_type: origin of device name
+ * @setup: callback to initialize device
+ * @txqs: the number of TX subqueues to allocate
+ * @rxqs: the number of RX subqueues to allocate
+ *
+ * Allocates a struct net_device with private data area for driver use
+ * and performs basic initialization. Also allocates subqueue structs
+ * for each queue on the device.
+ */
+struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
+ unsigned char name_assign_type,
+ void (*setup)(struct net_device *),
+ unsigned int txqs, unsigned int rxqs)
+{
+ struct net_device *dev;
+ unsigned int alloc_size;
+ struct net_device *p;
+
+ BUG_ON(strlen(name) >= sizeof(dev->name));
+
+ if (txqs < 1) {
+ pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
+ return NULL;
+ }
+
+ if (rxqs < 1) {
+ pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
+ return NULL;
+ }
+
+ alloc_size = sizeof(struct net_device);
+ if (sizeof_priv) {
+ /* ensure 32-byte alignment of private area */
+ alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
+ alloc_size += sizeof_priv;
+ }
+ /* ensure 32-byte alignment of whole construct */
+ alloc_size += NETDEV_ALIGN - 1;
+
+ p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ if (!p)
+ return NULL;
+
+ dev = PTR_ALIGN(p, NETDEV_ALIGN);
+ dev->padded = (char *)dev - (char *)p;
+
+ dev->pcpu_refcnt = alloc_percpu(int);
+ if (!dev->pcpu_refcnt)
+ goto free_dev;
+
+ if (dev_addr_init(dev))
+ goto free_pcpu;
+
+ dev_mc_init(dev);
+ dev_uc_init(dev);
+
+ dev_net_set(dev, &init_net);
+
+ dev->gso_max_size = GSO_MAX_SIZE;
+ dev->gso_max_segs = GSO_MAX_SEGS;
+ dev->upper_level = 1;
+ dev->lower_level = 1;
+
+ INIT_LIST_HEAD(&dev->napi_list);
+ INIT_LIST_HEAD(&dev->unreg_list);
+ INIT_LIST_HEAD(&dev->close_list);
+ INIT_LIST_HEAD(&dev->link_watch_list);
+ INIT_LIST_HEAD(&dev->adj_list.upper);
+ INIT_LIST_HEAD(&dev->adj_list.lower);
+ INIT_LIST_HEAD(&dev->ptype_all);
+ INIT_LIST_HEAD(&dev->ptype_specific);
+#ifdef CONFIG_NET_SCHED
+ hash_init(dev->qdisc_hash);
+#endif
+ dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
+ setup(dev);
+
+ if (!dev->tx_queue_len) {
+ dev->priv_flags |= IFF_NO_QUEUE;
+ dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
+ }
+
+ dev->num_tx_queues = txqs;
+ dev->real_num_tx_queues = txqs;
+ if (netif_alloc_netdev_queues(dev))
+ goto free_all;
+
+ dev->num_rx_queues = rxqs;
+ dev->real_num_rx_queues = rxqs;
+ if (netif_alloc_rx_queues(dev))
+ goto free_all;
+
+ strcpy(dev->name, name);
+ dev->name_assign_type = name_assign_type;
+ dev->group = INIT_NETDEV_GROUP;
+ if (!dev->ethtool_ops)
+ dev->ethtool_ops = &default_ethtool_ops;
+
+ nf_hook_ingress_init(dev);
+
+ return dev;
+
+free_all:
+ free_netdev(dev);
+ return NULL;
+
+free_pcpu:
+ free_percpu(dev->pcpu_refcnt);
+free_dev:
+ netdev_freemem(dev);
+ return NULL;
+}
+EXPORT_SYMBOL(alloc_netdev_mqs);
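+
+/* Illustrative sketch, not part of the original file: allocating a
+ * multi-queue Ethernet-style device directly with alloc_netdev_mqs().
+ * ether_setup() is the stock Ethernet init from etherdevice.h; the "foo%d"
+ * name template and foo_priv are assumptions made up for the example.
+ */
+#if 0	/* example only, not compiled */
+static struct net_device *foo_alloc(unsigned int nqueues)
+{
+	return alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
+				NET_NAME_UNKNOWN, ether_setup,
+				nqueues, nqueues);
+}
+#endif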
+
+/**
+ * free_netdev - free network device
+ * @dev: device
+ *
+ * This function does the last stage of destroying an allocated device
+ * interface. The reference to the device object is released. If this
+ * is the last reference then it will be freed. Must be called in process
+ * context.
+ */
+void free_netdev(struct net_device *dev)
+{
+ struct napi_struct *p, *n;
+
+ might_sleep();
+ netif_free_tx_queues(dev);
+ netif_free_rx_queues(dev);
+
+ kfree(rcu_dereference_protected(dev->ingress_queue, 1));
+
+ /* Flush device addresses */
+ dev_addr_flush(dev);
+
+ list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
+ netif_napi_del(p);
+
+ free_percpu(dev->pcpu_refcnt);
+ dev->pcpu_refcnt = NULL;
+
+ /* Compatibility with error handling in drivers */
+ if (dev->reg_state == NETREG_UNINITIALIZED) {
+ netdev_freemem(dev);
+ return;
+ }
+
+ BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
+ dev->reg_state = NETREG_RELEASED;
+
+ /* will free via device release */
+ put_device(&dev->dev);
+}
+EXPORT_SYMBOL(free_netdev);
+
+/**
+ * synchronize_net - Synchronize with packet receive processing
+ *
+ * Wait for packets currently being received to be done.
+ * Does not block later packets from starting.
+ */
+void synchronize_net(void)
+{
+ might_sleep();
+ if (rtnl_is_locked())
+ synchronize_rcu_expedited();
+ else
+ synchronize_rcu();
+}
+EXPORT_SYMBOL(synchronize_net);
+
+/**
+ * unregister_netdevice_queue - remove device from the kernel
+ * @dev: device
+ * @head: list
+ *
+ * This function shuts down a device interface and removes it
+ * from the kernel tables.
+ * If @head is not NULL, the device is queued to be unregistered later.
+ *
+ * Callers must hold the rtnl semaphore. You may want
+ * unregister_netdev() instead of this.
+ */
+
+void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
+{
+ ASSERT_RTNL();
+
+ if (head) {
+ list_move_tail(&dev->unreg_list, head);
+ } else {
+ rollback_registered(dev);
+ /* Finish processing unregister after unlock */
+ net_set_todo(dev);
+ }
+}
+EXPORT_SYMBOL(unregister_netdevice_queue);
+
+/**
+ * unregister_netdevice_many - unregister many devices
+ * @head: list of devices
+ *
+ * Note: As most callers use a stack-allocated list_head,
+ * we force a list_del() to make sure the stack won't be corrupted later.
+ */
+void unregister_netdevice_many(struct list_head *head)
+{
+ struct net_device *dev;
+
+ if (!list_empty(head)) {
+ rollback_registered_many(head);
+ list_for_each_entry(dev, head, unreg_list)
+ net_set_todo(dev);
+ list_del(head);
+ }
+}
+EXPORT_SYMBOL(unregister_netdevice_many);
+
+/**
+ * unregister_netdev - remove device from the kernel
+ * @dev: device
+ *
+ * This function shuts down a device interface and removes it
+ * from the kernel tables.
+ *
+ * This is just a wrapper for unregister_netdevice that takes
+ * the rtnl semaphore. In general you want to use this and not
+ * unregister_netdevice.
+ */
+void unregister_netdev(struct net_device *dev)
+{
+ rtnl_lock();
+ unregister_netdevice(dev);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL(unregister_netdev);
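+
+/* Illustrative sketch, not part of the original file: the teardown that
+ * mirrors the register_netdev() example earlier. unregister_netdev() takes
+ * RTNL and, via the todo list run at rtnl_unlock(), waits for all references
+ * to drop before free_netdev() may run. foo_remove() is an assumption made
+ * up for the example.
+ */
+#if 0	/* example only, not compiled */
+static void foo_remove(struct net_device *dev)
+{
+	unregister_netdev(dev);
+	free_netdev(dev);
+}
+#endif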
+
+/**
+ * dev_change_net_namespace - move device to a different network namespace
+ * @dev: device
+ * @net: network namespace
+ * @pat: If not NULL name pattern to try if the current device name
+ * is already taken in the destination network namespace.
+ *
+ * This function shuts down a device interface and moves it
+ * to a new network namespace. On success 0 is returned, on
+ * a failure a negative errno code is returned.
+ *
+ * Callers must hold the rtnl semaphore.
+ */
+
+int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
+{
+ int err, new_nsid, new_ifindex;
+
+ ASSERT_RTNL();
+
+ /* Don't allow namespace local devices to be moved. */
+ err = -EINVAL;
+ if (dev->features & NETIF_F_NETNS_LOCAL)
+ goto out;
+
+	/* Ensure the device has been registered */
+ if (dev->reg_state != NETREG_REGISTERED)
+ goto out;
+
+	/* Get out if there is nothing to do */
+ err = 0;
+ if (net_eq(dev_net(dev), net))
+ goto out;
+
+ /* Pick the destination device name, and ensure
+ * we can use it in the destination network namespace.
+ */
+ err = -EEXIST;
+ if (__dev_get_by_name(net, dev->name)) {
+ /* We get here if we can't use the current device name */
+ if (!pat)
+ goto out;
+ err = dev_get_valid_name(net, dev, pat);
+ if (err < 0)
+ goto out;
+ }
+
+ /*
+	 * And now a mini version of register_netdevice and unregister_netdevice.
+ */
+
+ /* If device is running close it first. */
+ dev_close(dev);
+
+ /* And unlink it from device chain */
+ unlist_netdevice(dev);
+
+ synchronize_net();
+
+ /* Shutdown queueing discipline. */
+ dev_shutdown(dev);
+
+	/* Notify protocols that we are about to destroy this device.
+	 * They should clean up all of their state.
+	 *
+	 * Note that dev->reg_state stays at NETREG_REGISTERED.
+	 * This is intentional, so that 8021q and macvlan know
+	 * the device is just moving and can keep their slaves up.
+ */
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+ rcu_barrier();
+
+ new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
+ /* If there is an ifindex conflict assign a new one */
+ if (__dev_get_by_index(net, dev->ifindex))
+ new_ifindex = dev_new_index(net);
+ else
+ new_ifindex = dev->ifindex;
+
+ rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
+ new_ifindex);
+
+ /*
+ * Flush the unicast and multicast chains
+ */
+ dev_uc_flush(dev);
+ dev_mc_flush(dev);
+
+ /* Send a netdev-removed uevent to the old namespace */
+ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
+ netdev_adjacent_del_links(dev);
+
+ /* Actually switch the network namespace */
+ dev_net_set(dev, net);
+ dev->ifindex = new_ifindex;
+
+ /* Send a netdev-add uevent to the new namespace */
+ kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
+ netdev_adjacent_add_links(dev);
+
+ /* Fixup kobjects */
+ err = device_rename(&dev->dev, dev->name);
+ WARN_ON(err);
+
+ /* Add the device back in the hashes */
+ list_netdevice(dev);
+
+ /* Notify protocols, that a new device appeared. */
+ call_netdevice_notifiers(NETDEV_REGISTER, dev);
+
+ /*
+ * Prevent userspace races by waiting until the network
+ * device is fully setup before sending notifications.
+ */
+ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
+
+ synchronize_net();
+ err = 0;
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(dev_change_net_namespace);
+
+static int dev_cpu_dead(unsigned int oldcpu)
+{
+ struct sk_buff **list_skb;
+ struct sk_buff *skb;
+ unsigned int cpu;
+ struct softnet_data *sd, *oldsd, *remsd = NULL;
+
+ local_irq_disable();
+ cpu = smp_processor_id();
+ sd = &per_cpu(softnet_data, cpu);
+ oldsd = &per_cpu(softnet_data, oldcpu);
+
+ /* Find end of our completion_queue. */
+ list_skb = &sd->completion_queue;
+ while (*list_skb)
+ list_skb = &(*list_skb)->next;
+ /* Append completion queue from offline CPU. */
+ *list_skb = oldsd->completion_queue;
+ oldsd->completion_queue = NULL;
+
+ /* Append output queue from offline CPU. */
+ if (oldsd->output_queue) {
+ *sd->output_queue_tailp = oldsd->output_queue;
+ sd->output_queue_tailp = oldsd->output_queue_tailp;
+ oldsd->output_queue = NULL;
+ oldsd->output_queue_tailp = &oldsd->output_queue;
+ }
+	/* Append NAPI poll list from the offline CPU, with one exception:
+	 * process_backlog() must be called by the CPU owning the per-CPU backlog.
+ * We properly handle process_queue & input_pkt_queue later.
+ */
+ while (!list_empty(&oldsd->poll_list)) {
+ struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
+ struct napi_struct,
+ poll_list);
+
+ list_del_init(&napi->poll_list);
+ if (napi->poll == process_backlog)
+ napi->state = 0;
+ else
+ ____napi_schedule(sd, napi);
+ }
+
+ raise_softirq_irqoff(NET_TX_SOFTIRQ);
+ local_irq_enable();
+
+#ifdef CONFIG_RPS
+ remsd = oldsd->rps_ipi_list;
+ oldsd->rps_ipi_list = NULL;
+#endif
+ /* send out pending IPI's on offline CPU */
+ net_rps_send_ipi(remsd);
+
+ /* Process offline CPU's input_pkt_queue */
+ while ((skb = __skb_dequeue(&oldsd->process_queue))) {
+ netif_rx_ni(skb);
+ input_queue_head_incr(oldsd);
+ }
+ while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
+ netif_rx_ni(skb);
+ input_queue_head_incr(oldsd);
+ }
+
+ return 0;
+}
+
+/**
+ * netdev_increment_features - increment feature set by one
+ * @all: current feature set
+ * @one: new feature set
+ * @mask: mask feature set
+ *
+ * Computes a new feature set after adding a device with feature set
+ * @one to the master device with current feature set @all. Will not
+ * enable anything that is off in @mask. Returns the new feature set.
+ */
+netdev_features_t netdev_increment_features(netdev_features_t all,
+ netdev_features_t one, netdev_features_t mask)
+{
+ if (mask & NETIF_F_HW_CSUM)
+ mask |= NETIF_F_CSUM_MASK;
+ mask |= NETIF_F_VLAN_CHALLENGED;
+
+ all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
+ all &= one | ~NETIF_F_ALL_FOR_ALL;
+
+ /* If one device supports hw checksumming, set for all. */
+ if (all & NETIF_F_HW_CSUM)
+ all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
+
+ return all;
+}
+EXPORT_SYMBOL(netdev_increment_features);
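+
+/* Illustrative sketch, not part of the original file: a master device such as
+ * a bond or team could fold each lower device's feature set into its own with
+ * netdev_increment_features(). The starting set, the hw_features mask and the
+ * foo_* naming are assumptions made up for the example.
+ */
+#if 0	/* example only, not compiled */
+static netdev_features_t foo_compute_features(struct net_device *master)
+{
+	netdev_features_t features = NETIF_F_ALL_FOR_ALL;
+	struct net_device *lower;
+	struct list_head *iter;
+
+	netdev_for_each_lower_dev(master, lower, iter)
+		features = netdev_increment_features(features,
+						     lower->features,
+						     master->hw_features);
+	return features;
+}
+#endif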
+
+static struct hlist_head * __net_init netdev_create_hash(void)
+{
+ int i;
+ struct hlist_head *hash;
+
+ hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
+ if (hash != NULL)
+ for (i = 0; i < NETDEV_HASHENTRIES; i++)
+ INIT_HLIST_HEAD(&hash[i]);
+
+ return hash;
+}
+
+/* Initialize per network namespace state */
+static int __net_init netdev_init(struct net *net)
+{
+ BUILD_BUG_ON(GRO_HASH_BUCKETS >
+ 8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask));
+
+ if (net != &init_net)
+ INIT_LIST_HEAD(&net->dev_base_head);
+
+ net->dev_name_head = netdev_create_hash();
+ if (net->dev_name_head == NULL)
+ goto err_name;
+
+ net->dev_index_head = netdev_create_hash();
+ if (net->dev_index_head == NULL)
+ goto err_idx;
+
+ return 0;
+
+err_idx:
+ kfree(net->dev_name_head);
+err_name:
+ return -ENOMEM;
+}
+
+/**
+ * netdev_drivername - network driver for the device
+ * @dev: network device
+ *
+ * Determine network driver for device.
+ */
+const char *netdev_drivername(const struct net_device *dev)
+{
+ const struct device_driver *driver;
+ const struct device *parent;
+ const char *empty = "";
+
+ parent = dev->dev.parent;
+ if (!parent)
+ return empty;
+
+ driver = parent->driver;
+ if (driver && driver->name)
+ return driver->name;
+ return empty;
+}
+
+static void __netdev_printk(const char *level, const struct net_device *dev,
+ struct va_format *vaf)
+{
+ if (dev && dev->dev.parent) {
+ dev_printk_emit(level[1] - '0',
+ dev->dev.parent,
+ "%s %s %s%s: %pV",
+ dev_driver_string(dev->dev.parent),
+ dev_name(dev->dev.parent),
+ netdev_name(dev), netdev_reg_state(dev),
+ vaf);
+ } else if (dev) {
+ printk("%s%s%s: %pV",
+ level, netdev_name(dev), netdev_reg_state(dev), vaf);
+ } else {
+ printk("%s(NULL net_device): %pV", level, vaf);
+ }
+}
+
+void netdev_printk(const char *level, const struct net_device *dev,
+ const char *format, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, format);
+
+ vaf.fmt = format;
+ vaf.va = &args;
+
+ __netdev_printk(level, dev, &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL(netdev_printk);
+
+#define define_netdev_printk_level(func, level) \
+void func(const struct net_device *dev, const char *fmt, ...) \
+{ \
+ struct va_format vaf; \
+ va_list args; \
+ \
+ va_start(args, fmt); \
+ \
+ vaf.fmt = fmt; \
+ vaf.va = &args; \
+ \
+ __netdev_printk(level, dev, &vaf); \
+ \
+ va_end(args); \
+} \
+EXPORT_SYMBOL(func);
+
+define_netdev_printk_level(netdev_emerg, KERN_EMERG);
+define_netdev_printk_level(netdev_alert, KERN_ALERT);
+define_netdev_printk_level(netdev_crit, KERN_CRIT);
+define_netdev_printk_level(netdev_err, KERN_ERR);
+define_netdev_printk_level(netdev_warn, KERN_WARNING);
+define_netdev_printk_level(netdev_notice, KERN_NOTICE);
+define_netdev_printk_level(netdev_info, KERN_INFO);
+
+static void __net_exit netdev_exit(struct net *net)
+{
+ kfree(net->dev_name_head);
+ kfree(net->dev_index_head);
+ if (net != &init_net)
+ WARN_ON_ONCE(!list_empty(&net->dev_base_head));
+}
+
+static struct pernet_operations __net_initdata netdev_net_ops = {
+ .init = netdev_init,
+ .exit = netdev_exit,
+};
+
+static void __net_exit default_device_exit(struct net *net)
+{
+ struct net_device *dev, *aux;
+ /*
+ * Push all migratable network devices back to the
+ * initial network namespace
+ */
+ rtnl_lock();
+ for_each_netdev_safe(net, dev, aux) {
+ int err;
+ char fb_name[IFNAMSIZ];
+
+		/* Ignore unmovable devices (e.g. loopback) */
+ if (dev->features & NETIF_F_NETNS_LOCAL)
+ continue;
+
+ /* Leave virtual devices for the generic cleanup */
+ if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
+ continue;
+
+ /* Push remaining network devices to init_net */
+ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
+ if (__dev_get_by_name(&init_net, fb_name))
+ snprintf(fb_name, IFNAMSIZ, "dev%%d");
+ err = dev_change_net_namespace(dev, &init_net, fb_name);
+ if (err) {
+ pr_emerg("%s: failed to move %s to init_net: %d\n",
+ __func__, dev->name, err);
+ BUG();
+ }
+ }
+ rtnl_unlock();
+}
+
+static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
+{
+ /* Return with the rtnl_lock held when there are no network
+ * devices unregistering in any network namespace in net_list.
+ */
+ struct net *net;
+ bool unregistering;
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ add_wait_queue(&netdev_unregistering_wq, &wait);
+ for (;;) {
+ unregistering = false;
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list) {
+ if (net->dev_unreg_count > 0) {
+ unregistering = true;
+ break;
+ }
+ }
+ if (!unregistering)
+ break;
+ __rtnl_unlock();
+
+ wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+ }
+ remove_wait_queue(&netdev_unregistering_wq, &wait);
+}
+
+static void __net_exit default_device_exit_batch(struct list_head *net_list)
+{
+	/* At exit all network devices must be removed from a network
+ * namespace. Do this in the reverse order of registration.
+ * Do this across as many network namespaces as possible to
+ * improve batching efficiency.
+ */
+ struct net_device *dev;
+ struct net *net;
+ LIST_HEAD(dev_kill_list);
+
+	/* To prevent network device cleanup code from dereferencing
+	 * loopback devices or network devices that have been freed,
+	 * wait here for all pending unregistrations to complete
+	 * before unregistering the loopback device and allowing the
+	 * network namespace to be freed.
+	 *
+	 * The netdev todo list containing all network device
+	 * unregistrations that happen in default_device_exit_batch
+	 * will run in the rtnl_unlock() at the end of
+	 * default_device_exit_batch.
+ */
+ rtnl_lock_unregistering(net_list);
+ list_for_each_entry(net, net_list, exit_list) {
+ for_each_netdev_reverse(net, dev) {
+ if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
+ dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
+ else
+ unregister_netdevice_queue(dev, &dev_kill_list);
+ }
+ }
+ unregister_netdevice_many(&dev_kill_list);
+ rtnl_unlock();
+}
+
+static struct pernet_operations __net_initdata default_device_ops = {
+ .exit = default_device_exit,
+ .exit_batch = default_device_exit_batch,
+};
+
+/*
+ * Initialize the DEV module. At boot time this walks the device list and
+ * unhooks any devices that fail to initialise (normally hardware not
+ * present) and leaves us with a valid list of present and active devices.
+ *
+ */
+
+/*
+ * This is called single threaded during boot, so no need
+ * to take the rtnl semaphore.
+ */
+static int __init net_dev_init(void)
+{
+ int i, rc = -ENOMEM;
+
+ BUG_ON(!dev_boot_phase);
+
+ if (dev_proc_init())
+ goto out;
+
+ if (netdev_kobject_init())
+ goto out;
+
+ INIT_LIST_HEAD(&ptype_all);
+ for (i = 0; i < PTYPE_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&ptype_base[i]);
+
+ INIT_LIST_HEAD(&offload_base);
+
+ if (register_pernet_subsys(&netdev_net_ops))
+ goto out;
+
+ /*
+ * Initialise the packet receive queues.
+ */
+
+ for_each_possible_cpu(i) {
+ struct work_struct *flush = per_cpu_ptr(&flush_works, i);
+ struct softnet_data *sd = &per_cpu(softnet_data, i);
+
+ INIT_WORK(flush, flush_backlog);
+
+ skb_queue_head_init(&sd->input_pkt_queue);
+ skb_queue_head_init(&sd->process_queue);
+#ifdef CONFIG_XFRM_OFFLOAD
+ skb_queue_head_init(&sd->xfrm_backlog);
+#endif
+ INIT_LIST_HEAD(&sd->poll_list);
+ sd->output_queue_tailp = &sd->output_queue;
+#ifdef CONFIG_RPS
+ sd->csd.func = rps_trigger_softirq;
+ sd->csd.info = sd;
+ sd->cpu = i;
+#endif
+
+ init_gro_hash(&sd->backlog);
+ sd->backlog.poll = process_backlog;
+ sd->backlog.weight = weight_p;
+ }
+
+ dev_boot_phase = 0;
+
+	/* The loopback device is special: if any other network device
+	 * is present in a network namespace, the loopback device must
+	 * be present too. Since we now dynamically allocate and free
+	 * the loopback device, ensure this invariant is maintained by
+	 * keeping the loopback device as the first device on the
+	 * list of network devices, so that it is the first device
+	 * that appears and the last network device that disappears.
+ */
+ if (register_pernet_device(&loopback_net_ops))
+ goto out;
+
+ if (register_pernet_device(&default_device_ops))
+ goto out;
+
+ open_softirq(NET_TX_SOFTIRQ, net_tx_action);
+ open_softirq(NET_RX_SOFTIRQ, net_rx_action);
+
+ rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
+ NULL, dev_cpu_dead);
+ WARN_ON(rc < 0);
+ rc = 0;
+out:
+ return rc;
+}
+
+subsys_initcall(net_dev_init);
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
new file mode 100644
index 000000000..d884d8f5f
--- /dev/null
+++ b/net/core/dev_addr_lists.c
@@ -0,0 +1,851 @@
+/*
+ * net/core/dev_addr_lists.c - Functions for handling net device lists
+ * Copyright (c) 2010 Jiri Pirko <jpirko@redhat.com>
+ *
+ * This file contains functions for working with unicast, multicast and device
+ * address lists.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/export.h>
+#include <linux/list.h>
+
+/*
+ * General list handling functions
+ */
+
+static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type, bool global,
+ bool sync)
+{
+ struct netdev_hw_addr *ha;
+ int alloc_size;
+
+ alloc_size = sizeof(*ha);
+ if (alloc_size < L1_CACHE_BYTES)
+ alloc_size = L1_CACHE_BYTES;
+ ha = kmalloc(alloc_size, GFP_ATOMIC);
+ if (!ha)
+ return -ENOMEM;
+ memcpy(ha->addr, addr, addr_len);
+ ha->type = addr_type;
+ ha->refcount = 1;
+ ha->global_use = global;
+ ha->synced = sync ? 1 : 0;
+ ha->sync_cnt = 0;
+ list_add_tail_rcu(&ha->list, &list->list);
+ list->count++;
+
+ return 0;
+}
+
+static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type, bool global, bool sync,
+ int sync_count)
+{
+ struct netdev_hw_addr *ha;
+
+ if (addr_len > MAX_ADDR_LEN)
+ return -EINVAL;
+
+ list_for_each_entry(ha, &list->list, list) {
+ if (ha->type == addr_type &&
+ !memcmp(ha->addr, addr, addr_len)) {
+ if (global) {
+ /* check if addr is already used as global */
+ if (ha->global_use)
+ return 0;
+ else
+ ha->global_use = true;
+ }
+ if (sync) {
+ if (ha->synced && sync_count)
+ return -EEXIST;
+ else
+ ha->synced++;
+ }
+ ha->refcount++;
+ return 0;
+ }
+ }
+
+ return __hw_addr_create_ex(list, addr, addr_len, addr_type, global,
+ sync);
+}
+
+static int __hw_addr_add(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type)
+{
+ return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false,
+ 0);
+}
+
+static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
+ struct netdev_hw_addr *ha, bool global,
+ bool sync)
+{
+ if (global && !ha->global_use)
+ return -ENOENT;
+
+ if (sync && !ha->synced)
+ return -ENOENT;
+
+ if (global)
+ ha->global_use = false;
+
+ if (sync)
+ ha->synced--;
+
+ if (--ha->refcount)
+ return 0;
+ list_del_rcu(&ha->list);
+ kfree_rcu(ha, rcu_head);
+ list->count--;
+ return 0;
+}
+
+static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type, bool global, bool sync)
+{
+ struct netdev_hw_addr *ha;
+
+ list_for_each_entry(ha, &list->list, list) {
+ if (!memcmp(ha->addr, addr, addr_len) &&
+ (ha->type == addr_type || !addr_type))
+ return __hw_addr_del_entry(list, ha, global, sync);
+ }
+ return -ENOENT;
+}
+
+static int __hw_addr_del(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type)
+{
+ return __hw_addr_del_ex(list, addr, addr_len, addr_type, false, false);
+}
+
+static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr *ha,
+ int addr_len)
+{
+ int err;
+
+ err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type,
+ false, true, ha->sync_cnt);
+ if (err && err != -EEXIST)
+ return err;
+
+ if (!err) {
+ ha->sync_cnt++;
+ ha->refcount++;
+ }
+
+ return 0;
+}
+
+static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ struct netdev_hw_addr *ha,
+ int addr_len)
+{
+ int err;
+
+ err = __hw_addr_del_ex(to_list, ha->addr, addr_len, ha->type,
+ false, true);
+ if (err)
+ return;
+ ha->sync_cnt--;
+ /* address on from list is not marked synced */
+ __hw_addr_del_entry(from_list, ha, false, false);
+}
+
+static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ int addr_len)
+{
+ int err = 0;
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
+ if (ha->sync_cnt == ha->refcount) {
+ __hw_addr_unsync_one(to_list, from_list, ha, addr_len);
+ } else {
+ err = __hw_addr_sync_one(to_list, ha, addr_len);
+ if (err)
+ break;
+ }
+ }
+ return err;
+}
+
+/* This function only works where there is a strict 1-1 relationship
+ * between the source and destination of the sync. If you ever need to
+ * sync addresses to more than one destination, you need to use
+ * __hw_addr_sync_multiple().
+ */
+int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ int addr_len)
+{
+ int err = 0;
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
+ if (!ha->sync_cnt) {
+ err = __hw_addr_sync_one(to_list, ha, addr_len);
+ if (err)
+ break;
+ } else if (ha->refcount == 1)
+ __hw_addr_unsync_one(to_list, from_list, ha, addr_len);
+ }
+ return err;
+}
+EXPORT_SYMBOL(__hw_addr_sync);
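+
+/* A minimal sketch of the constraint above, using made-up lists purely
+ * for illustration (each list would have been set up with
+ * __hw_addr_init() and filled via __hw_addr_add()):
+ *
+ *	// 1:1 - one source synced into exactly one destination is fine
+ *	// with __hw_addr_sync(); sync_cnt never exceeds one per address
+ *	__hw_addr_sync(&dst, &src, ETH_ALEN);
+ *
+ *	// 1:N - the same source synced into several destinations must go
+ *	// through __hw_addr_sync_multiple() for every destination, so
+ *	// that sync_cnt can track how many destinations hold the address
+ *	__hw_addr_sync_multiple(&dst_a, &src, ETH_ALEN);
+ *	__hw_addr_sync_multiple(&dst_b, &src, ETH_ALEN);
+ */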
+
+void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ int addr_len)
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
+ if (ha->sync_cnt)
+ __hw_addr_unsync_one(to_list, from_list, ha, addr_len);
+ }
+}
+EXPORT_SYMBOL(__hw_addr_unsync);
+
+/**
+ * __hw_addr_sync_dev - Synchronize device's address list
+ * @list: address list to synchronize
+ * @dev: device to sync
+ * @sync: function to call if address should be added
+ * @unsync: function to call if address should be removed
+ *
+ * This function is intended to be called from the ndo_set_rx_mode
+ * function of devices that require explicit address add/remove
+ * notifications. The unsync function may be NULL in which case
+ * the addresses requiring removal will simply be removed without
+ * any notification to the device.
+ **/
+int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*sync)(struct net_device *, const unsigned char *),
+ int (*unsync)(struct net_device *,
+ const unsigned char *))
+{
+ struct netdev_hw_addr *ha, *tmp;
+ int err;
+
+ /* first go through and flush out any stale entries */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (!ha->sync_cnt || ha->refcount != 1)
+ continue;
+
+ /* if unsync is defined and fails defer unsyncing address */
+ if (unsync && unsync(dev, ha->addr))
+ continue;
+
+ ha->sync_cnt--;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+
+ /* go through and sync new entries to the list */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (ha->sync_cnt)
+ continue;
+
+ err = sync(dev, ha->addr);
+ if (err)
+ return err;
+
+ ha->sync_cnt++;
+ ha->refcount++;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(__hw_addr_sync_dev);
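+
+/* A minimal sketch of the usage described above, for a hypothetical
+ * driver whose foo_hw_mc_add()/foo_hw_mc_del() helpers program the
+ * hardware filter (none of the foo_* names exist in the tree):
+ *
+ *	static int foo_mc_sync(struct net_device *dev, const unsigned char *addr)
+ *	{
+ *		return foo_hw_mc_add(netdev_priv(dev), addr);
+ *	}
+ *
+ *	static int foo_mc_unsync(struct net_device *dev, const unsigned char *addr)
+ *	{
+ *		return foo_hw_mc_del(netdev_priv(dev), addr);
+ *	}
+ *
+ *	static void foo_set_rx_mode(struct net_device *dev)
+ *	{
+ *		__hw_addr_sync_dev(&dev->mc, dev, foo_mc_sync, foo_mc_unsync);
+ *	}
+ */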
+
+/**
+ * __hw_addr_unsync_dev - Remove synchronized addresses from device
+ * @list: address list to remove synchronized addresses from
+ * @dev: device to sync
+ * @unsync: function to call if address should be removed
+ *
+ * Remove all addresses that were added to the device by __hw_addr_sync_dev().
+ * This function is intended to be called from the ndo_stop or ndo_open
+ * functions on devices that require explicit address add/remove
+ * notifications. If the unsync function pointer is NULL then this function
+ * can be used to just reset the sync_cnt for the addresses in the list.
+ **/
+void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*unsync)(struct net_device *,
+ const unsigned char *))
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (!ha->sync_cnt)
+ continue;
+
+ /* if unsync is defined and fails defer unsyncing address */
+ if (unsync && unsync(dev, ha->addr))
+ continue;
+
+ ha->sync_cnt--;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+}
+EXPORT_SYMBOL(__hw_addr_unsync_dev);
+
+static void __hw_addr_flush(struct netdev_hw_addr_list *list)
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ list_del_rcu(&ha->list);
+ kfree_rcu(ha, rcu_head);
+ }
+ list->count = 0;
+}
+
+void __hw_addr_init(struct netdev_hw_addr_list *list)
+{
+ INIT_LIST_HEAD(&list->list);
+ list->count = 0;
+}
+EXPORT_SYMBOL(__hw_addr_init);
+
+/*
+ * Device addresses handling functions
+ */
+
+/**
+ * dev_addr_flush - Flush device address list
+ * @dev: device
+ *
+ * Flush device address list and reset ->dev_addr.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+void dev_addr_flush(struct net_device *dev)
+{
+ /* rtnl_mutex must be held here */
+
+ __hw_addr_flush(&dev->dev_addrs);
+ dev->dev_addr = NULL;
+}
+EXPORT_SYMBOL(dev_addr_flush);
+
+/**
+ * dev_addr_init - Init device address list
+ * @dev: device
+ *
+ * Init device address list and create the first element,
+ * used by ->dev_addr.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_init(struct net_device *dev)
+{
+ unsigned char addr[MAX_ADDR_LEN];
+ struct netdev_hw_addr *ha;
+ int err;
+
+ /* rtnl_mutex must be held here */
+
+ __hw_addr_init(&dev->dev_addrs);
+ memset(addr, 0, sizeof(addr));
+ err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
+ NETDEV_HW_ADDR_T_LAN);
+ if (!err) {
+ /*
+ * Get the first (previously created) address from the list
+ * and set dev_addr pointer to this location.
+ */
+ ha = list_first_entry(&dev->dev_addrs.list,
+ struct netdev_hw_addr, list);
+ dev->dev_addr = ha->addr;
+ }
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_init);
+
+/**
+ * dev_addr_add - Add a device address
+ * @dev: device
+ * @addr: address to add
+ * @addr_type: address type
+ *
+ * Add a device address to the device or increase the reference count if
+ * it already exists.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, const unsigned char *addr,
+ unsigned char addr_type)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ * dev_addr_del - Release a device address.
+ * @dev: device
+ * @addr: address to delete
+ * @addr_type: address type
+ *
+ * Release reference to a device address and remove it from the device
+ * if the reference count drops to zero.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, const unsigned char *addr,
+ unsigned char addr_type)
+{
+ int err;
+ struct netdev_hw_addr *ha;
+
+ ASSERT_RTNL();
+
+ /*
+ * We can not remove the first address from the list because
+ * dev->dev_addr points to that.
+ */
+ ha = list_first_entry(&dev->dev_addrs.list,
+ struct netdev_hw_addr, list);
+ if (!memcmp(ha->addr, addr, dev->addr_len) &&
+ ha->type == addr_type && ha->refcount == 1)
+ return -ENOENT;
+
+ err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
+ addr_type);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
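+
+/* A minimal sketch of the refcounting described above, with a made-up
+ * locally administered address purely for illustration:
+ *
+ *	static const unsigned char foo_addr[ETH_ALEN] = {
+ *		0x02, 0x00, 0x00, 0x00, 0x00, 0x01
+ *	};
+ *
+ *	rtnl_lock();
+ *	dev_addr_add(dev, foo_addr, NETDEV_HW_ADDR_T_LAN);	// refcount 1
+ *	dev_addr_add(dev, foo_addr, NETDEV_HW_ADDR_T_LAN);	// refcount 2
+ *	dev_addr_del(dev, foo_addr, NETDEV_HW_ADDR_T_LAN);	// still listed
+ *	dev_addr_del(dev, foo_addr, NETDEV_HW_ADDR_T_LAN);	// removed now
+ *	rtnl_unlock();
+ */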
+
+/*
+ * Unicast list handling functions
+ */
+
+/**
+ * dev_uc_add_excl - Add a global secondary unicast address
+ * @dev: device
+ * @addr: address to add
+ */
+int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr)
+{
+ struct netdev_hw_addr *ha;
+ int err;
+
+ netif_addr_lock_bh(dev);
+ list_for_each_entry(ha, &dev->uc.list, list) {
+ if (!memcmp(ha->addr, addr, dev->addr_len) &&
+ ha->type == NETDEV_HW_ADDR_T_UNICAST) {
+ err = -EEXIST;
+ goto out;
+ }
+ }
+ err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_UNICAST, true, false);
+ if (!err)
+ __dev_set_rx_mode(dev);
+out:
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_add_excl);
+
+/**
+ * dev_uc_add - Add a secondary unicast address
+ * @dev: device
+ * @addr: address to add
+ *
+ * Add a secondary unicast address to the device or increase
+ * the reference count if it already exists.
+ */
+int dev_uc_add(struct net_device *dev, const unsigned char *addr)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_UNICAST);
+ if (!err)
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_add);
+
+/**
+ * dev_uc_del - Release secondary unicast address.
+ * @dev: device
+ * @addr: address to delete
+ *
+ * Release reference to a secondary unicast address and remove it
+ * from the device if the reference count drops to zero.
+ */
+int dev_uc_del(struct net_device *dev, const unsigned char *addr)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_UNICAST);
+ if (!err)
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_del);
+
+/**
+ * dev_uc_sync - Synchronize device's unicast list to another device
+ * @to: destination device
+ * @from: source device
+ *
+ * Add newly added addresses to the destination device and release
+ * addresses that have no users left. The source device must be
+ * locked by netif_addr_lock_bh.
+ *
+ * This function is intended to be called from the dev->set_rx_mode
+ * function of layered software devices. This function assumes that
+ * addresses will only ever be synced to the @to device and no other.
+ */
+int dev_uc_sync(struct net_device *to, struct net_device *from)
+{
+ int err = 0;
+
+ if (to->addr_len != from->addr_len)
+ return -EINVAL;
+
+ netif_addr_lock_nested(to);
+ err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
+ if (!err)
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_sync);
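+
+/* A minimal sketch of the layered-device pattern described above, for a
+ * hypothetical upper device stacked on a single lower device (the foo_*
+ * names are illustrative only):
+ *
+ *	struct foo_upper_priv {
+ *		struct net_device *lower;
+ *	};
+ *
+ *	static void foo_upper_set_rx_mode(struct net_device *upper)
+ *	{
+ *		struct foo_upper_priv *p = netdev_priv(upper);
+ *
+ *		// ndo_set_rx_mode runs with the upper device's address
+ *		// lock held, which is what dev_uc_sync()/dev_mc_sync()
+ *		// expect of their source device
+ *		dev_uc_sync(p->lower, upper);
+ *		dev_mc_sync(p->lower, upper);
+ *	}
+ *
+ *	static int foo_upper_stop(struct net_device *upper)
+ *	{
+ *		struct foo_upper_priv *p = netdev_priv(upper);
+ *
+ *		dev_uc_unsync(p->lower, upper);
+ *		dev_mc_unsync(p->lower, upper);
+ *		return 0;
+ *	}
+ */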
+
+/**
+ * dev_uc_sync_multiple - Synchronize device's unicast list to another
+ * device, but allow for multiple calls to sync to multiple devices.
+ * @to: destination device
+ * @from: source device
+ *
+ * Add newly added addresses to the destination device and release
+ * addresses that have been deleted from the source. The source device
+ * must be locked by netif_addr_lock_bh.
+ *
+ * This function is intended to be called from the dev->set_rx_mode
+ * function of layered software devices. It allows for a single source
+ * device to be synced to multiple destination devices.
+ */
+int dev_uc_sync_multiple(struct net_device *to, struct net_device *from)
+{
+ int err = 0;
+
+ if (to->addr_len != from->addr_len)
+ return -EINVAL;
+
+ netif_addr_lock_nested(to);
+ err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len);
+ if (!err)
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_sync_multiple);
+
+/**
+ * dev_uc_unsync - Remove synchronized addresses from the destination device
+ * @to: destination device
+ * @from: source device
+ *
+ * Remove all addresses that were added to the destination device by
+ * dev_uc_sync(). This function is intended to be called from the
+ * dev->stop function of layered software devices.
+ */
+void dev_uc_unsync(struct net_device *to, struct net_device *from)
+{
+ if (to->addr_len != from->addr_len)
+ return;
+
+ netif_addr_lock_bh(from);
+ netif_addr_lock_nested(to);
+ __hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ netif_addr_unlock_bh(from);
+}
+EXPORT_SYMBOL(dev_uc_unsync);
+
+/**
+ * dev_uc_flush - Flush unicast addresses
+ * @dev: device
+ *
+ * Flush unicast addresses.
+ */
+void dev_uc_flush(struct net_device *dev)
+{
+ netif_addr_lock_bh(dev);
+ __hw_addr_flush(&dev->uc);
+ netif_addr_unlock_bh(dev);
+}
+EXPORT_SYMBOL(dev_uc_flush);
+
+/**
+ * dev_uc_init - Init unicast address list
+ * @dev: device
+ *
+ * Init unicast address list.
+ */
+void dev_uc_init(struct net_device *dev)
+{
+ __hw_addr_init(&dev->uc);
+}
+EXPORT_SYMBOL(dev_uc_init);
+
+/*
+ * Multicast list handling functions
+ */
+
+/**
+ * dev_mc_add_excl - Add a global secondary multicast address
+ * @dev: device
+ * @addr: address to add
+ */
+int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr)
+{
+ struct netdev_hw_addr *ha;
+ int err;
+
+ netif_addr_lock_bh(dev);
+ list_for_each_entry(ha, &dev->mc.list, list) {
+ if (!memcmp(ha->addr, addr, dev->addr_len) &&
+ ha->type == NETDEV_HW_ADDR_T_MULTICAST) {
+ err = -EEXIST;
+ goto out;
+ }
+ }
+ err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_MULTICAST, true, false);
+ if (!err)
+ __dev_set_rx_mode(dev);
+out:
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_mc_add_excl);
+
+static int __dev_mc_add(struct net_device *dev, const unsigned char *addr,
+ bool global)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_MULTICAST, global, false, 0);
+ if (!err)
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+/**
+ * dev_mc_add - Add a multicast address
+ * @dev: device
+ * @addr: address to add
+ *
+ * Add a multicast address to the device or increase
+ * the reference count if it already exists.
+ */
+int dev_mc_add(struct net_device *dev, const unsigned char *addr)
+{
+ return __dev_mc_add(dev, addr, false);
+}
+EXPORT_SYMBOL(dev_mc_add);
+
+/**
+ * dev_mc_add_global - Add a global multicast address
+ * @dev: device
+ * @addr: address to add
+ *
+ * Add a global multicast address to the device.
+ */
+int dev_mc_add_global(struct net_device *dev, const unsigned char *addr)
+{
+ return __dev_mc_add(dev, addr, true);
+}
+EXPORT_SYMBOL(dev_mc_add_global);
+
+static int __dev_mc_del(struct net_device *dev, const unsigned char *addr,
+ bool global)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_MULTICAST, global, false);
+ if (!err)
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+
+/**
+ * dev_mc_del - Delete a multicast address.
+ * @dev: device
+ * @addr: address to delete
+ *
+ * Release reference to a multicast address and remove it
+ * from the device if the reference count drops to zero.
+ */
+int dev_mc_del(struct net_device *dev, const unsigned char *addr)
+{
+ return __dev_mc_del(dev, addr, false);
+}
+EXPORT_SYMBOL(dev_mc_del);
+
+/**
+ * dev_mc_del_global - Delete a global multicast address.
+ * @dev: device
+ * @addr: address to delete
+ *
+ * Release reference to a multicast address and remove it
+ * from the device if the reference count drops to zero.
+ */
+int dev_mc_del_global(struct net_device *dev, const unsigned char *addr)
+{
+ return __dev_mc_del(dev, addr, true);
+}
+EXPORT_SYMBOL(dev_mc_del_global);
+
+/**
+ * dev_mc_sync - Synchronize device's multicast list to another device
+ * @to: destination device
+ * @from: source device
+ *
+ * Add newly added addresses to the destination device and release
+ * addresses that have no users left. The source device must be
+ * locked by netif_addr_lock_bh.
+ *
+ * This function is intended to be called from the ndo_set_rx_mode
+ * function of layered software devices.
+ */
+int dev_mc_sync(struct net_device *to, struct net_device *from)
+{
+ int err = 0;
+
+ if (to->addr_len != from->addr_len)
+ return -EINVAL;
+
+ netif_addr_lock_nested(to);
+ err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len);
+ if (!err)
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ return err;
+}
+EXPORT_SYMBOL(dev_mc_sync);
+
+/**
+ * dev_mc_sync_multiple - Synchronize device's multicast list to another
+ * device, but allow for multiple calls to sync to multiple devices.
+ * @to: destination device
+ * @from: source device
+ *
+ * Add newly added addresses to the destination device and release
+ * addresses that have no users left. The source device must be
+ * locked by netif_addr_lock_bh.
+ *
+ * This function is intended to be called from the ndo_set_rx_mode
+ * function of layered software devices. It allows for a single
+ * source device to be synced to multiple destination devices.
+ */
+int dev_mc_sync_multiple(struct net_device *to, struct net_device *from)
+{
+ int err = 0;
+
+ if (to->addr_len != from->addr_len)
+ return -EINVAL;
+
+ netif_addr_lock_nested(to);
+ err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len);
+ if (!err)
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ return err;
+}
+EXPORT_SYMBOL(dev_mc_sync_multiple);
+
+/**
+ * dev_mc_unsync - Remove synchronized addresses from the destination device
+ * @to: destination device
+ * @from: source device
+ *
+ * Remove all addresses that were added to the destination device by
+ * dev_mc_sync(). This function is intended to be called from the
+ * dev->stop function of layered software devices.
+ */
+void dev_mc_unsync(struct net_device *to, struct net_device *from)
+{
+ if (to->addr_len != from->addr_len)
+ return;
+
+ netif_addr_lock_bh(from);
+ netif_addr_lock_nested(to);
+ __hw_addr_unsync(&to->mc, &from->mc, to->addr_len);
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ netif_addr_unlock_bh(from);
+}
+EXPORT_SYMBOL(dev_mc_unsync);
+
+/**
+ * dev_mc_flush - Flush multicast addresses
+ * @dev: device
+ *
+ * Flush multicast addresses.
+ */
+void dev_mc_flush(struct net_device *dev)
+{
+ netif_addr_lock_bh(dev);
+ __hw_addr_flush(&dev->mc);
+ netif_addr_unlock_bh(dev);
+}
+EXPORT_SYMBOL(dev_mc_flush);
+
+/**
+ * dev_mc_init - Init multicast address list
+ * @dev: device
+ *
+ * Init multicast address list.
+ */
+void dev_mc_init(struct net_device *dev)
+{
+ __hw_addr_init(&dev->mc);
+}
+EXPORT_SYMBOL(dev_mc_init);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
new file mode 100644
index 000000000..90e8aa368
--- /dev/null
+++ b/net/core/dev_ioctl.c
@@ -0,0 +1,519 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kmod.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/net_tstamp.h>
+#include <linux/wireless.h>
+#include <net/wext.h>
+
+/*
+ * Map an interface index to its name (SIOCGIFNAME)
+ */
+
+/*
+ * We need this ioctl for efficient implementation of the
+ * if_indextoname() function required by the IPv6 API. Without
+ * it, we would have to search all the interfaces to find a
+ * match. --pb
+ */
+
+static int dev_ifname(struct net *net, struct ifreq *ifr)
+{
+ ifr->ifr_name[IFNAMSIZ-1] = 0;
+ return netdev_get_name(net, ifr->ifr_name, ifr->ifr_ifindex);
+}
+
+static gifconf_func_t *gifconf_list[NPROTO];
+
+/**
+ * register_gifconf - register a SIOCGIFCONF handler
+ * @family: Address family
+ * @gifconf: Function handler
+ *
+ * Register protocol dependent address dumping routines. The handler
+ * that is passed must not be freed or reused until it has been replaced
+ * by another handler.
+ */
+int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
+{
+ if (family >= NPROTO)
+ return -EINVAL;
+ gifconf_list[family] = gifconf;
+ return 0;
+}
+EXPORT_SYMBOL(register_gifconf);
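+
+/* A minimal sketch of a per-family handler as described above.  The
+ * foo_* names are made up, and PF_FOO stands in for whichever address
+ * family the protocol implements (in-tree, the IPv4 code registers its
+ * handler for PF_INET):
+ *
+ *	static int foo_gifconf(struct net_device *dev, char __user *buf,
+ *			       int len, int size)
+ *	{
+ *		if (!buf)
+ *			return size;	// probe: report space for one entry
+ *		// otherwise copy up to len bytes of struct ifreq entries,
+ *		// each 'size' bytes long, into buf and return the number
+ *		// of bytes written
+ *		return 0;
+ *	}
+ *
+ *	static int __init foo_family_init(void)
+ *	{
+ *		return register_gifconf(PF_FOO, foo_gifconf);
+ *	}
+ */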
+
+/*
+ * Perform a SIOCGIFCONF call. This structure will change
+ * size eventually, and there is nothing I can do about it.
+ * Thus we will need a 'compatibility mode'.
+ */
+
+int dev_ifconf(struct net *net, struct ifconf *ifc, int size)
+{
+ struct net_device *dev;
+ char __user *pos;
+ int len;
+ int total;
+ int i;
+
+ /*
+ * Fetch the caller's info block.
+ */
+
+ pos = ifc->ifc_buf;
+ len = ifc->ifc_len;
+
+ /*
+ * Loop over the interfaces, and write an info block for each.
+ */
+
+ total = 0;
+ for_each_netdev(net, dev) {
+ for (i = 0; i < NPROTO; i++) {
+ if (gifconf_list[i]) {
+ int done;
+ if (!pos)
+ done = gifconf_list[i](dev, NULL, 0, size);
+ else
+ done = gifconf_list[i](dev, pos + total,
+ len - total, size);
+ if (done < 0)
+ return -EFAULT;
+ total += done;
+ }
+ }
+ }
+
+ /*
+ * All done. Write the updated control block back to the caller.
+ */
+ ifc->ifc_len = total;
+
+ /*
+ * Both BSD and Solaris return 0 here, so we do too.
+ */
+ return 0;
+}
+
+/*
+ * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
+ */
+static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
+{
+ int err;
+ struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
+
+ if (!dev)
+ return -ENODEV;
+
+ switch (cmd) {
+ case SIOCGIFFLAGS: /* Get interface flags */
+ ifr->ifr_flags = (short) dev_get_flags(dev);
+ return 0;
+
+ case SIOCGIFMETRIC: /* Get the metric on the interface
+ (currently unused) */
+ ifr->ifr_metric = 0;
+ return 0;
+
+ case SIOCGIFMTU: /* Get the MTU of a device */
+ ifr->ifr_mtu = dev->mtu;
+ return 0;
+
+ case SIOCGIFHWADDR:
+ if (!dev->addr_len)
+ memset(ifr->ifr_hwaddr.sa_data, 0,
+ sizeof(ifr->ifr_hwaddr.sa_data));
+ else
+ memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
+ min(sizeof(ifr->ifr_hwaddr.sa_data),
+ (size_t)dev->addr_len));
+ ifr->ifr_hwaddr.sa_family = dev->type;
+ return 0;
+
+ case SIOCGIFSLAVE:
+ err = -EINVAL;
+ break;
+
+ case SIOCGIFMAP:
+ ifr->ifr_map.mem_start = dev->mem_start;
+ ifr->ifr_map.mem_end = dev->mem_end;
+ ifr->ifr_map.base_addr = dev->base_addr;
+ ifr->ifr_map.irq = dev->irq;
+ ifr->ifr_map.dma = dev->dma;
+ ifr->ifr_map.port = dev->if_port;
+ return 0;
+
+ case SIOCGIFINDEX:
+ ifr->ifr_ifindex = dev->ifindex;
+ return 0;
+
+ case SIOCGIFTXQLEN:
+ ifr->ifr_qlen = dev->tx_queue_len;
+ return 0;
+
+ default:
+ /* dev_ioctl() should ensure this case
+ * is never reached
+ */
+ WARN_ON(1);
+ err = -ENOTTY;
+ break;
+
+ }
+ return err;
+}
+
+static int net_hwtstamp_validate(struct ifreq *ifr)
+{
+ struct hwtstamp_config cfg;
+ enum hwtstamp_tx_types tx_type;
+ enum hwtstamp_rx_filters rx_filter;
+ int tx_type_valid = 0;
+ int rx_filter_valid = 0;
+
+ if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
+ return -EFAULT;
+
+ if (cfg.flags) /* reserved for future extensions */
+ return -EINVAL;
+
+ tx_type = cfg.tx_type;
+ rx_filter = cfg.rx_filter;
+
+ switch (tx_type) {
+ case HWTSTAMP_TX_OFF:
+ case HWTSTAMP_TX_ON:
+ case HWTSTAMP_TX_ONESTEP_SYNC:
+ tx_type_valid = 1;
+ break;
+ }
+
+ switch (rx_filter) {
+ case HWTSTAMP_FILTER_NONE:
+ case HWTSTAMP_FILTER_ALL:
+ case HWTSTAMP_FILTER_SOME:
+ case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+ case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+ case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+ case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+ case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+ case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+ case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+ case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+ case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+ case HWTSTAMP_FILTER_PTP_V2_EVENT:
+ case HWTSTAMP_FILTER_PTP_V2_SYNC:
+ case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+ case HWTSTAMP_FILTER_NTP_ALL:
+ rx_filter_valid = 1;
+ break;
+ }
+
+ if (!tx_type_valid || !rx_filter_valid)
+ return -ERANGE;
+
+ return 0;
+}
+
+/*
+ * Perform the SIOCxIFxxx calls, inside rtnl_lock()
+ */
+static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
+{
+ int err;
+ struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
+ const struct net_device_ops *ops;
+
+ if (!dev)
+ return -ENODEV;
+
+ ops = dev->netdev_ops;
+
+ switch (cmd) {
+ case SIOCSIFFLAGS: /* Set interface flags */
+ return dev_change_flags(dev, ifr->ifr_flags);
+
+ case SIOCSIFMETRIC: /* Set the metric on the interface
+ (currently unused) */
+ return -EOPNOTSUPP;
+
+ case SIOCSIFMTU: /* Set the MTU of a device */
+ return dev_set_mtu(dev, ifr->ifr_mtu);
+
+ case SIOCSIFHWADDR:
+ if (dev->addr_len > sizeof(struct sockaddr))
+ return -EINVAL;
+ return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
+
+ case SIOCSIFHWBROADCAST:
+ if (ifr->ifr_hwaddr.sa_family != dev->type)
+ return -EINVAL;
+ memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
+ min(sizeof(ifr->ifr_hwaddr.sa_data),
+ (size_t)dev->addr_len));
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return 0;
+
+ case SIOCSIFMAP:
+ if (ops->ndo_set_config) {
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return ops->ndo_set_config(dev, &ifr->ifr_map);
+ }
+ return -EOPNOTSUPP;
+
+ case SIOCADDMULTI:
+ if (!ops->ndo_set_rx_mode ||
+ ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
+ return -EINVAL;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
+
+ case SIOCDELMULTI:
+ if (!ops->ndo_set_rx_mode ||
+ ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
+ return -EINVAL;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
+
+ case SIOCSIFTXQLEN:
+ if (ifr->ifr_qlen < 0)
+ return -EINVAL;
+ return dev_change_tx_queue_len(dev, ifr->ifr_qlen);
+
+ case SIOCSIFNAME:
+ ifr->ifr_newname[IFNAMSIZ-1] = '\0';
+ return dev_change_name(dev, ifr->ifr_newname);
+
+ case SIOCSHWTSTAMP:
+ err = net_hwtstamp_validate(ifr);
+ if (err)
+ return err;
+ /* fall through */
+
+ /*
+ * Unknown or private ioctl
+ */
+ default:
+ if ((cmd >= SIOCDEVPRIVATE &&
+ cmd <= SIOCDEVPRIVATE + 15) ||
+ cmd == SIOCBONDENSLAVE ||
+ cmd == SIOCBONDRELEASE ||
+ cmd == SIOCBONDSETHWADDR ||
+ cmd == SIOCBONDSLAVEINFOQUERY ||
+ cmd == SIOCBONDINFOQUERY ||
+ cmd == SIOCBONDCHANGEACTIVE ||
+ cmd == SIOCGMIIPHY ||
+ cmd == SIOCGMIIREG ||
+ cmd == SIOCSMIIREG ||
+ cmd == SIOCBRADDIF ||
+ cmd == SIOCBRDELIF ||
+ cmd == SIOCSHWTSTAMP ||
+ cmd == SIOCGHWTSTAMP ||
+ cmd == SIOCWANDEV) {
+ err = -EOPNOTSUPP;
+ if (ops->ndo_do_ioctl) {
+ if (netif_device_present(dev))
+ err = ops->ndo_do_ioctl(dev, ifr, cmd);
+ else
+ err = -ENODEV;
+ }
+ } else
+ err = -EINVAL;
+
+ }
+ return err;
+}
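+
+/* A minimal sketch of the private-ioctl hand-off above: after
+ * net_hwtstamp_validate() accepts a SIOCSHWTSTAMP request, dev_ifsioc()
+ * falls through to the driver's ndo_do_ioctl.  A hypothetical driver
+ * side (foo_* is illustrative only) would look like:
+ *
+ *	static int foo_do_ioctl(struct net_device *dev, struct ifreq *ifr,
+ *				int cmd)
+ *	{
+ *		struct hwtstamp_config cfg;
+ *
+ *		if (cmd != SIOCSHWTSTAMP)
+ *			return -EOPNOTSUPP;
+ *		if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
+ *			return -EFAULT;
+ *		// program the timestamping unit from cfg here, adjusting
+ *		// cfg to whatever configuration was actually applied
+ *		return copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)) ?
+ *		       -EFAULT : 0;
+ *	}
+ */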
+
+/**
+ * dev_load - load a network module
+ * @net: the applicable net namespace
+ * @name: name of interface
+ *
+ * If a network interface is not present and the process has suitable
+ * privileges this function loads the module. If module loading is not
+ * available in this kernel then it becomes a nop.
+ */
+
+void dev_load(struct net *net, const char *name)
+{
+ struct net_device *dev;
+ int no_module;
+
+ rcu_read_lock();
+ dev = dev_get_by_name_rcu(net, name);
+ rcu_read_unlock();
+
+ no_module = !dev;
+ if (no_module && capable(CAP_NET_ADMIN))
+ no_module = request_module("netdev-%s", name);
+ if (no_module && capable(CAP_SYS_MODULE))
+ request_module("%s", name);
+}
+EXPORT_SYMBOL(dev_load);
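+
+/* The "netdev-%s" request above is matched by modules that declare the
+ * corresponding alias.  A minimal sketch for a hypothetical driver that
+ * wants to be auto-loaded when its well-known interface name is first
+ * referenced:
+ *
+ *	MODULE_ALIAS_NETDEV("foo0");	// expands to MODULE_ALIAS("netdev-foo0")
+ */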
+
+/*
+ * This function handles all "interface"-type I/O control requests. The actual
+ * 'doing' part of this is dev_ifsioc above.
+ */
+
+/**
+ * dev_ioctl - network device ioctl
+ * @net: the applicable net namespace
+ * @cmd: command to issue
+ * @arg: pointer to a struct ifreq in user space
+ *
+ * Issue ioctl functions to devices. This is normally called by the
+ * user space syscall interfaces but can sometimes be useful for
+ * other purposes. The return value is the return from the syscall if
+ * positive or a negative errno code on error.
+ */
+
+int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_copyout)
+{
+ int ret;
+ char *colon;
+
+ if (need_copyout)
+ *need_copyout = true;
+ if (cmd == SIOCGIFNAME)
+ return dev_ifname(net, ifr);
+
+ ifr->ifr_name[IFNAMSIZ-1] = 0;
+
+ colon = strchr(ifr->ifr_name, ':');
+ if (colon)
+ *colon = 0;
+
+ /*
+ * See which interface the caller is talking about.
+ */
+
+ switch (cmd) {
+ /*
+ * These ioctl calls:
+ * - can be done by all.
+ * - atomic and do not require locking.
+ * - return a value
+ */
+ case SIOCGIFFLAGS:
+ case SIOCGIFMETRIC:
+ case SIOCGIFMTU:
+ case SIOCGIFHWADDR:
+ case SIOCGIFSLAVE:
+ case SIOCGIFMAP:
+ case SIOCGIFINDEX:
+ case SIOCGIFTXQLEN:
+ dev_load(net, ifr->ifr_name);
+ rcu_read_lock();
+ ret = dev_ifsioc_locked(net, ifr, cmd);
+ rcu_read_unlock();
+ if (colon)
+ *colon = ':';
+ return ret;
+
+ case SIOCETHTOOL:
+ dev_load(net, ifr->ifr_name);
+ rtnl_lock();
+ ret = dev_ethtool(net, ifr);
+ rtnl_unlock();
+ if (colon)
+ *colon = ':';
+ return ret;
+
+ /*
+ * These ioctl calls:
+ * - require superuser power.
+ * - require strict serialization.
+ * - return a value
+ */
+ case SIOCGMIIPHY:
+ case SIOCGMIIREG:
+ case SIOCSIFNAME:
+ dev_load(net, ifr->ifr_name);
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ rtnl_lock();
+ ret = dev_ifsioc(net, ifr, cmd);
+ rtnl_unlock();
+ if (colon)
+ *colon = ':';
+ return ret;
+
+ /*
+ * These ioctl calls:
+ * - require superuser power.
+ * - require strict serialization.
+ * - do not return a value
+ */
+ case SIOCSIFMAP:
+ case SIOCSIFTXQLEN:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ /* fall through */
+ /*
+ * These ioctl calls:
+ * - require local superuser power.
+ * - require strict serialization.
+ * - do not return a value
+ */
+ case SIOCSIFFLAGS:
+ case SIOCSIFMETRIC:
+ case SIOCSIFMTU:
+ case SIOCSIFHWADDR:
+ case SIOCSIFSLAVE:
+ case SIOCADDMULTI:
+ case SIOCDELMULTI:
+ case SIOCSIFHWBROADCAST:
+ case SIOCSMIIREG:
+ case SIOCBONDENSLAVE:
+ case SIOCBONDRELEASE:
+ case SIOCBONDSETHWADDR:
+ case SIOCBONDCHANGEACTIVE:
+ case SIOCBRADDIF:
+ case SIOCBRDELIF:
+ case SIOCSHWTSTAMP:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ /* fall through */
+ case SIOCBONDSLAVEINFOQUERY:
+ case SIOCBONDINFOQUERY:
+ dev_load(net, ifr->ifr_name);
+ rtnl_lock();
+ ret = dev_ifsioc(net, ifr, cmd);
+ rtnl_unlock();
+ if (need_copyout)
+ *need_copyout = false;
+ return ret;
+
+ case SIOCGIFMEM:
+ /* Get the per device memory space. We can add this but
+ * currently do not support it */
+ case SIOCSIFMEM:
+ /* Set the per device memory buffer space.
+ * Not applicable in our case */
+ case SIOCSIFLINK:
+ return -ENOTTY;
+
+ /*
+ * Unknown or private ioctl.
+ */
+ default:
+ if (cmd == SIOCWANDEV ||
+ cmd == SIOCGHWTSTAMP ||
+ (cmd >= SIOCDEVPRIVATE &&
+ cmd <= SIOCDEVPRIVATE + 15)) {
+ dev_load(net, ifr->ifr_name);
+ rtnl_lock();
+ ret = dev_ifsioc(net, ifr, cmd);
+ rtnl_unlock();
+ return ret;
+ }
+ return -ENOTTY;
+ }
+}
diff --git a/net/core/devlink.c b/net/core/devlink.c
new file mode 100644
index 000000000..6ad095264
--- /dev/null
+++ b/net/core/devlink.c
@@ -0,0 +1,4816 @@
+/*
+ * net/core/devlink.c - Network physical/parent device Netlink interface
+ *
+ * Heavily inspired by net/wireless/
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/device.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <rdma/ib_verbs.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/devlink.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/devlink.h>
+
+static struct devlink_dpipe_field devlink_dpipe_fields_ethernet[] = {
+ {
+ .name = "destination mac",
+ .id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC,
+ .bitwidth = 48,
+ },
+};
+
+struct devlink_dpipe_header devlink_dpipe_header_ethernet = {
+ .name = "ethernet",
+ .id = DEVLINK_DPIPE_HEADER_ETHERNET,
+ .fields = devlink_dpipe_fields_ethernet,
+ .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ethernet),
+ .global = true,
+};
+EXPORT_SYMBOL(devlink_dpipe_header_ethernet);
+
+static struct devlink_dpipe_field devlink_dpipe_fields_ipv4[] = {
+ {
+ .name = "destination ip",
+ .id = DEVLINK_DPIPE_FIELD_IPV4_DST_IP,
+ .bitwidth = 32,
+ },
+};
+
+struct devlink_dpipe_header devlink_dpipe_header_ipv4 = {
+ .name = "ipv4",
+ .id = DEVLINK_DPIPE_HEADER_IPV4,
+ .fields = devlink_dpipe_fields_ipv4,
+ .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv4),
+ .global = true,
+};
+EXPORT_SYMBOL(devlink_dpipe_header_ipv4);
+
+static struct devlink_dpipe_field devlink_dpipe_fields_ipv6[] = {
+ {
+ .name = "destination ip",
+ .id = DEVLINK_DPIPE_FIELD_IPV6_DST_IP,
+ .bitwidth = 128,
+ },
+};
+
+struct devlink_dpipe_header devlink_dpipe_header_ipv6 = {
+ .name = "ipv6",
+ .id = DEVLINK_DPIPE_HEADER_IPV6,
+ .fields = devlink_dpipe_fields_ipv6,
+ .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv6),
+ .global = true,
+};
+EXPORT_SYMBOL(devlink_dpipe_header_ipv6);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg);
+
+static LIST_HEAD(devlink_list);
+
+/* devlink_mutex
+ *
+ * An overall lock guarding every operation coming from userspace.
+ * It also guards the devlink devices list and is taken when
+ * a driver registers/unregisters a devlink instance.
+ */
+static DEFINE_MUTEX(devlink_mutex);
+
+static struct net *devlink_net(const struct devlink *devlink)
+{
+ return read_pnet(&devlink->_net);
+}
+
+static void devlink_net_set(struct devlink *devlink, struct net *net)
+{
+ write_pnet(&devlink->_net, net);
+}
+
+static struct devlink *devlink_get_from_attrs(struct net *net,
+ struct nlattr **attrs)
+{
+ struct devlink *devlink;
+ char *busname;
+ char *devname;
+
+ if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME])
+ return ERR_PTR(-EINVAL);
+
+ busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]);
+ devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]);
+
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (strcmp(devlink->dev->bus->name, busname) == 0 &&
+ strcmp(dev_name(devlink->dev), devname) == 0 &&
+ net_eq(devlink_net(devlink), net))
+ return devlink;
+ }
+
+ return ERR_PTR(-ENODEV);
+}
+
+static struct devlink *devlink_get_from_info(struct genl_info *info)
+{
+ return devlink_get_from_attrs(genl_info_net(info), info->attrs);
+}
+
+static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
+ int port_index)
+{
+ struct devlink_port *devlink_port;
+
+ list_for_each_entry(devlink_port, &devlink->port_list, list) {
+ if (devlink_port->index == port_index)
+ return devlink_port;
+ }
+ return NULL;
+}
+
+static bool devlink_port_index_exists(struct devlink *devlink, int port_index)
+{
+ return devlink_port_get_by_index(devlink, port_index);
+}
+
+static struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink,
+ struct nlattr **attrs)
+{
+ if (attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
+ struct devlink_port *devlink_port;
+
+ devlink_port = devlink_port_get_by_index(devlink, port_index);
+ if (!devlink_port)
+ return ERR_PTR(-ENODEV);
+ return devlink_port;
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ return devlink_port_get_from_attrs(devlink, info->attrs);
+}
+
+struct devlink_sb {
+ struct list_head list;
+ unsigned int index;
+ u32 size;
+ u16 ingress_pools_count;
+ u16 egress_pools_count;
+ u16 ingress_tc_count;
+ u16 egress_tc_count;
+};
+
+static u16 devlink_sb_pool_count(struct devlink_sb *devlink_sb)
+{
+ return devlink_sb->ingress_pools_count + devlink_sb->egress_pools_count;
+}
+
+static struct devlink_sb *devlink_sb_get_by_index(struct devlink *devlink,
+ unsigned int sb_index)
+{
+ struct devlink_sb *devlink_sb;
+
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ if (devlink_sb->index == sb_index)
+ return devlink_sb;
+ }
+ return NULL;
+}
+
+static bool devlink_sb_index_exists(struct devlink *devlink,
+ unsigned int sb_index)
+{
+ return devlink_sb_get_by_index(devlink, sb_index);
+}
+
+static struct devlink_sb *devlink_sb_get_from_attrs(struct devlink *devlink,
+ struct nlattr **attrs)
+{
+ if (attrs[DEVLINK_ATTR_SB_INDEX]) {
+ u32 sb_index = nla_get_u32(attrs[DEVLINK_ATTR_SB_INDEX]);
+ struct devlink_sb *devlink_sb;
+
+ devlink_sb = devlink_sb_get_by_index(devlink, sb_index);
+ if (!devlink_sb)
+ return ERR_PTR(-ENODEV);
+ return devlink_sb;
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+static struct devlink_sb *devlink_sb_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ return devlink_sb_get_from_attrs(devlink, info->attrs);
+}
+
+static int devlink_sb_pool_index_get_from_attrs(struct devlink_sb *devlink_sb,
+ struct nlattr **attrs,
+ u16 *p_pool_index)
+{
+ u16 val;
+
+ if (!attrs[DEVLINK_ATTR_SB_POOL_INDEX])
+ return -EINVAL;
+
+ val = nla_get_u16(attrs[DEVLINK_ATTR_SB_POOL_INDEX]);
+ if (val >= devlink_sb_pool_count(devlink_sb))
+ return -EINVAL;
+ *p_pool_index = val;
+ return 0;
+}
+
+static int devlink_sb_pool_index_get_from_info(struct devlink_sb *devlink_sb,
+ struct genl_info *info,
+ u16 *p_pool_index)
+{
+ return devlink_sb_pool_index_get_from_attrs(devlink_sb, info->attrs,
+ p_pool_index);
+}
+
+static int
+devlink_sb_pool_type_get_from_attrs(struct nlattr **attrs,
+ enum devlink_sb_pool_type *p_pool_type)
+{
+ u8 val;
+
+ if (!attrs[DEVLINK_ATTR_SB_POOL_TYPE])
+ return -EINVAL;
+
+ val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_TYPE]);
+ if (val != DEVLINK_SB_POOL_TYPE_INGRESS &&
+ val != DEVLINK_SB_POOL_TYPE_EGRESS)
+ return -EINVAL;
+ *p_pool_type = val;
+ return 0;
+}
+
+static int
+devlink_sb_pool_type_get_from_info(struct genl_info *info,
+ enum devlink_sb_pool_type *p_pool_type)
+{
+ return devlink_sb_pool_type_get_from_attrs(info->attrs, p_pool_type);
+}
+
+static int
+devlink_sb_th_type_get_from_attrs(struct nlattr **attrs,
+ enum devlink_sb_threshold_type *p_th_type)
+{
+ u8 val;
+
+ if (!attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE])
+ return -EINVAL;
+
+ val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE]);
+ if (val != DEVLINK_SB_THRESHOLD_TYPE_STATIC &&
+ val != DEVLINK_SB_THRESHOLD_TYPE_DYNAMIC)
+ return -EINVAL;
+ *p_th_type = val;
+ return 0;
+}
+
+static int
+devlink_sb_th_type_get_from_info(struct genl_info *info,
+ enum devlink_sb_threshold_type *p_th_type)
+{
+ return devlink_sb_th_type_get_from_attrs(info->attrs, p_th_type);
+}
+
+static int
+devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb,
+ struct nlattr **attrs,
+ enum devlink_sb_pool_type pool_type,
+ u16 *p_tc_index)
+{
+ u16 val;
+
+ if (!attrs[DEVLINK_ATTR_SB_TC_INDEX])
+ return -EINVAL;
+
+ val = nla_get_u16(attrs[DEVLINK_ATTR_SB_TC_INDEX]);
+ if (pool_type == DEVLINK_SB_POOL_TYPE_INGRESS &&
+ val >= devlink_sb->ingress_tc_count)
+ return -EINVAL;
+ if (pool_type == DEVLINK_SB_POOL_TYPE_EGRESS &&
+ val >= devlink_sb->egress_tc_count)
+ return -EINVAL;
+ *p_tc_index = val;
+ return 0;
+}
+
+static int
+devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
+ struct genl_info *info,
+ enum devlink_sb_pool_type pool_type,
+ u16 *p_tc_index)
+{
+ return devlink_sb_tc_index_get_from_attrs(devlink_sb, info->attrs,
+ pool_type, p_tc_index);
+}
+
+struct devlink_region {
+ struct devlink *devlink;
+ struct list_head list;
+ const char *name;
+ struct list_head snapshot_list;
+ u32 max_snapshots;
+ u32 cur_snapshots;
+ u64 size;
+};
+
+struct devlink_snapshot {
+ struct list_head list;
+ struct devlink_region *region;
+ devlink_snapshot_data_dest_t *data_destructor;
+ u64 data_len;
+ u8 *data;
+ u32 id;
+};
+
+static struct devlink_region *
+devlink_region_get_by_name(struct devlink *devlink, const char *region_name)
+{
+ struct devlink_region *region;
+
+ list_for_each_entry(region, &devlink->region_list, list)
+ if (!strcmp(region->name, region_name))
+ return region;
+
+ return NULL;
+}
+
+static struct devlink_snapshot *
+devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id)
+{
+ struct devlink_snapshot *snapshot;
+
+ list_for_each_entry(snapshot, &region->snapshot_list, list)
+ if (snapshot->id == id)
+ return snapshot;
+
+ return NULL;
+}
+
+static void devlink_region_snapshot_del(struct devlink_snapshot *snapshot)
+{
+ snapshot->region->cur_snapshots--;
+ list_del(&snapshot->list);
+ (*snapshot->data_destructor)(snapshot->data);
+ kfree(snapshot);
+}
+
+#define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0)
+#define DEVLINK_NL_FLAG_NEED_PORT BIT(1)
+#define DEVLINK_NL_FLAG_NEED_SB BIT(2)
+
+/* The per devlink instance lock is taken by default in the pre-doit
+ * operation, yet several commands do not require it. The global
+ * devlink lock is always taken and protects against disruption by
+ * concurrent user calls.
+ */
+#define DEVLINK_NL_FLAG_NO_LOCK BIT(3)
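+
+/* A minimal sketch of how these flags are consumed: each generic netlink
+ * op declares, via .internal_flags, what devlink_nl_pre_doit() must look
+ * up and lock before the handler runs.  The entry below is illustrative
+ * only (the real ops table appears further down in this file):
+ *
+ *	{
+ *		.cmd = DEVLINK_CMD_PORT_GET,
+ *		.doit = devlink_nl_cmd_port_get_doit,
+ *		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+ *	},
+ */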
+
+static int devlink_nl_pre_doit(const struct genl_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ devlink = devlink_get_from_info(info);
+ if (IS_ERR(devlink)) {
+ mutex_unlock(&devlink_mutex);
+ return PTR_ERR(devlink);
+ }
+ if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
+ mutex_lock(&devlink->lock);
+ if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) {
+ info->user_ptr[0] = devlink;
+ } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) {
+ struct devlink_port *devlink_port;
+
+ devlink_port = devlink_port_get_from_info(devlink, info);
+ if (IS_ERR(devlink_port)) {
+ err = PTR_ERR(devlink_port);
+ goto unlock;
+ }
+ info->user_ptr[0] = devlink_port;
+ }
+ if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_SB) {
+ struct devlink_sb *devlink_sb;
+
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb)) {
+ err = PTR_ERR(devlink_sb);
+ goto unlock;
+ }
+ info->user_ptr[1] = devlink_sb;
+ }
+ return 0;
+
+unlock:
+ if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
+ mutex_unlock(&devlink->lock);
+ mutex_unlock(&devlink_mutex);
+ return err;
+}
+
+static void devlink_nl_post_doit(const struct genl_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink;
+
+ devlink = devlink_get_from_info(info);
+ if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
+ mutex_unlock(&devlink->lock);
+ mutex_unlock(&devlink_mutex);
+}
+
+static struct genl_family devlink_nl_family;
+
+enum devlink_multicast_groups {
+ DEVLINK_MCGRP_CONFIG,
+};
+
+static const struct genl_multicast_group devlink_nl_mcgrps[] = {
+ [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME },
+};
+
+static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink)
+{
+ if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name))
+ return -EMSGSIZE;
+ if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev)))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_port_attrs_put(struct sk_buff *msg,
+ struct devlink_port *devlink_port)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+ if (!attrs->set)
+ return 0;
+ if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number))
+ return -EMSGSIZE;
+ if (!attrs->split)
+ return 0;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->port_number))
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,
+ attrs->split_subport_number))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))
+ goto nla_put_failure;
+ if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET &&
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE,
+ devlink_port->desired_type))
+ goto nla_put_failure;
+ if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) {
+ struct net_device *netdev = devlink_port->type_dev;
+
+ if (netdev &&
+ (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,
+ netdev->ifindex) ||
+ nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,
+ netdev->name)))
+ goto nla_put_failure;
+ }
+ if (devlink_port->type == DEVLINK_PORT_TYPE_IB) {
+ struct ib_device *ibdev = devlink_port->type_dev;
+
+ if (ibdev &&
+ nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME,
+ ibdev->name))
+ goto nla_put_failure;
+ }
+ if (devlink_nl_port_attrs_put(msg, devlink_port))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_port_notify(struct devlink_port *devlink_port,
+ enum devlink_command cmd)
+{
+ struct devlink *devlink = devlink_port->devlink;
+ struct sk_buff *msg;
+ int err;
+
+ if (!devlink_port->registered)
+ return;
+
+ WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI);
+ if (err)
+ goto out;
+ idx++;
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink *devlink = devlink_port->devlink;
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_port_fill(msg, devlink, devlink_port,
+ DEVLINK_CMD_PORT_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ struct devlink_port *devlink_port;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+ mutex_lock(&devlink->lock);
+ list_for_each_entry(devlink_port, &devlink->port_list, list) {
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_port_fill(msg, devlink, devlink_port,
+ DEVLINK_CMD_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI);
+ if (err) {
+ mutex_unlock(&devlink->lock);
+ goto out;
+ }
+ idx++;
+ }
+ mutex_unlock(&devlink->lock);
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_port_type_set(struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ enum devlink_port_type port_type)
+{
+ int err;
+
+ if (devlink->ops && devlink->ops->port_type_set) {
+ if (port_type == DEVLINK_PORT_TYPE_NOTSET)
+ return -EINVAL;
+ if (port_type == devlink_port->type)
+ return 0;
+ err = devlink->ops->port_type_set(devlink_port, port_type);
+ if (err)
+ return err;
+ devlink_port->desired_type = port_type;
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+ return 0;
+ }
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink *devlink = devlink_port->devlink;
+ int err;
+
+ if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) {
+ enum devlink_port_type port_type;
+
+ port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]);
+ err = devlink_port_type_set(devlink, devlink_port, port_type);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static int devlink_port_split(struct devlink *devlink, u32 port_index,
+ u32 count, struct netlink_ext_ack *extack)
+{
+ if (devlink->ops && devlink->ops->port_split)
+ return devlink->ops->port_split(devlink, port_index, count,
+ extack);
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ u32 port_index;
+ u32 count;
+
+ if (!info->attrs[DEVLINK_ATTR_PORT_INDEX] ||
+ !info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT])
+ return -EINVAL;
+
+ port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+ count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]);
+ return devlink_port_split(devlink, port_index, count, info->extack);
+}
+
+static int devlink_port_unsplit(struct devlink *devlink, u32 port_index,
+ struct netlink_ext_ack *extack)
+{
+ if (devlink->ops && devlink->ops->port_unsplit)
+ return devlink->ops->port_unsplit(devlink, port_index, extack);
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ u32 port_index;
+
+ if (!info->attrs[DEVLINK_ATTR_PORT_INDEX])
+ return -EINVAL;
+
+ port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+ return devlink_port_unsplit(devlink, port_index, info->extack);
+}
+
+static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_SIZE, devlink_sb->size))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_POOL_COUNT,
+ devlink_sb->ingress_pools_count))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_POOL_COUNT,
+ devlink_sb->egress_pools_count))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_TC_COUNT,
+ devlink_sb->ingress_tc_count))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_TC_COUNT,
+ devlink_sb->egress_tc_count))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_sb_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_sb_fill(msg, devlink, devlink_sb,
+ DEVLINK_CMD_SB_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ struct devlink_sb *devlink_sb;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+ mutex_lock(&devlink->lock);
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_sb_fill(msg, devlink, devlink_sb,
+ DEVLINK_CMD_SB_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI);
+ if (err) {
+ mutex_unlock(&devlink->lock);
+ goto out;
+ }
+ idx++;
+ }
+ mutex_unlock(&devlink->lock);
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ u16 pool_index, enum devlink_command cmd,
+ u32 portid, u32 seq, int flags)
+{
+ struct devlink_sb_pool_info pool_info;
+ void *hdr;
+ int err;
+
+ err = devlink->ops->sb_pool_get(devlink, devlink_sb->index,
+ pool_index, &pool_info);
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_info.pool_type))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_SIZE, pool_info.size))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE,
+ pool_info.threshold_type))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_sb_pool_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ struct sk_buff *msg;
+ u16 pool_index;
+ int err;
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ if (!devlink->ops || !devlink->ops->sb_pool_get)
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_sb_pool_fill(msg, devlink, devlink_sb, pool_index,
+ DEVLINK_CMD_SB_POOL_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx,
+ struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ u32 portid, u32 seq)
+{
+ u16 pool_count = devlink_sb_pool_count(devlink_sb);
+ u16 pool_index;
+ int err;
+
+ for (pool_index = 0; pool_index < pool_count; pool_index++) {
+ if (*p_idx < start) {
+ (*p_idx)++;
+ continue;
+ }
+ err = devlink_nl_sb_pool_fill(msg, devlink,
+ devlink_sb,
+ pool_index,
+ DEVLINK_CMD_SB_POOL_NEW,
+ portid, seq, NLM_F_MULTI);
+ if (err)
+ return err;
+ (*p_idx)++;
+ }
+ return 0;
+}
+
+static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ struct devlink_sb *devlink_sb;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
+ !devlink->ops || !devlink->ops->sb_pool_get)
+ continue;
+ mutex_lock(&devlink->lock);
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ err = __sb_pool_get_dumpit(msg, start, &idx, devlink,
+ devlink_sb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq);
+ if (err && err != -EOPNOTSUPP) {
+ mutex_unlock(&devlink->lock);
+ goto out;
+ }
+ }
+ mutex_unlock(&devlink->lock);
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index,
+ u16 pool_index, u32 size,
+ enum devlink_sb_threshold_type threshold_type)
+{
+ const struct devlink_ops *ops = devlink->ops;
+
+ if (ops && ops->sb_pool_set)
+ return ops->sb_pool_set(devlink, sb_index, pool_index,
+ size, threshold_type);
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ enum devlink_sb_threshold_type threshold_type;
+ u16 pool_index;
+ u32 size;
+ int err;
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ err = devlink_sb_th_type_get_from_info(info, &threshold_type);
+ if (err)
+ return err;
+
+ if (!info->attrs[DEVLINK_ATTR_SB_POOL_SIZE])
+ return -EINVAL;
+
+ size = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]);
+ return devlink_sb_pool_set(devlink, devlink_sb->index,
+ pool_index, size, threshold_type);
+}
+
+static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ struct devlink_sb *devlink_sb,
+ u16 pool_index,
+ enum devlink_command cmd,
+ u32 portid, u32 seq, int flags)
+{
+ const struct devlink_ops *ops = devlink->ops;
+ u32 threshold;
+ void *hdr;
+ int err;
+
+ err = ops->sb_port_pool_get(devlink_port, devlink_sb->index,
+ pool_index, &threshold);
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold))
+ goto nla_put_failure;
+
+ if (ops->sb_occ_port_pool_get) {
+ u32 cur;
+ u32 max;
+
+ err = ops->sb_occ_port_pool_get(devlink_port, devlink_sb->index,
+ pool_index, &cur, &max);
+ if (err && err != -EOPNOTSUPP)
+ goto sb_occ_get_failure;
+ if (!err) {
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max))
+ goto nla_put_failure;
+ }
+ }
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ err = -EMSGSIZE;
+sb_occ_get_failure:
+ genlmsg_cancel(msg, hdr);
+ return err;
+}
+
+static int devlink_nl_cmd_sb_port_pool_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink *devlink = devlink_port->devlink;
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ struct sk_buff *msg;
+ u16 pool_index;
+ int err;
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ if (!devlink->ops || !devlink->ops->sb_port_pool_get)
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_sb_port_pool_fill(msg, devlink, devlink_port,
+ devlink_sb, pool_index,
+ DEVLINK_CMD_SB_PORT_POOL_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx,
+ struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ u32 portid, u32 seq)
+{
+ struct devlink_port *devlink_port;
+ u16 pool_count = devlink_sb_pool_count(devlink_sb);
+ u16 pool_index;
+ int err;
+
+ list_for_each_entry(devlink_port, &devlink->port_list, list) {
+ for (pool_index = 0; pool_index < pool_count; pool_index++) {
+ if (*p_idx < start) {
+ (*p_idx)++;
+ continue;
+ }
+ err = devlink_nl_sb_port_pool_fill(msg, devlink,
+ devlink_port,
+ devlink_sb,
+ pool_index,
+ DEVLINK_CMD_SB_PORT_POOL_NEW,
+ portid, seq,
+ NLM_F_MULTI);
+ if (err)
+ return err;
+ (*p_idx)++;
+ }
+ }
+ return 0;
+}
+
+static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ struct devlink_sb *devlink_sb;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
+ !devlink->ops || !devlink->ops->sb_port_pool_get)
+ continue;
+ mutex_lock(&devlink->lock);
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ err = __sb_port_pool_get_dumpit(msg, start, &idx,
+ devlink, devlink_sb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq);
+ if (err && err != -EOPNOTSUPP) {
+ mutex_unlock(&devlink->lock);
+ goto out;
+ }
+ }
+ mutex_unlock(&devlink->lock);
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_sb_port_pool_set(struct devlink_port *devlink_port,
+ unsigned int sb_index, u16 pool_index,
+ u32 threshold)
+{
+ const struct devlink_ops *ops = devlink_port->devlink->ops;
+
+ if (ops && ops->sb_port_pool_set)
+ return ops->sb_port_pool_set(devlink_port, sb_index,
+ pool_index, threshold);
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_sb_port_pool_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ u16 pool_index;
+ u32 threshold;
+ int err;
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ if (!info->attrs[DEVLINK_ATTR_SB_THRESHOLD])
+ return -EINVAL;
+
+ threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]);
+ return devlink_sb_port_pool_set(devlink_port, devlink_sb->index,
+ pool_index, threshold);
+}
+
+static int
+devlink_nl_sb_tc_pool_bind_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ struct devlink_sb *devlink_sb, u16 tc_index,
+ enum devlink_sb_pool_type pool_type,
+ enum devlink_command cmd,
+ u32 portid, u32 seq, int flags)
+{
+ const struct devlink_ops *ops = devlink->ops;
+ u16 pool_index;
+ u32 threshold;
+ void *hdr;
+ int err;
+
+ err = ops->sb_tc_pool_bind_get(devlink_port, devlink_sb->index,
+ tc_index, pool_type,
+ &pool_index, &threshold);
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_TC_INDEX, tc_index))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_type))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold))
+ goto nla_put_failure;
+
+ if (ops->sb_occ_tc_port_bind_get) {
+ u32 cur;
+ u32 max;
+
+ err = ops->sb_occ_tc_port_bind_get(devlink_port,
+ devlink_sb->index,
+ tc_index, pool_type,
+ &cur, &max);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+ if (!err) {
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max))
+ goto nla_put_failure;
+ }
+ }
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_sb_tc_pool_bind_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink *devlink = devlink_port->devlink;
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ struct sk_buff *msg;
+ enum devlink_sb_pool_type pool_type;
+ u16 tc_index;
+ int err;
+
+ err = devlink_sb_pool_type_get_from_info(info, &pool_type);
+ if (err)
+ return err;
+
+ err = devlink_sb_tc_index_get_from_info(devlink_sb, info,
+ pool_type, &tc_index);
+ if (err)
+ return err;
+
+ if (!devlink->ops || !devlink->ops->sb_tc_pool_bind_get)
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, devlink_port,
+ devlink_sb, tc_index, pool_type,
+ DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
+ info->snd_portid,
+ info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
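+/* Dump TC-to-pool bindings for every port: first all ingress TCs, then
+ * all egress TCs, resuming from *p_idx on a continued dump.
+ */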
+static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
+ int start, int *p_idx,
+ struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ u32 portid, u32 seq)
+{
+ struct devlink_port *devlink_port;
+ u16 tc_index;
+ int err;
+
+ list_for_each_entry(devlink_port, &devlink->port_list, list) {
+ for (tc_index = 0;
+ tc_index < devlink_sb->ingress_tc_count; tc_index++) {
+ if (*p_idx < start) {
+ (*p_idx)++;
+ continue;
+ }
+ err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink,
+ devlink_port,
+ devlink_sb,
+ tc_index,
+ DEVLINK_SB_POOL_TYPE_INGRESS,
+ DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
+ portid, seq,
+ NLM_F_MULTI);
+ if (err)
+ return err;
+ (*p_idx)++;
+ }
+ for (tc_index = 0;
+ tc_index < devlink_sb->egress_tc_count; tc_index++) {
+ if (*p_idx < start) {
+ (*p_idx)++;
+ continue;
+ }
+ err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink,
+ devlink_port,
+ devlink_sb,
+ tc_index,
+ DEVLINK_SB_POOL_TYPE_EGRESS,
+ DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
+ portid, seq,
+ NLM_F_MULTI);
+ if (err)
+ return err;
+ (*p_idx)++;
+ }
+ }
+ return 0;
+}
+
+static int
+devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ struct devlink_sb *devlink_sb;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
+ !devlink->ops || !devlink->ops->sb_tc_pool_bind_get)
+ continue;
+
+ mutex_lock(&devlink->lock);
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ err = __sb_tc_pool_bind_get_dumpit(msg, start, &idx,
+ devlink,
+ devlink_sb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq);
+ if (err && err != -EOPNOTSUPP) {
+ mutex_unlock(&devlink->lock);
+ goto out;
+ }
+ }
+ mutex_unlock(&devlink->lock);
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port,
+ unsigned int sb_index, u16 tc_index,
+ enum devlink_sb_pool_type pool_type,
+ u16 pool_index, u32 threshold)
+{
+ const struct devlink_ops *ops = devlink_port->devlink->ops;
+
+ if (ops && ops->sb_tc_pool_bind_set)
+ return ops->sb_tc_pool_bind_set(devlink_port, sb_index,
+ tc_index, pool_type,
+ pool_index, threshold);
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ enum devlink_sb_pool_type pool_type;
+ u16 tc_index;
+ u16 pool_index;
+ u32 threshold;
+ int err;
+
+ err = devlink_sb_pool_type_get_from_info(info, &pool_type);
+ if (err)
+ return err;
+
+ err = devlink_sb_tc_index_get_from_info(devlink_sb, info,
+ pool_type, &tc_index);
+ if (err)
+ return err;
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ if (!info->attrs[DEVLINK_ATTR_SB_THRESHOLD])
+ return -EINVAL;
+
+ threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]);
+ return devlink_sb_tc_pool_bind_set(devlink_port, devlink_sb->index,
+ tc_index, pool_type,
+ pool_index, threshold);
+}
+
+static int devlink_nl_cmd_sb_occ_snapshot_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ const struct devlink_ops *ops = devlink->ops;
+
+ if (ops && ops->sb_occ_snapshot)
+ return ops->sb_occ_snapshot(devlink, devlink_sb->index);
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ const struct devlink_ops *ops = devlink->ops;
+
+ if (ops && ops->sb_occ_max_clear)
+ return ops->sb_occ_max_clear(devlink, devlink_sb->index);
+ return -EOPNOTSUPP;
+}
+
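+/* Report the current eswitch mode, inline mode and encap mode, skipping
+ * attributes whose get callback the driver does not implement.
+ */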
+static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ const struct devlink_ops *ops = devlink->ops;
+ u8 inline_mode, encap_mode;
+ void *hdr;
+ int err = 0;
+ u16 mode;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ err = devlink_nl_put_handle(msg, devlink);
+ if (err)
+ goto nla_put_failure;
+
+ if (ops->eswitch_mode_get) {
+ err = ops->eswitch_mode_get(devlink, &mode);
+ if (err)
+ goto nla_put_failure;
+ err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ if (ops->eswitch_inline_mode_get) {
+ err = ops->eswitch_inline_mode_get(devlink, &inline_mode);
+ if (err)
+ goto nla_put_failure;
+ err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE,
+ inline_mode);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ if (ops->eswitch_encap_mode_get) {
+ err = ops->eswitch_encap_mode_get(devlink, &encap_mode);
+ if (err)
+ goto nla_put_failure;
+ err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_ENCAP_MODE, encap_mode);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return err;
+}
+
+static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ const struct devlink_ops *ops = devlink->ops;
+ struct sk_buff *msg;
+ int err;
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET,
+ info->snd_portid, info->snd_seq, 0);
+
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ const struct devlink_ops *ops = devlink->ops;
+ u8 inline_mode, encap_mode;
+ int err = 0;
+ u16 mode;
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) {
+ if (!ops->eswitch_mode_set)
+ return -EOPNOTSUPP;
+ mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
+ err = ops->eswitch_mode_set(devlink, mode);
+ if (err)
+ return err;
+ }
+
+ if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) {
+ if (!ops->eswitch_inline_mode_set)
+ return -EOPNOTSUPP;
+ inline_mode = nla_get_u8(
+ info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]);
+ err = ops->eswitch_inline_mode_set(devlink, inline_mode);
+ if (err)
+ return err;
+ }
+
+ if (info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]) {
+ if (!ops->eswitch_encap_mode_set)
+ return -EOPNOTSUPP;
+ encap_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]);
+ err = ops->eswitch_encap_mode_set(devlink, encap_mode);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+int devlink_dpipe_match_put(struct sk_buff *skb,
+ struct devlink_dpipe_match *match)
+{
+ struct devlink_dpipe_header *header = match->header;
+ struct devlink_dpipe_field *field = &header->fields[match->field_id];
+ struct nlattr *match_attr;
+
+ match_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_MATCH);
+ if (!match_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_MATCH_TYPE, match->type) ||
+ nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, match->header_index) ||
+ nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) ||
+ nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) ||
+ nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, match_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, match_attr);
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_match_put);
+
+static int devlink_dpipe_matches_put(struct devlink_dpipe_table *table,
+ struct sk_buff *skb)
+{
+ struct nlattr *matches_attr;
+
+ matches_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLE_MATCHES);
+ if (!matches_attr)
+ return -EMSGSIZE;
+
+ if (table->table_ops->matches_dump(table->priv, skb))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, matches_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, matches_attr);
+ return -EMSGSIZE;
+}
+
+int devlink_dpipe_action_put(struct sk_buff *skb,
+ struct devlink_dpipe_action *action)
+{
+ struct devlink_dpipe_header *header = action->header;
+ struct devlink_dpipe_field *field = &header->fields[action->field_id];
+ struct nlattr *action_attr;
+
+ action_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_ACTION);
+ if (!action_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_ACTION_TYPE, action->type) ||
+ nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, action->header_index) ||
+ nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) ||
+ nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) ||
+ nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, action_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, action_attr);
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_action_put);
+
+static int devlink_dpipe_actions_put(struct devlink_dpipe_table *table,
+ struct sk_buff *skb)
+{
+ struct nlattr *actions_attr;
+
+ actions_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLE_ACTIONS);
+ if (!actions_attr)
+ return -EMSGSIZE;
+
+ if (table->table_ops->actions_dump(table->priv, skb))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, actions_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, actions_attr);
+ return -EMSGSIZE;
+}
+
+static int devlink_dpipe_table_put(struct sk_buff *skb,
+ struct devlink_dpipe_table *table)
+{
+ struct nlattr *table_attr;
+ u64 table_size;
+
+ table_size = table->table_ops->size_get(table->priv);
+ table_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLE);
+ if (!table_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_TABLE_NAME, table->name) ||
+ nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_SIZE, table_size,
+ DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+ if (nla_put_u8(skb, DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED,
+ table->counters_enabled))
+ goto nla_put_failure;
+
+ if (table->resource_valid) {
+ if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,
+ table->resource_id, DEVLINK_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,
+ table->resource_units, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+ }
+ if (devlink_dpipe_matches_put(table, skb))
+ goto nla_put_failure;
+
+ if (devlink_dpipe_actions_put(table, skb))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, table_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, table_attr);
+ return -EMSGSIZE;
+}
+
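+/* Flush the current skb as a reply (if any) and allocate a fresh one.
+ * Used by the dpipe/resource fill helpers to emit multi-part replies
+ * when the dumped objects do not fit into a single message.
+ */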
+static int devlink_dpipe_send_and_alloc_skb(struct sk_buff **pskb,
+ struct genl_info *info)
+{
+ int err;
+
+ if (*pskb) {
+ err = genlmsg_reply(*pskb, info);
+ if (err)
+ return err;
+ }
+ *pskb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!*pskb)
+ return -ENOMEM;
+ return 0;
+}
+
+static int devlink_dpipe_tables_fill(struct genl_info *info,
+ enum devlink_command cmd, int flags,
+ struct list_head *dpipe_tables,
+ const char *table_name)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_dpipe_table *table;
+ struct nlattr *tables_attr;
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ bool incomplete;
+ void *hdr;
+ int i;
+ int err;
+
+ table = list_first_entry(dpipe_tables,
+ struct devlink_dpipe_table, list);
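+ /* If a table does not fit, send the partially filled skb as a reply
+  * and restart here with a fresh skb, continuing from the table that
+  * overflowed.
+  */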
+start_again:
+ err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+ &devlink_nl_family, NLM_F_MULTI, cmd);
+ if (!hdr) {
+ nlmsg_free(skb);
+ return -EMSGSIZE;
+ }
+
+ if (devlink_nl_put_handle(skb, devlink))
+ goto nla_put_failure;
+ tables_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLES);
+ if (!tables_attr)
+ goto nla_put_failure;
+
+ i = 0;
+ incomplete = false;
+ list_for_each_entry_from(table, dpipe_tables, list) {
+ if (!table_name) {
+ err = devlink_dpipe_table_put(skb, table);
+ if (err) {
+ if (!i)
+ goto err_table_put;
+ incomplete = true;
+ break;
+ }
+ } else {
+ if (!strcmp(table->name, table_name)) {
+ err = devlink_dpipe_table_put(skb, table);
+ if (err)
+ break;
+ }
+ }
+ i++;
+ }
+
+ nla_nest_end(skb, tables_attr);
+ genlmsg_end(skb, hdr);
+ if (incomplete)
+ goto start_again;
+
+send_done:
+ nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
+ NLMSG_DONE, 0, flags | NLM_F_MULTI);
+ if (!nlh) {
+ err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+ if (err)
+ return err;
+ goto send_done;
+ }
+
+ return genlmsg_reply(skb, info);
+
+nla_put_failure:
+ err = -EMSGSIZE;
+err_table_put:
+ nlmsg_free(skb);
+ return err;
+}
+
+static int devlink_nl_cmd_dpipe_table_get(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ const char *table_name = NULL;
+
+ if (info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME])
+ table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
+
+ return devlink_dpipe_tables_fill(info, DEVLINK_CMD_DPIPE_TABLE_GET, 0,
+ &devlink->dpipe_table_list,
+ table_name);
+}
+
+static int devlink_dpipe_value_put(struct sk_buff *skb,
+ struct devlink_dpipe_value *value)
+{
+ if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE,
+ value->value_size, value->value))
+ return -EMSGSIZE;
+ if (value->mask)
+ if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE_MASK,
+ value->value_size, value->mask))
+ return -EMSGSIZE;
+ if (value->mapping_valid)
+ if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_VALUE_MAPPING,
+ value->mapping_value))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int devlink_dpipe_action_value_put(struct sk_buff *skb,
+ struct devlink_dpipe_value *value)
+{
+ if (!value->action)
+ return -EINVAL;
+ if (devlink_dpipe_action_put(skb, value->action))
+ return -EMSGSIZE;
+ if (devlink_dpipe_value_put(skb, value))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int devlink_dpipe_action_values_put(struct sk_buff *skb,
+ struct devlink_dpipe_value *values,
+ unsigned int values_count)
+{
+ struct nlattr *action_attr;
+ int i;
+ int err;
+
+ for (i = 0; i < values_count; i++) {
+ action_attr = nla_nest_start(skb,
+ DEVLINK_ATTR_DPIPE_ACTION_VALUE);
+ if (!action_attr)
+ return -EMSGSIZE;
+ err = devlink_dpipe_action_value_put(skb, &values[i]);
+ if (err)
+ goto err_action_value_put;
+ nla_nest_end(skb, action_attr);
+ }
+ return 0;
+
+err_action_value_put:
+ nla_nest_cancel(skb, action_attr);
+ return err;
+}
+
+static int devlink_dpipe_match_value_put(struct sk_buff *skb,
+ struct devlink_dpipe_value *value)
+{
+ if (!value->match)
+ return -EINVAL;
+ if (devlink_dpipe_match_put(skb, value->match))
+ return -EMSGSIZE;
+ if (devlink_dpipe_value_put(skb, value))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int devlink_dpipe_match_values_put(struct sk_buff *skb,
+ struct devlink_dpipe_value *values,
+ unsigned int values_count)
+{
+ struct nlattr *match_attr;
+ int i;
+ int err;
+
+ for (i = 0; i < values_count; i++) {
+ match_attr = nla_nest_start(skb,
+ DEVLINK_ATTR_DPIPE_MATCH_VALUE);
+ if (!match_attr)
+ return -EMSGSIZE;
+ err = devlink_dpipe_match_value_put(skb, &values[i]);
+ if (err)
+ goto err_match_value_put;
+ nla_nest_end(skb, match_attr);
+ }
+ return 0;
+
+err_match_value_put:
+ nla_nest_cancel(skb, match_attr);
+ return err;
+}
+
+static int devlink_dpipe_entry_put(struct sk_buff *skb,
+ struct devlink_dpipe_entry *entry)
+{
+ struct nlattr *entry_attr, *matches_attr, *actions_attr;
+ int err;
+
+ entry_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_ENTRY);
+ if (!entry_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_ENTRY_INDEX, entry->index,
+ DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+ if (entry->counter_valid)
+ if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_ENTRY_COUNTER,
+ entry->counter, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ matches_attr = nla_nest_start(skb,
+ DEVLINK_ATTR_DPIPE_ENTRY_MATCH_VALUES);
+ if (!matches_attr)
+ goto nla_put_failure;
+
+ err = devlink_dpipe_match_values_put(skb, entry->match_values,
+ entry->match_values_count);
+ if (err) {
+ nla_nest_cancel(skb, matches_attr);
+ goto err_match_values_put;
+ }
+ nla_nest_end(skb, matches_attr);
+
+ actions_attr = nla_nest_start(skb,
+ DEVLINK_ATTR_DPIPE_ENTRY_ACTION_VALUES);
+ if (!actions_attr)
+ goto nla_put_failure;
+
+ err = devlink_dpipe_action_values_put(skb, entry->action_values,
+ entry->action_values_count);
+ if (err) {
+ nla_nest_cancel(skb, actions_attr);
+ goto err_action_values_put;
+ }
+ nla_nest_end(skb, actions_attr);
+
+ nla_nest_end(skb, entry_attr);
+ return 0;
+
+nla_put_failure:
+ err = -EMSGSIZE;
+err_match_values_put:
+err_action_values_put:
+ nla_nest_cancel(skb, entry_attr);
+ return err;
+}
+
+static struct devlink_dpipe_table *
+devlink_dpipe_table_find(struct list_head *dpipe_tables,
+ const char *table_name)
+{
+ struct devlink_dpipe_table *table;
+
+ list_for_each_entry_rcu(table, dpipe_tables, list) {
+ if (!strcmp(table->name, table_name))
+ return table;
+ }
+ return NULL;
+}
+
+int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx)
+{
+ struct devlink *devlink;
+ int err;
+
+ err = devlink_dpipe_send_and_alloc_skb(&dump_ctx->skb,
+ dump_ctx->info);
+ if (err)
+ return err;
+
+ dump_ctx->hdr = genlmsg_put(dump_ctx->skb,
+ dump_ctx->info->snd_portid,
+ dump_ctx->info->snd_seq,
+ &devlink_nl_family, NLM_F_MULTI,
+ dump_ctx->cmd);
+ if (!dump_ctx->hdr)
+ goto nla_put_failure;
+
+ devlink = dump_ctx->info->user_ptr[0];
+ if (devlink_nl_put_handle(dump_ctx->skb, devlink))
+ goto nla_put_failure;
+ dump_ctx->nest = nla_nest_start(dump_ctx->skb,
+ DEVLINK_ATTR_DPIPE_ENTRIES);
+ if (!dump_ctx->nest)
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ nlmsg_free(dump_ctx->skb);
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_prepare);
+
+int devlink_dpipe_entry_ctx_append(struct devlink_dpipe_dump_ctx *dump_ctx,
+ struct devlink_dpipe_entry *entry)
+{
+ return devlink_dpipe_entry_put(dump_ctx->skb, entry);
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_append);
+
+int devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx)
+{
+ nla_nest_end(dump_ctx->skb, dump_ctx->nest);
+ genlmsg_end(dump_ctx->skb, dump_ctx->hdr);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_close);
+
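+/* Free the value and mask buffers of all action and match values that
+ * were allocated for a dumped entry.
+ */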
+void devlink_dpipe_entry_clear(struct devlink_dpipe_entry *entry)
+{
+ unsigned int value_count, value_index;
+ struct devlink_dpipe_value *value;
+
+ value = entry->action_values;
+ value_count = entry->action_values_count;
+ for (value_index = 0; value_index < value_count; value_index++) {
+ kfree(value[value_index].value);
+ kfree(value[value_index].mask);
+ }
+
+ value = entry->match_values;
+ value_count = entry->match_values_count;
+ for (value_index = 0; value_index < value_count; value_index++) {
+ kfree(value[value_index].value);
+ kfree(value[value_index].mask);
+ }
+}
+EXPORT_SYMBOL(devlink_dpipe_entry_clear);
+
+static int devlink_dpipe_entries_fill(struct genl_info *info,
+ enum devlink_command cmd, int flags,
+ struct devlink_dpipe_table *table)
+{
+ struct devlink_dpipe_dump_ctx dump_ctx;
+ struct nlmsghdr *nlh;
+ int err;
+
+ dump_ctx.skb = NULL;
+ dump_ctx.cmd = cmd;
+ dump_ctx.info = info;
+
+ err = table->table_ops->entries_dump(table->priv,
+ table->counters_enabled,
+ &dump_ctx);
+ if (err)
+ return err;
+
+send_done:
+ nlh = nlmsg_put(dump_ctx.skb, info->snd_portid, info->snd_seq,
+ NLMSG_DONE, 0, flags | NLM_F_MULTI);
+ if (!nlh) {
+ err = devlink_dpipe_send_and_alloc_skb(&dump_ctx.skb, info);
+ if (err)
+ return err;
+ goto send_done;
+ }
+ return genlmsg_reply(dump_ctx.skb, info);
+}
+
+static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_dpipe_table *table;
+ const char *table_name;
+
+ if (!info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME])
+ return -EINVAL;
+
+ table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
+ table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+ table_name);
+ if (!table)
+ return -EINVAL;
+
+ if (!table->table_ops->entries_dump)
+ return -EINVAL;
+
+ return devlink_dpipe_entries_fill(info, DEVLINK_CMD_DPIPE_ENTRIES_GET,
+ 0, table);
+}
+
+static int devlink_dpipe_fields_put(struct sk_buff *skb,
+ const struct devlink_dpipe_header *header)
+{
+ struct devlink_dpipe_field *field;
+ struct nlattr *field_attr;
+ int i;
+
+ for (i = 0; i < header->fields_count; i++) {
+ field = &header->fields[i];
+ field_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_FIELD);
+ if (!field_attr)
+ return -EMSGSIZE;
+ if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_FIELD_NAME, field->name) ||
+ nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) ||
+ nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_BITWIDTH, field->bitwidth) ||
+ nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_MAPPING_TYPE, field->mapping_type))
+ goto nla_put_failure;
+ nla_nest_end(skb, field_attr);
+ }
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, field_attr);
+ return -EMSGSIZE;
+}
+
+static int devlink_dpipe_header_put(struct sk_buff *skb,
+ struct devlink_dpipe_header *header)
+{
+ struct nlattr *fields_attr, *header_attr;
+ int err;
+
+ header_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_HEADER);
+ if (!header_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_HEADER_NAME, header->name) ||
+ nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) ||
+ nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global))
+ goto nla_put_failure;
+
+ fields_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_HEADER_FIELDS);
+ if (!fields_attr)
+ goto nla_put_failure;
+
+ err = devlink_dpipe_fields_put(skb, header);
+ if (err) {
+ nla_nest_cancel(skb, fields_attr);
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, fields_attr);
+ nla_nest_end(skb, header_attr);
+ return 0;
+
+nla_put_failure:
+ err = -EMSGSIZE;
+ nla_nest_cancel(skb, header_attr);
+ return err;
+}
+
+static int devlink_dpipe_headers_fill(struct genl_info *info,
+ enum devlink_command cmd, int flags,
+ struct devlink_dpipe_headers *dpipe_headers)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct nlattr *headers_attr;
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ void *hdr;
+ int i, j;
+ int err;
+
+ i = 0;
+start_again:
+ err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+ &devlink_nl_family, NLM_F_MULTI, cmd);
+ if (!hdr) {
+ nlmsg_free(skb);
+ return -EMSGSIZE;
+ }
+
+ if (devlink_nl_put_handle(skb, devlink))
+ goto nla_put_failure;
+ headers_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_HEADERS);
+ if (!headers_attr)
+ goto nla_put_failure;
+
+ j = 0;
+ for (; i < dpipe_headers->headers_count; i++) {
+ err = devlink_dpipe_header_put(skb, dpipe_headers->headers[i]);
+ if (err) {
+ if (!j)
+ goto err_table_put;
+ break;
+ }
+ j++;
+ }
+ nla_nest_end(skb, headers_attr);
+ genlmsg_end(skb, hdr);
+ if (i != dpipe_headers->headers_count)
+ goto start_again;
+
+send_done:
+ nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
+ NLMSG_DONE, 0, flags | NLM_F_MULTI);
+ if (!nlh) {
+ err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+ if (err)
+ return err;
+ goto send_done;
+ }
+ return genlmsg_reply(skb, info);
+
+nla_put_failure:
+ err = -EMSGSIZE;
+err_table_put:
+ nlmsg_free(skb);
+ return err;
+}
+
+static int devlink_nl_cmd_dpipe_headers_get(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+
+ if (!devlink->dpipe_headers)
+ return -EOPNOTSUPP;
+ return devlink_dpipe_headers_fill(info, DEVLINK_CMD_DPIPE_HEADERS_GET,
+ 0, devlink->dpipe_headers);
+}
+
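+/* Toggle counters for a dpipe table. Tables whose counters are controlled
+ * externally cannot be changed here; if the requested state already
+ * matches the current one, nothing is done.
+ */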
+static int devlink_dpipe_table_counters_set(struct devlink *devlink,
+ const char *table_name,
+ bool enable)
+{
+ struct devlink_dpipe_table *table;
+
+ table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+ table_name);
+ if (!table)
+ return -EINVAL;
+
+ if (table->counter_control_extern)
+ return -EOPNOTSUPP;
+
+ if (!(table->counters_enabled ^ enable))
+ return 0;
+
+ table->counters_enabled = enable;
+ if (table->table_ops->counters_set_update)
+ table->table_ops->counters_set_update(table->priv, enable);
+ return 0;
+}
+
+static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ const char *table_name;
+ bool counters_enable;
+
+ if (!info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME] ||
+ !info->attrs[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED])
+ return -EINVAL;
+
+ table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
+ counters_enable = !!nla_get_u8(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED]);
+
+ return devlink_dpipe_table_counters_set(devlink, table_name,
+ counters_enable);
+}
+
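+/* Recursively search the resource tree for the given resource_id,
+ * starting from @resource's children, or from the devlink top-level
+ * list when @resource is NULL.
+ */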
+static struct devlink_resource *
+devlink_resource_find(struct devlink *devlink,
+ struct devlink_resource *resource, u64 resource_id)
+{
+ struct list_head *resource_list;
+
+ if (resource)
+ resource_list = &resource->resource_list;
+ else
+ resource_list = &devlink->resource_list;
+
+ list_for_each_entry(resource, resource_list, list) {
+ struct devlink_resource *child_resource;
+
+ if (resource->id == resource_id)
+ return resource;
+
+ child_resource = devlink_resource_find(devlink, resource,
+ resource_id);
+ if (child_resource)
+ return child_resource;
+ }
+ return NULL;
+}
+
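+/* A resource's new size is valid only if the sum of its children's
+ * requested sizes does not exceed it; resources without children are
+ * always considered valid.
+ */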
+static void
+devlink_resource_validate_children(struct devlink_resource *resource)
+{
+ struct devlink_resource *child_resource;
+ bool size_valid = true;
+ u64 parts_size = 0;
+
+ if (list_empty(&resource->resource_list))
+ goto out;
+
+ list_for_each_entry(child_resource, &resource->resource_list, list)
+ parts_size += child_resource->size_new;
+
+ if (parts_size > resource->size_new)
+ size_valid = false;
+out:
+ resource->size_valid = size_valid;
+}
+
+static int
+devlink_resource_validate_size(struct devlink_resource *resource, u64 size,
+ struct netlink_ext_ack *extack)
+{
+ u64 remainder;
+ int err = 0;
+
+ if (size > resource->size_params.size_max) {
+ NL_SET_ERR_MSG_MOD(extack, "Size larger than maximum");
+ err = -EINVAL;
+ }
+
+ if (size < resource->size_params.size_min) {
+ NL_SET_ERR_MSG_MOD(extack, "Size smaller than minimum");
+ err = -EINVAL;
+ }
+
+ div64_u64_rem(size, resource->size_params.size_granularity, &remainder);
+ if (remainder) {
+ NL_SET_ERR_MSG_MOD(extack, "Wrong granularity");
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
+static int devlink_nl_cmd_resource_set(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_resource *resource;
+ u64 resource_id;
+ u64 size;
+ int err;
+
+ if (!info->attrs[DEVLINK_ATTR_RESOURCE_ID] ||
+ !info->attrs[DEVLINK_ATTR_RESOURCE_SIZE])
+ return -EINVAL;
+ resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]);
+
+ resource = devlink_resource_find(devlink, NULL, resource_id);
+ if (!resource)
+ return -EINVAL;
+
+ size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]);
+ err = devlink_resource_validate_size(resource, size, info->extack);
+ if (err)
+ return err;
+
+ resource->size_new = size;
+ devlink_resource_validate_children(resource);
+ if (resource->parent)
+ devlink_resource_validate_children(resource->parent);
+ return 0;
+}
+
+static int
+devlink_resource_size_params_put(struct devlink_resource *resource,
+ struct sk_buff *skb)
+{
+ struct devlink_resource_size_params *size_params;
+
+ size_params = &resource->size_params;
+ if (nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN,
+ size_params->size_granularity, DEVLINK_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX,
+ size_params->size_max, DEVLINK_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN,
+ size_params->size_min, DEVLINK_ATTR_PAD) ||
+ nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int devlink_resource_occ_put(struct devlink_resource *resource,
+ struct sk_buff *skb)
+{
+ if (!resource->occ_get)
+ return 0;
+ return nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC,
+ resource->occ_get(resource->occ_get_priv),
+ DEVLINK_ATTR_PAD);
+}
+
+static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb,
+ struct devlink_resource *resource)
+{
+ struct devlink_resource *child_resource;
+ struct nlattr *child_resource_attr;
+ struct nlattr *resource_attr;
+
+ resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE);
+ if (!resource_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) ||
+ nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size,
+ DEVLINK_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id,
+ DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+ if (resource->size != resource->size_new)
+ nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW,
+ resource->size_new, DEVLINK_ATTR_PAD);
+ if (devlink_resource_occ_put(resource, skb))
+ goto nla_put_failure;
+ if (devlink_resource_size_params_put(resource, skb))
+ goto nla_put_failure;
+ if (list_empty(&resource->resource_list))
+ goto out;
+
+ if (nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_SIZE_VALID,
+ resource->size_valid))
+ goto nla_put_failure;
+
+ child_resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST);
+ if (!child_resource_attr)
+ goto nla_put_failure;
+
+ list_for_each_entry(child_resource, &resource->resource_list, list) {
+ if (devlink_resource_put(devlink, skb, child_resource))
+ goto resource_put_failure;
+ }
+
+ nla_nest_end(skb, child_resource_attr);
+out:
+ nla_nest_end(skb, resource_attr);
+ return 0;
+
+resource_put_failure:
+ nla_nest_cancel(skb, child_resource_attr);
+nla_put_failure:
+ nla_nest_cancel(skb, resource_attr);
+ return -EMSGSIZE;
+}
+
+static int devlink_resource_fill(struct genl_info *info,
+ enum devlink_command cmd, int flags)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_resource *resource;
+ struct nlattr *resources_attr;
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ bool incomplete;
+ void *hdr;
+ int i;
+ int err;
+
+ resource = list_first_entry(&devlink->resource_list,
+ struct devlink_resource, list);
+start_again:
+ err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+ &devlink_nl_family, NLM_F_MULTI, cmd);
+ if (!hdr) {
+ nlmsg_free(skb);
+ return -EMSGSIZE;
+ }
+
+ if (devlink_nl_put_handle(skb, devlink))
+ goto nla_put_failure;
+
+ resources_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST);
+ if (!resources_attr)
+ goto nla_put_failure;
+
+ incomplete = false;
+ i = 0;
+ list_for_each_entry_from(resource, &devlink->resource_list, list) {
+ err = devlink_resource_put(devlink, skb, resource);
+ if (err) {
+ if (!i)
+ goto err_resource_put;
+ incomplete = true;
+ break;
+ }
+ i++;
+ }
+ nla_nest_end(skb, resources_attr);
+ genlmsg_end(skb, hdr);
+ if (incomplete)
+ goto start_again;
+send_done:
+ nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
+ NLMSG_DONE, 0, flags | NLM_F_MULTI);
+ if (!nlh) {
+ err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+ if (err)
+ return err;
+ goto send_done;
+ }
+ return genlmsg_reply(skb, info);
+
+nla_put_failure:
+ err = -EMSGSIZE;
+err_resource_put:
+ nlmsg_free(skb);
+ return err;
+}
+
+static int devlink_nl_cmd_resource_dump(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+
+ if (list_empty(&devlink->resource_list))
+ return -EOPNOTSUPP;
+
+ return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0);
+}
+
+static int
+devlink_resources_validate(struct devlink *devlink,
+ struct devlink_resource *resource,
+ struct genl_info *info)
+{
+ struct list_head *resource_list;
+ int err = 0;
+
+ if (resource)
+ resource_list = &resource->resource_list;
+ else
+ resource_list = &devlink->resource_list;
+
+ list_for_each_entry(resource, resource_list, list) {
+ if (!resource->size_valid)
+ return -EINVAL;
+ err = devlink_resources_validate(devlink, resource, info);
+ if (err)
+ return err;
+ }
+ return err;
+}
+
+static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ int err;
+
+ if (!devlink->ops->reload)
+ return -EOPNOTSUPP;
+
+ err = devlink_resources_validate(devlink, NULL, info);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed");
+ return err;
+ }
+ return devlink->ops->reload(devlink, info->extack);
+}
+
+static const struct devlink_param devlink_param_generic[] = {
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET,
+ .name = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_NAME,
+ .type = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_MAX_MACS,
+ .name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME,
+ .type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT,
+ .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME,
+ .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE,
+ },
+};
+
+static int devlink_param_generic_verify(const struct devlink_param *param)
+{
+ /* verify that the param matches a generic param by id and name */
+ if (param->id > DEVLINK_PARAM_GENERIC_ID_MAX)
+ return -EINVAL;
+ if (strcmp(param->name, devlink_param_generic[param->id].name))
+ return -ENOENT;
+
+ WARN_ON(param->type != devlink_param_generic[param->id].type);
+
+ return 0;
+}
+
+static int devlink_param_driver_verify(const struct devlink_param *param)
+{
+ int i;
+
+ if (param->id <= DEVLINK_PARAM_GENERIC_ID_MAX)
+ return -EINVAL;
+ /* verify no such name in generic params */
+ for (i = 0; i <= DEVLINK_PARAM_GENERIC_ID_MAX; i++)
+ if (!strcmp(param->name, devlink_param_generic[i].name))
+ return -EEXIST;
+
+ return 0;
+}
+
+static struct devlink_param_item *
+devlink_param_find_by_name(struct list_head *param_list,
+ const char *param_name)
+{
+ struct devlink_param_item *param_item;
+
+ list_for_each_entry(param_item, param_list, list)
+ if (!strcmp(param_item->param->name, param_name))
+ return param_item;
+ return NULL;
+}
+
+static struct devlink_param_item *
+devlink_param_find_by_id(struct list_head *param_list, u32 param_id)
+{
+ struct devlink_param_item *param_item;
+
+ list_for_each_entry(param_item, param_list, list)
+ if (param_item->param->id == param_id)
+ return param_item;
+ return NULL;
+}
+
+static bool
+devlink_param_cmode_is_supported(const struct devlink_param *param,
+ enum devlink_param_cmode cmode)
+{
+ return test_bit(cmode, &param->supported_cmodes);
+}
+
+static int devlink_param_get(struct devlink *devlink,
+ const struct devlink_param *param,
+ struct devlink_param_gset_ctx *ctx)
+{
+ if (!param->get)
+ return -EOPNOTSUPP;
+ return param->get(devlink, param->id, ctx);
+}
+
+static int devlink_param_set(struct devlink *devlink,
+ const struct devlink_param *param,
+ struct devlink_param_gset_ctx *ctx)
+{
+ if (!param->set)
+ return -EOPNOTSUPP;
+ return param->set(devlink, param->id, ctx);
+}
+
+static int
+devlink_param_type_to_nla_type(enum devlink_param_type param_type)
+{
+ switch (param_type) {
+ case DEVLINK_PARAM_TYPE_U8:
+ return NLA_U8;
+ case DEVLINK_PARAM_TYPE_U16:
+ return NLA_U16;
+ case DEVLINK_PARAM_TYPE_U32:
+ return NLA_U32;
+ case DEVLINK_PARAM_TYPE_STRING:
+ return NLA_STRING;
+ case DEVLINK_PARAM_TYPE_BOOL:
+ return NLA_FLAG;
+ default:
+ return -EINVAL;
+ }
+}
+
+static int
+devlink_nl_param_value_fill_one(struct sk_buff *msg,
+ enum devlink_param_type type,
+ enum devlink_param_cmode cmode,
+ union devlink_param_value val)
+{
+ struct nlattr *param_value_attr;
+
+ param_value_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM_VALUE);
+ if (!param_value_attr)
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode))
+ goto value_nest_cancel;
+
+ switch (type) {
+ case DEVLINK_PARAM_TYPE_U8:
+ if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu8))
+ goto value_nest_cancel;
+ break;
+ case DEVLINK_PARAM_TYPE_U16:
+ if (nla_put_u16(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu16))
+ goto value_nest_cancel;
+ break;
+ case DEVLINK_PARAM_TYPE_U32:
+ if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32))
+ goto value_nest_cancel;
+ break;
+ case DEVLINK_PARAM_TYPE_STRING:
+ if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA,
+ val.vstr))
+ goto value_nest_cancel;
+ break;
+ case DEVLINK_PARAM_TYPE_BOOL:
+ if (val.vbool &&
+ nla_put_flag(msg, DEVLINK_ATTR_PARAM_VALUE_DATA))
+ goto value_nest_cancel;
+ break;
+ }
+
+ nla_nest_end(msg, param_value_attr);
+ return 0;
+
+value_nest_cancel:
+ nla_nest_cancel(msg, param_value_attr);
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
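+/* Build one PARAM_GET message: the parameter's name, type and, for each
+ * supported configuration mode, its current value.
+ */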
+static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_param_item *param_item,
+ enum devlink_command cmd,
+ u32 portid, u32 seq, int flags)
+{
+ union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1];
+ const struct devlink_param *param = param_item->param;
+ struct devlink_param_gset_ctx ctx;
+ struct nlattr *param_values_list;
+ struct nlattr *param_attr;
+ int nla_type;
+ void *hdr;
+ int err;
+ int i;
+
+ /* Gather a value for each supported configuration mode: driverinit
+  * values come from the cached copy, other modes query the driver.
+  */
+ for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) {
+ if (!devlink_param_cmode_is_supported(param, i))
+ continue;
+ if (i == DEVLINK_PARAM_CMODE_DRIVERINIT) {
+ if (!param_item->driverinit_value_valid)
+ return -EOPNOTSUPP;
+ param_value[i] = param_item->driverinit_value;
+ } else {
+ ctx.cmode = i;
+ err = devlink_param_get(devlink, param, &ctx);
+ if (err)
+ return err;
+ param_value[i] = ctx.val;
+ }
+ }
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto genlmsg_cancel;
+ param_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM);
+ if (!param_attr)
+ goto genlmsg_cancel;
+ if (nla_put_string(msg, DEVLINK_ATTR_PARAM_NAME, param->name))
+ goto param_nest_cancel;
+ if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC))
+ goto param_nest_cancel;
+
+ nla_type = devlink_param_type_to_nla_type(param->type);
+ if (nla_type < 0)
+ goto param_nest_cancel;
+ if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, nla_type))
+ goto param_nest_cancel;
+
+ param_values_list = nla_nest_start(msg, DEVLINK_ATTR_PARAM_VALUES_LIST);
+ if (!param_values_list)
+ goto param_nest_cancel;
+
+ for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) {
+ if (!devlink_param_cmode_is_supported(param, i))
+ continue;
+ err = devlink_nl_param_value_fill_one(msg, param->type,
+ i, param_value[i]);
+ if (err)
+ goto values_list_nest_cancel;
+ }
+
+ nla_nest_end(msg, param_values_list);
+ nla_nest_end(msg, param_attr);
+ genlmsg_end(msg, hdr);
+ return 0;
+
+values_list_nest_cancel:
+ nla_nest_end(msg, param_values_list);
+param_nest_cancel:
+ nla_nest_cancel(msg, param_attr);
+genlmsg_cancel:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_param_notify(struct devlink *devlink,
+ struct devlink_param_item *param_item,
+ enum devlink_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+ err = devlink_nl_param_fill(msg, devlink, param_item, cmd, 0, 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink_param_item *param_item;
+ struct devlink *devlink;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+ mutex_lock(&devlink->lock);
+ list_for_each_entry(param_item, &devlink->param_list, list) {
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_param_fill(msg, devlink, param_item,
+ DEVLINK_CMD_PARAM_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI);
+ if (err) {
+ mutex_unlock(&devlink->lock);
+ goto out;
+ }
+ idx++;
+ }
+ mutex_unlock(&devlink->lock);
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int
+devlink_param_type_get_from_info(struct genl_info *info,
+ enum devlink_param_type *param_type)
+{
+ if (!info->attrs[DEVLINK_ATTR_PARAM_TYPE])
+ return -EINVAL;
+
+ switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) {
+ case NLA_U8:
+ *param_type = DEVLINK_PARAM_TYPE_U8;
+ break;
+ case NLA_U16:
+ *param_type = DEVLINK_PARAM_TYPE_U16;
+ break;
+ case NLA_U32:
+ *param_type = DEVLINK_PARAM_TYPE_U32;
+ break;
+ case NLA_STRING:
+ *param_type = DEVLINK_PARAM_TYPE_STRING;
+ break;
+ case NLA_FLAG:
+ *param_type = DEVLINK_PARAM_TYPE_BOOL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
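+/* Extract a parameter value of the expected type from the request.
+ * String values must be NUL-terminated and shorter than
+ * __DEVLINK_PARAM_MAX_STRING_VALUE; BOOL is encoded as a flag attribute.
+ */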
+static int
+devlink_param_value_get_from_info(const struct devlink_param *param,
+ struct genl_info *info,
+ union devlink_param_value *value)
+{
+ struct nlattr *param_data;
+ int len;
+
+ param_data = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA];
+
+ if (param->type != DEVLINK_PARAM_TYPE_BOOL && !param_data)
+ return -EINVAL;
+
+ switch (param->type) {
+ case DEVLINK_PARAM_TYPE_U8:
+ if (nla_len(param_data) != sizeof(u8))
+ return -EINVAL;
+ value->vu8 = nla_get_u8(param_data);
+ break;
+ case DEVLINK_PARAM_TYPE_U16:
+ if (nla_len(param_data) != sizeof(u16))
+ return -EINVAL;
+ value->vu16 = nla_get_u16(param_data);
+ break;
+ case DEVLINK_PARAM_TYPE_U32:
+ if (nla_len(param_data) != sizeof(u32))
+ return -EINVAL;
+ value->vu32 = nla_get_u32(param_data);
+ break;
+ case DEVLINK_PARAM_TYPE_STRING:
+ len = strnlen(nla_data(param_data), nla_len(param_data));
+ if (len == nla_len(param_data) ||
+ len >= __DEVLINK_PARAM_MAX_STRING_VALUE)
+ return -EINVAL;
+ strcpy(value->vstr, nla_data(param_data));
+ break;
+ case DEVLINK_PARAM_TYPE_BOOL:
+ if (param_data && nla_len(param_data))
+ return -EINVAL;
+ value->vbool = nla_get_flag(param_data);
+ break;
+ }
+ return 0;
+}
+
+static struct devlink_param_item *
+devlink_param_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ char *param_name;
+
+ if (!info->attrs[DEVLINK_ATTR_PARAM_NAME])
+ return NULL;
+
+ param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]);
+ return devlink_param_find_by_name(&devlink->param_list, param_name);
+}
+
+static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_param_item *param_item;
+ struct sk_buff *msg;
+ int err;
+
+ param_item = devlink_param_get_from_info(devlink, info);
+ if (!param_item)
+ return -EINVAL;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_param_fill(msg, devlink, param_item,
+ DEVLINK_CMD_PARAM_GET,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ enum devlink_param_type param_type;
+ struct devlink_param_gset_ctx ctx;
+ enum devlink_param_cmode cmode;
+ struct devlink_param_item *param_item;
+ const struct devlink_param *param;
+ union devlink_param_value value;
+ int err = 0;
+
+ param_item = devlink_param_get_from_info(devlink, info);
+ if (!param_item)
+ return -EINVAL;
+ param = param_item->param;
+ err = devlink_param_type_get_from_info(info, &param_type);
+ if (err)
+ return err;
+ if (param_type != param->type)
+ return -EINVAL;
+ err = devlink_param_value_get_from_info(param, info, &value);
+ if (err)
+ return err;
+ if (param->validate) {
+ err = param->validate(devlink, param->id, value, info->extack);
+ if (err)
+ return err;
+ }
+
+ if (!info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE])
+ return -EINVAL;
+ cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]);
+ if (!devlink_param_cmode_is_supported(param, cmode))
+ return -EOPNOTSUPP;
+
+ if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) {
+ if (param->type == DEVLINK_PARAM_TYPE_STRING)
+ strcpy(param_item->driverinit_value.vstr, value.vstr);
+ else
+ param_item->driverinit_value = value;
+ param_item->driverinit_value_valid = true;
+ } else {
+ if (!param->set)
+ return -EOPNOTSUPP;
+ ctx.val = value;
+ ctx.cmode = cmode;
+ err = devlink_param_set(devlink, param, &ctx);
+ if (err)
+ return err;
+ }
+
+ devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
+ return 0;
+}
+
+static int devlink_param_register_one(struct devlink *devlink,
+ const struct devlink_param *param)
+{
+ struct devlink_param_item *param_item;
+
+ if (devlink_param_find_by_name(&devlink->param_list,
+ param->name))
+ return -EEXIST;
+
+ if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT))
+ WARN_ON(param->get || param->set);
+ else
+ WARN_ON(!param->get || !param->set);
+
+ param_item = kzalloc(sizeof(*param_item), GFP_KERNEL);
+ if (!param_item)
+ return -ENOMEM;
+ param_item->param = param;
+
+ list_add_tail(&param_item->list, &devlink->param_list);
+ devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
+ return 0;
+}
+
+static void devlink_param_unregister_one(struct devlink *devlink,
+ const struct devlink_param *param)
+{
+ struct devlink_param_item *param_item;
+
+ param_item = devlink_param_find_by_name(&devlink->param_list,
+ param->name);
+ WARN_ON(!param_item);
+ devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_DEL);
+ list_del(&param_item->list);
+ kfree(param_item);
+}
+
+static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct devlink_snapshot *snapshot)
+{
+ struct nlattr *snap_attr;
+ int err;
+
+ snap_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_SNAPSHOT);
+ if (!snap_attr)
+ return -EINVAL;
+
+ err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id);
+ if (err)
+ goto nla_put_failure;
+
+ nla_nest_end(msg, snap_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, snap_attr);
+ return err;
+}
+
+static int devlink_nl_region_snapshots_id_put(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct devlink_region *region)
+{
+ struct devlink_snapshot *snapshot;
+ struct nlattr *snapshots_attr;
+ int err;
+
+ snapshots_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_SNAPSHOTS);
+ if (!snapshots_attr)
+ return -EINVAL;
+
+ list_for_each_entry(snapshot, &region->snapshot_list, list) {
+ err = devlink_nl_region_snapshot_id_put(msg, devlink, snapshot);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(msg, snapshots_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, snapshots_attr);
+ return err;
+}
+
+static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags,
+ struct devlink_region *region)
+{
+ void *hdr;
+ int err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ err = devlink_nl_put_handle(msg, devlink);
+ if (err)
+ goto nla_put_failure;
+
+ err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->name);
+ if (err)
+ goto nla_put_failure;
+
+ err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE,
+ region->size,
+ DEVLINK_ATTR_PAD);
+ if (err)
+ goto nla_put_failure;
+
+ err = devlink_nl_region_snapshots_id_put(msg, devlink, region);
+ if (err)
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return err;
+}
+
+static void devlink_nl_region_notify(struct devlink_region *region,
+ struct devlink_snapshot *snapshot,
+ enum devlink_command cmd)
+{
+ struct devlink *devlink = region->devlink;
+ struct sk_buff *msg;
+ void *hdr;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd);
+ if (!hdr)
+ goto out_free_msg;
+
+ err = devlink_nl_put_handle(msg, devlink);
+ if (err)
+ goto out_cancel_msg;
+
+ err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME,
+ region->name);
+ if (err)
+ goto out_cancel_msg;
+
+ if (snapshot) {
+ err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID,
+ snapshot->id);
+ if (err)
+ goto out_cancel_msg;
+ } else {
+ err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE,
+ region->size, DEVLINK_ATTR_PAD);
+ if (err)
+ goto out_cancel_msg;
+ }
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+
+ return;
+
+out_cancel_msg:
+ genlmsg_cancel(msg, hdr);
+out_free_msg:
+ nlmsg_free(msg);
+}
+
+static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_region *region;
+ const char *region_name;
+ struct sk_buff *msg;
+ int err;
+
+ if (!info->attrs[DEVLINK_ATTR_REGION_NAME])
+ return -EINVAL;
+
+ region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
+ region = devlink_region_get_by_name(devlink, region_name);
+ if (!region)
+ return -EINVAL;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_region_fill(msg, devlink, DEVLINK_CMD_REGION_GET,
+ info->snd_portid, info->snd_seq, 0,
+ region);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink_region *region;
+ struct devlink *devlink;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+
+ mutex_lock(&devlink->lock);
+ list_for_each_entry(region, &devlink->region_list, list) {
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_region_fill(msg, devlink,
+ DEVLINK_CMD_REGION_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, region);
+ if (err) {
+ mutex_unlock(&devlink->lock);
+ goto out;
+ }
+ idx++;
+ }
+ mutex_unlock(&devlink->lock);
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_nl_cmd_region_del(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_snapshot *snapshot;
+ struct devlink_region *region;
+ const char *region_name;
+ u32 snapshot_id;
+
+ if (!info->attrs[DEVLINK_ATTR_REGION_NAME] ||
+ !info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID])
+ return -EINVAL;
+
+ region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
+ snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]);
+
+ region = devlink_region_get_by_name(devlink, region_name);
+ if (!region)
+ return -EINVAL;
+
+ snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id);
+ if (!snapshot)
+ return -EINVAL;
+
+ devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL);
+ devlink_region_snapshot_del(snapshot);
+ return 0;
+}
+
+static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg,
+ struct devlink *devlink,
+ u8 *chunk, u32 chunk_size,
+ u64 addr)
+{
+ struct nlattr *chunk_attr;
+ int err;
+
+ chunk_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_CHUNK);
+ if (!chunk_attr)
+ return -EINVAL;
+
+ err = nla_put(msg, DEVLINK_ATTR_REGION_CHUNK_DATA, chunk_size, chunk);
+ if (err)
+ goto nla_put_failure;
+
+ err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_CHUNK_ADDR, addr,
+ DEVLINK_ATTR_PAD);
+ if (err)
+ goto nla_put_failure;
+
+ nla_nest_end(msg, chunk_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, chunk_attr);
+ return err;
+}
+
+#define DEVLINK_REGION_READ_CHUNK_SIZE 256
+
+static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb,
+ struct devlink *devlink,
+ struct devlink_region *region,
+ struct nlattr **attrs,
+ u64 start_offset,
+ u64 end_offset,
+ bool dump,
+ u64 *new_offset)
+{
+ struct devlink_snapshot *snapshot;
+ u64 curr_offset = start_offset;
+ u32 snapshot_id;
+ int err = 0;
+
+ *new_offset = start_offset;
+
+ snapshot_id = nla_get_u32(attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]);
+ snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id);
+ if (!snapshot)
+ return -EINVAL;
+
+ if (end_offset > snapshot->data_len || dump)
+ end_offset = snapshot->data_len;
+
+ while (curr_offset < end_offset) {
+ u32 data_size;
+ u8 *data;
+
+ if (end_offset - curr_offset < DEVLINK_REGION_READ_CHUNK_SIZE)
+ data_size = end_offset - curr_offset;
+ else
+ data_size = DEVLINK_REGION_READ_CHUNK_SIZE;
+
+ data = &snapshot->data[curr_offset];
+ err = devlink_nl_cmd_region_read_chunk_fill(skb, devlink,
+ data, data_size,
+ curr_offset);
+ if (err)
+ break;
+
+ curr_offset += data_size;
+ }
+ *new_offset = curr_offset;
+
+ return err;
+}
+
+static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ u64 ret_offset, start_offset, end_offset = 0;
+ struct nlattr *attrs[DEVLINK_ATTR_MAX + 1];
+ const struct genl_ops *ops = cb->data;
+ struct devlink_region *region;
+ struct nlattr *chunks_attr;
+ const char *region_name;
+ struct devlink *devlink;
+ bool dump = true;
+ void *hdr;
+ int err;
+
+ start_offset = *((u64 *)&cb->args[0]);
+
+ err = nlmsg_parse(cb->nlh, GENL_HDRLEN + devlink_nl_family.hdrsize,
+ attrs, DEVLINK_ATTR_MAX, ops->policy, NULL);
+ if (err)
+ goto out;
+
+ devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs);
+ if (IS_ERR(devlink))
+ goto out;
+
+ mutex_lock(&devlink_mutex);
+ mutex_lock(&devlink->lock);
+
+ if (!attrs[DEVLINK_ATTR_REGION_NAME] ||
+ !attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID])
+ goto out_unlock;
+
+ region_name = nla_data(attrs[DEVLINK_ATTR_REGION_NAME]);
+ region = devlink_region_get_by_name(devlink, region_name);
+ if (!region)
+ goto out_unlock;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI,
+ DEVLINK_CMD_REGION_READ);
+ if (!hdr)
+ goto out_unlock;
+
+ err = devlink_nl_put_handle(skb, devlink);
+ if (err)
+ goto nla_put_failure;
+
+ err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name);
+ if (err)
+ goto nla_put_failure;
+
+ chunks_attr = nla_nest_start(skb, DEVLINK_ATTR_REGION_CHUNKS);
+ if (!chunks_attr)
+ goto nla_put_failure;
+
+ if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] &&
+ attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) {
+ if (!start_offset)
+ start_offset =
+ nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
+
+ end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
+ end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]);
+ dump = false;
+ }
+
+ err = devlink_nl_region_read_snapshot_fill(skb, devlink,
+ region, attrs,
+ start_offset,
+ end_offset, dump,
+ &ret_offset);
+
+ if (err && err != -EMSGSIZE)
+ goto nla_put_failure;
+
+ /* Check if there was any progress done to prevent infinite loop */
+ if (ret_offset == start_offset)
+ goto nla_put_failure;
+
+ *((u64 *)&cb->args[0]) = ret_offset;
+
+ nla_nest_end(skb, chunks_attr);
+ genlmsg_end(skb, hdr);
+ mutex_unlock(&devlink->lock);
+ mutex_unlock(&devlink_mutex);
+
+ return skb->len;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+out_unlock:
+ mutex_unlock(&devlink->lock);
+ mutex_unlock(&devlink_mutex);
+out:
+ return 0;
+}
+
+static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_PORT_TYPE] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_SB_POOL_TYPE] = { .type = NLA_U8 },
+ [DEVLINK_ATTR_SB_POOL_SIZE] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 },
+ [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 },
+ [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 },
+ [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 },
+ [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64},
+ [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64},
+ [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 },
+ [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 },
+ [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_REGION_CHUNK_ADDR] = { .type = NLA_U64 },
+ [DEVLINK_ATTR_REGION_CHUNK_LEN] = { .type = NLA_U64 },
+};
+
+static const struct genl_ops devlink_nl_ops[] = {
+ {
+ .cmd = DEVLINK_CMD_GET,
+ .doit = devlink_nl_cmd_get_doit,
+ .dumpit = devlink_nl_cmd_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_GET,
+ .doit = devlink_nl_cmd_port_get_doit,
+ .dumpit = devlink_nl_cmd_port_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_SET,
+ .doit = devlink_nl_cmd_port_set_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_SPLIT,
+ .doit = devlink_nl_cmd_port_split_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NO_LOCK,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_UNSPLIT,
+ .doit = devlink_nl_cmd_port_unsplit_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NO_LOCK,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_GET,
+ .doit = devlink_nl_cmd_sb_get_doit,
+ .dumpit = devlink_nl_cmd_sb_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NEED_SB,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_POOL_GET,
+ .doit = devlink_nl_cmd_sb_pool_get_doit,
+ .dumpit = devlink_nl_cmd_sb_pool_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NEED_SB,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_POOL_SET,
+ .doit = devlink_nl_cmd_sb_pool_set_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NEED_SB,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_PORT_POOL_GET,
+ .doit = devlink_nl_cmd_sb_port_pool_get_doit,
+ .dumpit = devlink_nl_cmd_sb_port_pool_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT |
+ DEVLINK_NL_FLAG_NEED_SB,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_PORT_POOL_SET,
+ .doit = devlink_nl_cmd_sb_port_pool_set_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT |
+ DEVLINK_NL_FLAG_NEED_SB,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET,
+ .doit = devlink_nl_cmd_sb_tc_pool_bind_get_doit,
+ .dumpit = devlink_nl_cmd_sb_tc_pool_bind_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT |
+ DEVLINK_NL_FLAG_NEED_SB,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_SET,
+ .doit = devlink_nl_cmd_sb_tc_pool_bind_set_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT |
+ DEVLINK_NL_FLAG_NEED_SB,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_OCC_SNAPSHOT,
+ .doit = devlink_nl_cmd_sb_occ_snapshot_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NEED_SB,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR,
+ .doit = devlink_nl_cmd_sb_occ_max_clear_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NEED_SB,
+ },
+ {
+ .cmd = DEVLINK_CMD_ESWITCH_GET,
+ .doit = devlink_nl_cmd_eswitch_get_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_ESWITCH_SET,
+ .doit = devlink_nl_cmd_eswitch_set_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NO_LOCK,
+ },
+ {
+ .cmd = DEVLINK_CMD_DPIPE_TABLE_GET,
+ .doit = devlink_nl_cmd_dpipe_table_get,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET,
+ .doit = devlink_nl_cmd_dpipe_entries_get,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_DPIPE_HEADERS_GET,
+ .doit = devlink_nl_cmd_dpipe_headers_get,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET,
+ .doit = devlink_nl_cmd_dpipe_table_counters_set,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_RESOURCE_SET,
+ .doit = devlink_nl_cmd_resource_set,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_RESOURCE_DUMP,
+ .doit = devlink_nl_cmd_resource_dump,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_RELOAD,
+ .doit = devlink_nl_cmd_reload,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NO_LOCK,
+ },
+ {
+ .cmd = DEVLINK_CMD_PARAM_GET,
+ .doit = devlink_nl_cmd_param_get_doit,
+ .dumpit = devlink_nl_cmd_param_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_PARAM_SET,
+ .doit = devlink_nl_cmd_param_set_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_REGION_GET,
+ .doit = devlink_nl_cmd_region_get_doit,
+ .dumpit = devlink_nl_cmd_region_get_dumpit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_REGION_DEL,
+ .doit = devlink_nl_cmd_region_del,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_REGION_READ,
+ .dumpit = devlink_nl_cmd_region_read_dumpit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+};
+
+static struct genl_family devlink_nl_family __ro_after_init = {
+ .name = DEVLINK_GENL_NAME,
+ .version = DEVLINK_GENL_VERSION,
+ .maxattr = DEVLINK_ATTR_MAX,
+ .netnsok = true,
+ .pre_doit = devlink_nl_pre_doit,
+ .post_doit = devlink_nl_post_doit,
+ .module = THIS_MODULE,
+ .ops = devlink_nl_ops,
+ .n_ops = ARRAY_SIZE(devlink_nl_ops),
+ .mcgrps = devlink_nl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps),
+};
+
+/**
+ * devlink_alloc - Allocate new devlink instance resources
+ *
+ * @ops: devlink instance ops
+ * @priv_size: size of user private data
+ *
+ * Allocate new devlink instance resources, including devlink index
+ * and name.
+ */
+struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
+{
+ struct devlink *devlink;
+
+ devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL);
+ if (!devlink)
+ return NULL;
+ devlink->ops = ops;
+ devlink_net_set(devlink, &init_net);
+ INIT_LIST_HEAD(&devlink->port_list);
+ INIT_LIST_HEAD(&devlink->sb_list);
+ INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
+ INIT_LIST_HEAD(&devlink->resource_list);
+ INIT_LIST_HEAD(&devlink->param_list);
+ INIT_LIST_HEAD(&devlink->region_list);
+ mutex_init(&devlink->lock);
+ return devlink;
+}
+EXPORT_SYMBOL_GPL(devlink_alloc);
+
+/**
+ * devlink_register - Register devlink instance
+ *
+ * @devlink: devlink
+ * @dev: parent device
+ */
+int devlink_register(struct devlink *devlink, struct device *dev)
+{
+ mutex_lock(&devlink_mutex);
+ devlink->dev = dev;
+ list_add_tail(&devlink->list, &devlink_list);
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
+ mutex_unlock(&devlink_mutex);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_register);
+
+/**
+ * devlink_unregister - Unregister devlink instance
+ *
+ * @devlink: devlink
+ */
+void devlink_unregister(struct devlink *devlink)
+{
+ mutex_lock(&devlink_mutex);
+ devlink_notify(devlink, DEVLINK_CMD_DEL);
+ list_del(&devlink->list);
+ mutex_unlock(&devlink_mutex);
+}
+EXPORT_SYMBOL_GPL(devlink_unregister);
+
+/**
+ * devlink_free - Free devlink instance resources
+ *
+ * @devlink: devlink
+ */
+void devlink_free(struct devlink *devlink)
+{
+ kfree(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_free);
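+
+/* Illustrative usage sketch (not part of this patch): a typical instance
+ * lifecycle in a driver. "my_devlink_ops", "my_priv" and "pdev" are made-up
+ * names used only for illustration.
+ *
+ *	struct devlink *dl;
+ *	int err;
+ *
+ *	dl = devlink_alloc(&my_devlink_ops, sizeof(struct my_priv));
+ *	if (!dl)
+ *		return -ENOMEM;
+ *	err = devlink_register(dl, &pdev->dev);
+ *	if (err) {
+ *		devlink_free(dl);
+ *		return err;
+ *	}
+ *	...
+ *	devlink_unregister(dl);
+ *	devlink_free(dl);
+ */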
+
+/**
+ * devlink_port_register - Register devlink port
+ *
+ * @devlink: devlink
+ * @devlink_port: devlink port
+ * @port_index: driver-specific numerical identifier of the port
+ *
+ * Register a devlink port with the provided port index. The user can use
+ * any indexing, even a hw-related one. The devlink_port structure
+ * is convenient to embed inside the user driver private structure.
+ * Note that the caller should take care of zeroing the devlink_port
+ * structure.
+ */
+int devlink_port_register(struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ unsigned int port_index)
+{
+ mutex_lock(&devlink->lock);
+ if (devlink_port_index_exists(devlink, port_index)) {
+ mutex_unlock(&devlink->lock);
+ return -EEXIST;
+ }
+ devlink_port->devlink = devlink;
+ devlink_port->index = port_index;
+ devlink_port->registered = true;
+ list_add_tail(&devlink_port->list, &devlink->port_list);
+ mutex_unlock(&devlink->lock);
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_port_register);
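+
+/* Illustrative usage sketch: registering a port whose struct devlink_port is
+ * embedded in a zeroed driver structure, then announcing its netdevice.
+ * "my_port" and its members are assumptions used only for illustration.
+ *
+ *	err = devlink_port_register(dl, &my_port->devlink_port, port_index);
+ *	if (err)
+ *		return err;
+ *	devlink_port_type_eth_set(&my_port->devlink_port, my_port->netdev);
+ *	...
+ *	devlink_port_type_clear(&my_port->devlink_port);
+ *	devlink_port_unregister(&my_port->devlink_port);
+ */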
+
+/**
+ * devlink_port_unregister - Unregister devlink port
+ *
+ * @devlink_port: devlink port
+ */
+void devlink_port_unregister(struct devlink_port *devlink_port)
+{
+ struct devlink *devlink = devlink_port->devlink;
+
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
+ mutex_lock(&devlink->lock);
+ list_del(&devlink_port->list);
+ mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_port_unregister);
+
+static void __devlink_port_type_set(struct devlink_port *devlink_port,
+ enum devlink_port_type type,
+ void *type_dev)
+{
+ devlink_port->type = type;
+ devlink_port->type_dev = type_dev;
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+
+/**
+ * devlink_port_type_eth_set - Set port type to Ethernet
+ *
+ * @devlink_port: devlink port
+ * @netdev: related netdevice
+ */
+void devlink_port_type_eth_set(struct devlink_port *devlink_port,
+ struct net_device *netdev)
+{
+ return __devlink_port_type_set(devlink_port,
+ DEVLINK_PORT_TYPE_ETH, netdev);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_eth_set);
+
+/**
+ * devlink_port_type_ib_set - Set port type to InfiniBand
+ *
+ * @devlink_port: devlink port
+ * @ibdev: related IB device
+ */
+void devlink_port_type_ib_set(struct devlink_port *devlink_port,
+ struct ib_device *ibdev)
+{
+ return __devlink_port_type_set(devlink_port,
+ DEVLINK_PORT_TYPE_IB, ibdev);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
+
+/**
+ * devlink_port_type_clear - Clear port type
+ *
+ * @devlink_port: devlink port
+ */
+void devlink_port_type_clear(struct devlink_port *devlink_port)
+{
+ return __devlink_port_type_set(devlink_port,
+ DEVLINK_PORT_TYPE_NOTSET, NULL);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_clear);
+
+/**
+ * devlink_port_attrs_set - Set port attributes
+ *
+ * @devlink_port: devlink port
+ * @flavour: flavour of the port
+ * @port_number: number of the port that is facing the user, for example
+ *               the front panel port number
+ * @split: indicates if this is a split port
+ * @split_subport_number: if the port is split, this is the number
+ *                        of the subport.
+ */
+void devlink_port_attrs_set(struct devlink_port *devlink_port,
+ enum devlink_port_flavour flavour,
+ u32 port_number, bool split,
+ u32 split_subport_number)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+ attrs->set = true;
+ attrs->flavour = flavour;
+ attrs->port_number = port_number;
+ attrs->split = split;
+ attrs->split_subport_number = split_subport_number;
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
+
+int devlink_port_get_phys_port_name(struct devlink_port *devlink_port,
+ char *name, size_t len)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+ int n = 0;
+
+ if (!attrs->set)
+ return -EOPNOTSUPP;
+
+ switch (attrs->flavour) {
+ case DEVLINK_PORT_FLAVOUR_PHYSICAL:
+ if (!attrs->split)
+ n = snprintf(name, len, "p%u", attrs->port_number);
+ else
+ n = snprintf(name, len, "p%us%u", attrs->port_number,
+ attrs->split_subport_number);
+ break;
+ case DEVLINK_PORT_FLAVOUR_CPU:
+ case DEVLINK_PORT_FLAVOUR_DSA:
+		/* As CPU and DSA ports do not have a netdevice associated
+		 * with them, this case should never happen.
+		 */
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ if (n >= len)
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_port_get_phys_port_name);
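+
+/* For example, with DEVLINK_PORT_FLAVOUR_PHYSICAL attributes the generated
+ * names look like "p1" for port_number 1 (not split) and "p1s2" for
+ * port_number 1 with split_subport_number 2, matching the formats above.
+ */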
+
+int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
+ u32 size, u16 ingress_pools_count,
+ u16 egress_pools_count, u16 ingress_tc_count,
+ u16 egress_tc_count)
+{
+ struct devlink_sb *devlink_sb;
+ int err = 0;
+
+ mutex_lock(&devlink->lock);
+ if (devlink_sb_index_exists(devlink, sb_index)) {
+ err = -EEXIST;
+ goto unlock;
+ }
+
+ devlink_sb = kzalloc(sizeof(*devlink_sb), GFP_KERNEL);
+ if (!devlink_sb) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+ devlink_sb->index = sb_index;
+ devlink_sb->size = size;
+ devlink_sb->ingress_pools_count = ingress_pools_count;
+ devlink_sb->egress_pools_count = egress_pools_count;
+ devlink_sb->ingress_tc_count = ingress_tc_count;
+ devlink_sb->egress_tc_count = egress_tc_count;
+ list_add_tail(&devlink_sb->list, &devlink->sb_list);
+unlock:
+ mutex_unlock(&devlink->lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_sb_register);
+
+void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index)
+{
+ struct devlink_sb *devlink_sb;
+
+ mutex_lock(&devlink->lock);
+ devlink_sb = devlink_sb_get_by_index(devlink, sb_index);
+ WARN_ON(!devlink_sb);
+ list_del(&devlink_sb->list);
+ mutex_unlock(&devlink->lock);
+ kfree(devlink_sb);
+}
+EXPORT_SYMBOL_GPL(devlink_sb_unregister);
+
+/**
+ * devlink_dpipe_headers_register - register dpipe headers
+ *
+ * @devlink: devlink
+ * @dpipe_headers: dpipe header array
+ *
+ * Register the headers supported by hardware.
+ */
+int devlink_dpipe_headers_register(struct devlink *devlink,
+ struct devlink_dpipe_headers *dpipe_headers)
+{
+ mutex_lock(&devlink->lock);
+ devlink->dpipe_headers = dpipe_headers;
+ mutex_unlock(&devlink->lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register);
+
+/**
+ * devlink_dpipe_headers_unregister - unregister dpipe headers
+ *
+ * @devlink: devlink
+ *
+ * Unregister the headers supported by hardware.
+ */
+void devlink_dpipe_headers_unregister(struct devlink *devlink)
+{
+ mutex_lock(&devlink->lock);
+ devlink->dpipe_headers = NULL;
+ mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_headers_unregister);
+
+/**
+ * devlink_dpipe_table_counter_enabled - check if counter allocation
+ * required
+ * @devlink: devlink
+ * @table_name: table name
+ *
+ * Used by driver to check if counter allocation is required.
+ * After counter allocation is turned on the table entries
+ * are updated to include counter statistics.
+ *
+ * From that point on, the driver must respect the counter
+ * state so that each entry added to the table is added
+ * with a counter.
+ */
+bool devlink_dpipe_table_counter_enabled(struct devlink *devlink,
+ const char *table_name)
+{
+ struct devlink_dpipe_table *table;
+ bool enabled;
+
+ rcu_read_lock();
+ table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+ table_name);
+ enabled = false;
+ if (table)
+ enabled = table->counters_enabled;
+ rcu_read_unlock();
+ return enabled;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_table_counter_enabled);
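+
+/* Illustrative usage sketch: a driver would typically consult this helper
+ * while building a new table entry ("my_table" is a made-up table name).
+ *
+ *	bool with_counter;
+ *
+ *	with_counter = devlink_dpipe_table_counter_enabled(dl, "my_table");
+ *	... allocate and attach a hardware counter to the new entry only
+ *	    when with_counter is true ...
+ */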
+
+/**
+ * devlink_dpipe_table_register - register dpipe table
+ *
+ * @devlink: devlink
+ * @table_name: table name
+ * @table_ops: table ops
+ * @priv: driver private pointer passed back to the @table_ops callbacks
+ * @counter_control_extern: external control for counters
+ */
+int devlink_dpipe_table_register(struct devlink *devlink,
+ const char *table_name,
+ struct devlink_dpipe_table_ops *table_ops,
+ void *priv, bool counter_control_extern)
+{
+ struct devlink_dpipe_table *table;
+
+ if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name))
+ return -EEXIST;
+
+ if (WARN_ON(!table_ops->size_get))
+ return -EINVAL;
+
+ table = kzalloc(sizeof(*table), GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+
+ table->name = table_name;
+ table->table_ops = table_ops;
+ table->priv = priv;
+ table->counter_control_extern = counter_control_extern;
+
+ mutex_lock(&devlink->lock);
+ list_add_tail_rcu(&table->list, &devlink->dpipe_table_list);
+ mutex_unlock(&devlink->lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_table_register);
+
+/**
+ * devlink_dpipe_table_unregister - unregister dpipe table
+ *
+ * @devlink: devlink
+ * @table_name: table name
+ */
+void devlink_dpipe_table_unregister(struct devlink *devlink,
+ const char *table_name)
+{
+ struct devlink_dpipe_table *table;
+
+ mutex_lock(&devlink->lock);
+ table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+ table_name);
+ if (!table)
+ goto unlock;
+ list_del_rcu(&table->list);
+ mutex_unlock(&devlink->lock);
+ kfree_rcu(table, rcu);
+ return;
+unlock:
+ mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
+
+/**
+ * devlink_resource_register - devlink resource register
+ *
+ * @devlink: devlink
+ * @resource_name: resource's name
+ * @resource_size: resource's size
+ * @resource_id: resource's id
+ * @parent_resource_id: resource's parent id
+ * @size_params: size parameters
+ */
+int devlink_resource_register(struct devlink *devlink,
+ const char *resource_name,
+ u64 resource_size,
+ u64 resource_id,
+ u64 parent_resource_id,
+ const struct devlink_resource_size_params *size_params)
+{
+ struct devlink_resource *resource;
+ struct list_head *resource_list;
+ bool top_hierarchy;
+ int err = 0;
+
+ top_hierarchy = parent_resource_id == DEVLINK_RESOURCE_ID_PARENT_TOP;
+
+ mutex_lock(&devlink->lock);
+ resource = devlink_resource_find(devlink, NULL, resource_id);
+ if (resource) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ resource = kzalloc(sizeof(*resource), GFP_KERNEL);
+ if (!resource) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ if (top_hierarchy) {
+ resource_list = &devlink->resource_list;
+ } else {
+ struct devlink_resource *parent_resource;
+
+ parent_resource = devlink_resource_find(devlink, NULL,
+ parent_resource_id);
+ if (parent_resource) {
+ resource_list = &parent_resource->resource_list;
+ resource->parent = parent_resource;
+ } else {
+ kfree(resource);
+ err = -EINVAL;
+ goto out;
+ }
+ }
+
+ resource->name = resource_name;
+ resource->size = resource_size;
+ resource->size_new = resource_size;
+ resource->id = resource_id;
+ resource->size_valid = true;
+ memcpy(&resource->size_params, size_params,
+ sizeof(resource->size_params));
+ INIT_LIST_HEAD(&resource->resource_list);
+ list_add_tail(&resource->list, resource_list);
+out:
+ mutex_unlock(&devlink->lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_resource_register);
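+
+/* Illustrative usage sketch: a top-level resource and one child nested under
+ * it. The names, IDs and the pre-initialised size_params structures are
+ * made up for illustration.
+ *
+ *	err = devlink_resource_register(dl, "kvd", kvd_size, MY_RES_KVD,
+ *					DEVLINK_RESOURCE_ID_PARENT_TOP,
+ *					&kvd_size_params);
+ *	if (err)
+ *		return err;
+ *	err = devlink_resource_register(dl, "linear", linear_size,
+ *					MY_RES_KVD_LINEAR, MY_RES_KVD,
+ *					&linear_size_params);
+ */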
+
+/**
+ * devlink_resources_unregister - free all resources
+ *
+ * @devlink: devlink
+ * @resource: resource whose child resources should be freed,
+ *            or NULL to free all resources of the devlink instance
+ */
+void devlink_resources_unregister(struct devlink *devlink,
+ struct devlink_resource *resource)
+{
+ struct devlink_resource *tmp, *child_resource;
+ struct list_head *resource_list;
+
+ if (resource)
+ resource_list = &resource->resource_list;
+ else
+ resource_list = &devlink->resource_list;
+
+ if (!resource)
+ mutex_lock(&devlink->lock);
+
+ list_for_each_entry_safe(child_resource, tmp, resource_list, list) {
+ devlink_resources_unregister(devlink, child_resource);
+ list_del(&child_resource->list);
+ kfree(child_resource);
+ }
+
+ if (!resource)
+ mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_resources_unregister);
+
+/**
+ * devlink_resource_size_get - get and update size
+ *
+ * @devlink: devlink
+ * @resource_id: the requested resource id
+ * @p_resource_size: ptr to update
+ */
+int devlink_resource_size_get(struct devlink *devlink,
+ u64 resource_id,
+ u64 *p_resource_size)
+{
+ struct devlink_resource *resource;
+ int err = 0;
+
+ mutex_lock(&devlink->lock);
+ resource = devlink_resource_find(devlink, NULL, resource_id);
+ if (!resource) {
+ err = -EINVAL;
+ goto out;
+ }
+ *p_resource_size = resource->size_new;
+ resource->size = resource->size_new;
+out:
+ mutex_unlock(&devlink->lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_resource_size_get);
+
+/**
+ * devlink_dpipe_table_resource_set - set the resource id
+ *
+ * @devlink: devlink
+ * @table_name: table name
+ * @resource_id: resource id
+ * @resource_units: number of resource's units consumed per table's entry
+ */
+int devlink_dpipe_table_resource_set(struct devlink *devlink,
+ const char *table_name, u64 resource_id,
+ u64 resource_units)
+{
+ struct devlink_dpipe_table *table;
+ int err = 0;
+
+ mutex_lock(&devlink->lock);
+ table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+ table_name);
+ if (!table) {
+ err = -EINVAL;
+ goto out;
+ }
+ table->resource_id = resource_id;
+ table->resource_units = resource_units;
+ table->resource_valid = true;
+out:
+ mutex_unlock(&devlink->lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set);
+
+/**
+ * devlink_resource_occ_get_register - register occupancy getter
+ *
+ * @devlink: devlink
+ * @resource_id: resource id
+ * @occ_get: occupancy getter callback
+ * @occ_get_priv: occupancy getter callback priv
+ */
+void devlink_resource_occ_get_register(struct devlink *devlink,
+ u64 resource_id,
+ devlink_resource_occ_get_t *occ_get,
+ void *occ_get_priv)
+{
+ struct devlink_resource *resource;
+
+ mutex_lock(&devlink->lock);
+ resource = devlink_resource_find(devlink, NULL, resource_id);
+ if (WARN_ON(!resource))
+ goto out;
+ WARN_ON(resource->occ_get);
+
+ resource->occ_get = occ_get;
+ resource->occ_get_priv = occ_get_priv;
+out:
+ mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_resource_occ_get_register);
+
+/**
+ * devlink_resource_occ_get_unregister - unregister occupancy getter
+ *
+ * @devlink: devlink
+ * @resource_id: resource id
+ */
+void devlink_resource_occ_get_unregister(struct devlink *devlink,
+ u64 resource_id)
+{
+ struct devlink_resource *resource;
+
+ mutex_lock(&devlink->lock);
+ resource = devlink_resource_find(devlink, NULL, resource_id);
+ if (WARN_ON(!resource))
+ goto out;
+ WARN_ON(!resource->occ_get);
+
+ resource->occ_get = NULL;
+ resource->occ_get_priv = NULL;
+out:
+ mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister);
+
+/**
+ * devlink_params_register - register configuration parameters
+ *
+ * @devlink: devlink
+ * @params: configuration parameters array
+ * @params_count: number of parameters provided
+ *
+ * Register the configuration parameters supported by the driver.
+ */
+int devlink_params_register(struct devlink *devlink,
+ const struct devlink_param *params,
+ size_t params_count)
+{
+ const struct devlink_param *param = params;
+ int i;
+ int err;
+
+ mutex_lock(&devlink->lock);
+ for (i = 0; i < params_count; i++, param++) {
+ if (!param || !param->name || !param->supported_cmodes) {
+ err = -EINVAL;
+ goto rollback;
+ }
+ if (param->generic) {
+ err = devlink_param_generic_verify(param);
+ if (err)
+ goto rollback;
+ } else {
+ err = devlink_param_driver_verify(param);
+ if (err)
+ goto rollback;
+ }
+ err = devlink_param_register_one(devlink, param);
+ if (err)
+ goto rollback;
+ }
+
+ mutex_unlock(&devlink->lock);
+ return 0;
+
+rollback:
+ if (!i)
+ goto unlock;
+ for (param--; i > 0; i--, param--)
+ devlink_param_unregister_one(devlink, param);
+unlock:
+ mutex_unlock(&devlink->lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_params_register);
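+
+/* Illustrative usage sketch: registering a single generic parameter that is
+ * only configurable in driverinit mode (so no get/set callbacks, matching the
+ * WARN_ON() check in devlink_param_register_one()). The DEVLINK_PARAM_GENERIC()
+ * helper is assumed to come from include/net/devlink.h.
+ *
+ *	static const struct devlink_param my_params[] = {
+ *		DEVLINK_PARAM_GENERIC(ENABLE_SRIOV,
+ *				      BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
+ *				      NULL, NULL, NULL),
+ *	};
+ *
+ *	err = devlink_params_register(dl, my_params, ARRAY_SIZE(my_params));
+ */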
+
+/**
+ * devlink_params_unregister - unregister configuration parameters
+ * @devlink: devlink
+ * @params: configuration parameters to unregister
+ * @params_count: number of parameters provided
+ */
+void devlink_params_unregister(struct devlink *devlink,
+ const struct devlink_param *params,
+ size_t params_count)
+{
+ const struct devlink_param *param = params;
+ int i;
+
+ mutex_lock(&devlink->lock);
+ for (i = 0; i < params_count; i++, param++)
+ devlink_param_unregister_one(devlink, param);
+ mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_params_unregister);
+
+/**
+ * devlink_param_driverinit_value_get - get configuration parameter
+ *					value for driver initialization
+ *
+ * @devlink: devlink
+ * @param_id: parameter ID
+ * @init_val: value of parameter in driverinit configuration mode
+ *
+ * This function should be used by the driver to get driverinit
+ * configuration for initialization after reload command.
+ */
+int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id,
+ union devlink_param_value *init_val)
+{
+ struct devlink_param_item *param_item;
+
+ if (!devlink->ops || !devlink->ops->reload)
+ return -EOPNOTSUPP;
+
+ param_item = devlink_param_find_by_id(&devlink->param_list, param_id);
+ if (!param_item)
+ return -EINVAL;
+
+ if (!param_item->driverinit_value_valid ||
+ !devlink_param_cmode_is_supported(param_item->param,
+ DEVLINK_PARAM_CMODE_DRIVERINIT))
+ return -EOPNOTSUPP;
+
+ if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING)
+ strcpy(init_val->vstr, param_item->driverinit_value.vstr);
+ else
+ *init_val = param_item->driverinit_value;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get);
+
+/**
+ * devlink_param_driverinit_value_set - set value of configuration
+ * parameter for driverinit
+ * configuration mode
+ *
+ * @devlink: devlink
+ * @param_id: parameter ID
+ * @init_val: value of parameter to set for driverinit configuration mode
+ *
+ * This function should be used by the driver to set driverinit
+ * configuration mode default value.
+ */
+int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id,
+ union devlink_param_value init_val)
+{
+ struct devlink_param_item *param_item;
+
+ param_item = devlink_param_find_by_id(&devlink->param_list, param_id);
+ if (!param_item)
+ return -EINVAL;
+
+ if (!devlink_param_cmode_is_supported(param_item->param,
+ DEVLINK_PARAM_CMODE_DRIVERINIT))
+ return -EOPNOTSUPP;
+
+ if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING)
+ strcpy(param_item->driverinit_value.vstr, init_val.vstr);
+ else
+ param_item->driverinit_value = init_val;
+ param_item->driverinit_value_valid = true;
+
+ devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set);
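+
+/* Illustrative usage sketch: the driver publishes its default, and later reads
+ * back whatever value userspace may have requested before a reload. The
+ * parameter ID used here is an assumption for illustration.
+ *
+ *	union devlink_param_value val;
+ *
+ *	val.vbool = true;
+ *	devlink_param_driverinit_value_set(dl,
+ *			DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, val);
+ *	...
+ *	err = devlink_param_driverinit_value_get(dl,
+ *			DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, &val);
+ */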
+
+/**
+ * devlink_param_value_changed - notify devlink on a parameter's value
+ * change. Should be called by the driver
+ * right after the change.
+ *
+ * @devlink: devlink
+ * @param_id: parameter ID
+ *
+ * This function should be used by the driver to notify devlink on value
+ * change, excluding driverinit configuration mode.
+ * For driverinit configuration mode driver should use the function
+ * devlink_param_driverinit_value_set() instead.
+ */
+void devlink_param_value_changed(struct devlink *devlink, u32 param_id)
+{
+ struct devlink_param_item *param_item;
+
+ param_item = devlink_param_find_by_id(&devlink->param_list, param_id);
+ WARN_ON(!param_item);
+
+ devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
+}
+EXPORT_SYMBOL_GPL(devlink_param_value_changed);
+
+/**
+ * devlink_param_value_str_fill - Safely fill up the string, preventing
+ *				  overflow of the preallocated buffer
+ *
+ * @dst_val: destination devlink_param_value
+ * @src: source buffer
+ */
+void devlink_param_value_str_fill(union devlink_param_value *dst_val,
+ const char *src)
+{
+ size_t len;
+
+ len = strlcpy(dst_val->vstr, src, __DEVLINK_PARAM_MAX_STRING_VALUE);
+ WARN_ON(len >= __DEVLINK_PARAM_MAX_STRING_VALUE);
+}
+EXPORT_SYMBOL_GPL(devlink_param_value_str_fill);
+
+/**
+ * devlink_region_create - create a new address region
+ *
+ * @devlink: devlink
+ * @region_name: region name
+ * @region_max_snapshots: Maximum supported number of snapshots for region
+ * @region_size: size of region
+ */
+struct devlink_region *devlink_region_create(struct devlink *devlink,
+ const char *region_name,
+ u32 region_max_snapshots,
+ u64 region_size)
+{
+ struct devlink_region *region;
+ int err = 0;
+
+ mutex_lock(&devlink->lock);
+
+ if (devlink_region_get_by_name(devlink, region_name)) {
+ err = -EEXIST;
+ goto unlock;
+ }
+
+ region = kzalloc(sizeof(*region), GFP_KERNEL);
+ if (!region) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ region->devlink = devlink;
+ region->max_snapshots = region_max_snapshots;
+ region->name = region_name;
+ region->size = region_size;
+ INIT_LIST_HEAD(&region->snapshot_list);
+ list_add_tail(&region->list, &devlink->region_list);
+ devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW);
+
+ mutex_unlock(&devlink->lock);
+ return region;
+
+unlock:
+ mutex_unlock(&devlink->lock);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(devlink_region_create);
+
+/**
+ * devlink_region_destroy - destroy address region
+ *
+ * @region: devlink region to destroy
+ */
+void devlink_region_destroy(struct devlink_region *region)
+{
+ struct devlink *devlink = region->devlink;
+ struct devlink_snapshot *snapshot, *ts;
+
+ mutex_lock(&devlink->lock);
+
+ /* Free all snapshots of region */
+ list_for_each_entry_safe(snapshot, ts, &region->snapshot_list, list)
+ devlink_region_snapshot_del(snapshot);
+
+ list_del(&region->list);
+
+ devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL);
+ mutex_unlock(&devlink->lock);
+ kfree(region);
+}
+EXPORT_SYMBOL_GPL(devlink_region_destroy);
+
+/**
+ * devlink_region_shapshot_id_get - get snapshot ID
+ *
+ * This function should be called when adding a new snapshot.
+ * The driver should use the same id for multiple snapshots taken
+ * on multiple regions at the same time/by the same trigger.
+ *
+ * @devlink: devlink
+ */
+u32 devlink_region_shapshot_id_get(struct devlink *devlink)
+{
+ u32 id;
+
+ mutex_lock(&devlink->lock);
+ id = ++devlink->snapshot_id;
+ mutex_unlock(&devlink->lock);
+
+ return id;
+}
+EXPORT_SYMBOL_GPL(devlink_region_shapshot_id_get);
+
+/**
+ * devlink_region_snapshot_create - create a new snapshot
+ *
+ * This will add a new snapshot of a region. The snapshot
+ * will be stored on the region struct and can be accessed
+ * from devlink. This is useful for future analyses of snapshots.
+ * Multiple snapshots can be created on a region.
+ * The @snapshot_id should be obtained using the getter function.
+ *
+ * @region: devlink region of the snapshot
+ * @data_len: size of snapshot data
+ * @data: snapshot data
+ * @snapshot_id: snapshot id to be created
+ * @data_destructor: pointer to destructor function to free data
+ */
+int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len,
+ u8 *data, u32 snapshot_id,
+ devlink_snapshot_data_dest_t *data_destructor)
+{
+ struct devlink *devlink = region->devlink;
+ struct devlink_snapshot *snapshot;
+ int err;
+
+ mutex_lock(&devlink->lock);
+
+ /* check if region can hold one more snapshot */
+ if (region->cur_snapshots == region->max_snapshots) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ if (devlink_region_snapshot_get_by_id(region, snapshot_id)) {
+ err = -EEXIST;
+ goto unlock;
+ }
+
+ snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL);
+ if (!snapshot) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ snapshot->id = snapshot_id;
+ snapshot->region = region;
+ snapshot->data = data;
+ snapshot->data_len = data_len;
+ snapshot->data_destructor = data_destructor;
+
+ list_add_tail(&snapshot->list, &region->snapshot_list);
+
+ region->cur_snapshots++;
+
+ devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW);
+ mutex_unlock(&devlink->lock);
+ return 0;
+
+unlock:
+ mutex_unlock(&devlink->lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_region_snapshot_create);
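+
+/* Illustrative usage sketch: creating a region, then storing a snapshot of it.
+ * "cr-space", the data buffer and my_region_data_free() are made-up driver
+ * names used only for illustration.
+ *
+ *	region = devlink_region_create(dl, "cr-space", 8, cr_space_len);
+ *	if (IS_ERR(region))
+ *		return PTR_ERR(region);
+ *	...
+ *	id = devlink_region_shapshot_id_get(dl);
+ *	err = devlink_region_snapshot_create(region, cr_space_len, data,
+ *					     id, my_region_data_free);
+ */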
+
+static int __init devlink_module_init(void)
+{
+ return genl_register_family(&devlink_nl_family);
+}
+
+static void __exit devlink_module_exit(void)
+{
+ genl_unregister_family(&devlink_nl_family);
+}
+
+module_init(devlink_module_init);
+module_exit(devlink_module_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>");
+MODULE_DESCRIPTION("Network physical device Netlink interface");
+MODULE_ALIAS_GENL_FAMILY(DEVLINK_GENL_NAME);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
new file mode 100644
index 000000000..2ed600012
--- /dev/null
+++ b/net/core/drop_monitor.c
@@ -0,0 +1,472 @@
+/*
+ * Monitoring code for network dropped packet alerts
+ *
+ * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/string.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <linux/inet.h>
+#include <linux/interrupt.h>
+#include <linux/netpoll.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/netlink.h>
+#include <linux/net_dropmon.h>
+#include <linux/percpu.h>
+#include <linux/timer.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/genetlink.h>
+#include <net/netevent.h>
+
+#include <trace/events/skb.h>
+#include <trace/events/napi.h>
+
+#include <asm/unaligned.h>
+
+#define TRACE_ON 1
+#define TRACE_OFF 0
+
+/*
+ * Globals, our netlink socket pointer
+ * and the work handle that will send up
+ * netlink alerts
+ */
+static int trace_state = TRACE_OFF;
+static DEFINE_MUTEX(trace_state_mutex);
+
+struct per_cpu_dm_data {
+ spinlock_t lock;
+ struct sk_buff *skb;
+ struct work_struct dm_alert_work;
+ struct timer_list send_timer;
+};
+
+struct dm_hw_stat_delta {
+ struct net_device *dev;
+ unsigned long last_rx;
+ struct list_head list;
+ struct rcu_head rcu;
+ unsigned long last_drop_val;
+};
+
+static struct genl_family net_drop_monitor_family;
+
+static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
+
+static int dm_hit_limit = 64;
+static int dm_delay = 1;
+static unsigned long dm_hw_check_delta = 2*HZ;
+static LIST_HEAD(hw_stats_list);
+
+static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
+{
+ size_t al;
+ struct net_dm_alert_msg *msg;
+ struct nlattr *nla;
+ struct sk_buff *skb;
+ unsigned long flags;
+ void *msg_header;
+
+ al = sizeof(struct net_dm_alert_msg);
+ al += dm_hit_limit * sizeof(struct net_dm_drop_point);
+ al += sizeof(struct nlattr);
+
+ skb = genlmsg_new(al, GFP_KERNEL);
+
+ if (!skb)
+ goto err;
+
+ msg_header = genlmsg_put(skb, 0, 0, &net_drop_monitor_family,
+ 0, NET_DM_CMD_ALERT);
+ if (!msg_header) {
+ nlmsg_free(skb);
+ skb = NULL;
+ goto err;
+ }
+ nla = nla_reserve(skb, NLA_UNSPEC,
+ sizeof(struct net_dm_alert_msg));
+ if (!nla) {
+ nlmsg_free(skb);
+ skb = NULL;
+ goto err;
+ }
+ msg = nla_data(nla);
+ memset(msg, 0, al);
+ goto out;
+
+err:
+ mod_timer(&data->send_timer, jiffies + HZ / 10);
+out:
+ spin_lock_irqsave(&data->lock, flags);
+ swap(data->skb, skb);
+ spin_unlock_irqrestore(&data->lock, flags);
+
+ if (skb) {
+ struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;
+ struct genlmsghdr *gnlh = (struct genlmsghdr *)nlmsg_data(nlh);
+
+ genlmsg_end(skb, genlmsg_data(gnlh));
+ }
+
+ return skb;
+}
+
+static const struct genl_multicast_group dropmon_mcgrps[] = {
+ { .name = "events", },
+};
+
+static void send_dm_alert(struct work_struct *work)
+{
+ struct sk_buff *skb;
+ struct per_cpu_dm_data *data;
+
+ data = container_of(work, struct per_cpu_dm_data, dm_alert_work);
+
+ skb = reset_per_cpu_data(data);
+
+ if (skb)
+ genlmsg_multicast(&net_drop_monitor_family, skb, 0,
+ 0, GFP_KERNEL);
+}
+
+/*
+ * This is the timer function to delay the sending of an alert
+ * in the event that more drops will arrive during the
+ * hysteresis period.
+ */
+static void sched_send_work(struct timer_list *t)
+{
+ struct per_cpu_dm_data *data = from_timer(data, t, send_timer);
+
+ schedule_work(&data->dm_alert_work);
+}
+
+static void trace_drop_common(struct sk_buff *skb, void *location)
+{
+ struct net_dm_alert_msg *msg;
+ struct net_dm_drop_point *point;
+ struct nlmsghdr *nlh;
+ struct nlattr *nla;
+ int i;
+ struct sk_buff *dskb;
+ struct per_cpu_dm_data *data;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ data = this_cpu_ptr(&dm_cpu_data);
+ spin_lock(&data->lock);
+ dskb = data->skb;
+
+ if (!dskb)
+ goto out;
+
+ nlh = (struct nlmsghdr *)dskb->data;
+ nla = genlmsg_data(nlmsg_data(nlh));
+ msg = nla_data(nla);
+ point = msg->points;
+ for (i = 0; i < msg->entries; i++) {
+ if (!memcmp(&location, &point->pc, sizeof(void *))) {
+ point->count++;
+ goto out;
+ }
+ point++;
+ }
+ if (msg->entries == dm_hit_limit)
+ goto out;
+ /*
+ * We need to create a new entry
+ */
+ __nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point));
+ nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
+ memcpy(point->pc, &location, sizeof(void *));
+ point->count = 1;
+ msg->entries++;
+
+ if (!timer_pending(&data->send_timer)) {
+ data->send_timer.expires = jiffies + dm_delay * HZ;
+ add_timer(&data->send_timer);
+ }
+
+out:
+ spin_unlock_irqrestore(&data->lock, flags);
+}
+
+static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)
+{
+ trace_drop_common(skb, location);
+}
+
+static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi,
+ int work, int budget)
+{
+ struct dm_hw_stat_delta *new_stat;
+
+ /*
+ * Don't check napi structures with no associated device
+ */
+ if (!napi->dev)
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ struct net_device *dev;
+
+ /*
+ * only add a note to our monitor buffer if:
+ * 1) this is the dev we received on
+		 * 2) it's after the last_rx delta
+ * 3) our rx_dropped count has gone up
+ */
+ /* Paired with WRITE_ONCE() in dropmon_net_event() */
+ dev = READ_ONCE(new_stat->dev);
+ if ((dev == napi->dev) &&
+ (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) &&
+ (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
+ trace_drop_common(NULL, NULL);
+ new_stat->last_drop_val = napi->dev->stats.rx_dropped;
+ new_stat->last_rx = jiffies;
+ break;
+ }
+ }
+ rcu_read_unlock();
+}
+
+static int set_all_monitor_traces(int state)
+{
+ int rc = 0;
+ struct dm_hw_stat_delta *new_stat = NULL;
+ struct dm_hw_stat_delta *temp;
+
+ mutex_lock(&trace_state_mutex);
+
+ if (state == trace_state) {
+ rc = -EAGAIN;
+ goto out_unlock;
+ }
+
+ switch (state) {
+ case TRACE_ON:
+ if (!try_module_get(THIS_MODULE)) {
+ rc = -ENODEV;
+ break;
+ }
+
+ rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL);
+ rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL);
+ break;
+
+ case TRACE_OFF:
+ rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL);
+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL);
+
+ tracepoint_synchronize_unregister();
+
+ /*
+ * Clean the device list
+ */
+ list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
+ if (new_stat->dev == NULL) {
+ list_del_rcu(&new_stat->list);
+ kfree_rcu(new_stat, rcu);
+ }
+ }
+
+ module_put(THIS_MODULE);
+
+ break;
+ default:
+ rc = 1;
+ break;
+ }
+
+ if (!rc)
+ trace_state = state;
+ else
+ rc = -EINPROGRESS;
+
+out_unlock:
+ mutex_unlock(&trace_state_mutex);
+
+ return rc;
+}
+
+
+static int net_dm_cmd_config(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ return -ENOTSUPP;
+}
+
+static int net_dm_cmd_trace(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ switch (info->genlhdr->cmd) {
+ case NET_DM_CMD_START:
+ return set_all_monitor_traces(TRACE_ON);
+ case NET_DM_CMD_STOP:
+ return set_all_monitor_traces(TRACE_OFF);
+ }
+
+ return -ENOTSUPP;
+}
+
+static int dropmon_net_event(struct notifier_block *ev_block,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct dm_hw_stat_delta *new_stat = NULL;
+ struct dm_hw_stat_delta *tmp;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
+
+ if (!new_stat)
+ goto out;
+
+ new_stat->dev = dev;
+ new_stat->last_rx = jiffies;
+ mutex_lock(&trace_state_mutex);
+ list_add_rcu(&new_stat->list, &hw_stats_list);
+ mutex_unlock(&trace_state_mutex);
+ break;
+ case NETDEV_UNREGISTER:
+ mutex_lock(&trace_state_mutex);
+ list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
+ if (new_stat->dev == dev) {
+
+ /* Paired with READ_ONCE() in trace_napi_poll_hit() */
+ WRITE_ONCE(new_stat->dev, NULL);
+
+ if (trace_state == TRACE_OFF) {
+ list_del_rcu(&new_stat->list);
+ kfree_rcu(new_stat, rcu);
+ break;
+ }
+ }
+ }
+ mutex_unlock(&trace_state_mutex);
+ break;
+ }
+out:
+ return NOTIFY_DONE;
+}
+
+static const struct genl_ops dropmon_ops[] = {
+ {
+ .cmd = NET_DM_CMD_CONFIG,
+ .doit = net_dm_cmd_config,
+ },
+ {
+ .cmd = NET_DM_CMD_START,
+ .doit = net_dm_cmd_trace,
+ },
+ {
+ .cmd = NET_DM_CMD_STOP,
+ .doit = net_dm_cmd_trace,
+ },
+};
+
+static struct genl_family net_drop_monitor_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = "NET_DM",
+ .version = 2,
+ .module = THIS_MODULE,
+ .ops = dropmon_ops,
+ .n_ops = ARRAY_SIZE(dropmon_ops),
+ .mcgrps = dropmon_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(dropmon_mcgrps),
+};
+
+static struct notifier_block dropmon_net_notifier = {
+ .notifier_call = dropmon_net_event
+};
+
+static int __init init_net_drop_monitor(void)
+{
+ struct per_cpu_dm_data *data;
+ int cpu, rc;
+
+ pr_info("Initializing network drop monitor service\n");
+
+ if (sizeof(void *) > 8) {
+ pr_err("Unable to store program counters on this arch, Drop monitor failed\n");
+ return -ENOSPC;
+ }
+
+ rc = genl_register_family(&net_drop_monitor_family);
+ if (rc) {
+ pr_err("Could not create drop monitor netlink family\n");
+ return rc;
+ }
+ WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT);
+
+ rc = register_netdevice_notifier(&dropmon_net_notifier);
+ if (rc < 0) {
+ pr_crit("Failed to register netdevice notifier\n");
+ goto out_unreg;
+ }
+
+ rc = 0;
+
+ for_each_possible_cpu(cpu) {
+ data = &per_cpu(dm_cpu_data, cpu);
+ INIT_WORK(&data->dm_alert_work, send_dm_alert);
+ timer_setup(&data->send_timer, sched_send_work, 0);
+ spin_lock_init(&data->lock);
+ reset_per_cpu_data(data);
+ }
+
+
+ goto out;
+
+out_unreg:
+ genl_unregister_family(&net_drop_monitor_family);
+out:
+ return rc;
+}
+
+static void exit_net_drop_monitor(void)
+{
+ struct per_cpu_dm_data *data;
+ int cpu;
+
+ BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
+
+ /*
+ * Because of the module_get/put we do in the trace state change path
+	 * we are guaranteed not to have any current users when we get here;
+	 * all we need to do is make sure that we don't have any running timers
+	 * or pending schedule calls.
+ */
+
+ for_each_possible_cpu(cpu) {
+ data = &per_cpu(dm_cpu_data, cpu);
+ del_timer_sync(&data->send_timer);
+ cancel_work_sync(&data->dm_alert_work);
+ /*
+ * At this point, we should have exclusive access
+ * to this struct and can free the skb inside it
+ */
+ kfree_skb(data->skb);
+ }
+
+ BUG_ON(genl_unregister_family(&net_drop_monitor_family));
+}
+
+module_init(init_net_drop_monitor);
+module_exit(exit_net_drop_monitor);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>");
+MODULE_ALIAS_GENL_FAMILY("NET_DM");
diff --git a/net/core/dst.c b/net/core/dst.c
new file mode 100644
index 000000000..81ccf20e2
--- /dev/null
+++ b/net/core/dst.c
@@ -0,0 +1,344 @@
+/*
+ * net/core/dst.c Protocol independent destination cache.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/workqueue.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <net/net_namespace.h>
+#include <linux/sched.h>
+#include <linux/prefetch.h>
+#include <net/lwtunnel.h>
+#include <net/xfrm.h>
+
+#include <net/dst.h>
+#include <net/dst_metadata.h>
+
+/*
+ * Theory of operations:
+ * 1) We use a list, protected by a spinlock, to add
+ * new entries from both BH and non-BH context.
+ * 2) In order to keep spinlock held for a small delay,
+ * we use a second list where are stored long lived
+ * entries, that are handled by the garbage collect thread
+ * fired by a workqueue.
+ * 3) This list is guarded by a mutex,
+ * so that the gc_task and dst_dev_event() can be synchronized.
+ */
+
+/*
+ * We want to keep lock & list close together
+ * to dirty as few cache lines as possible in __dst_free().
+ * As this is not a very strong hint, we don't force an alignment on SMP.
+ */
+int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL(dst_discard_out);
+
+const struct dst_metrics dst_default_metrics = {
+	/* This initializer is needed to force the linker to place this variable
+	 * into the const section. Otherwise it might end up in the .bss section.
+ * We really want to avoid false sharing on this variable, and catch
+ * any writes on it.
+ */
+ .refcnt = REFCOUNT_INIT(1),
+};
+EXPORT_SYMBOL(dst_default_metrics);
+
+void dst_init(struct dst_entry *dst, struct dst_ops *ops,
+ struct net_device *dev, int initial_ref, int initial_obsolete,
+ unsigned short flags)
+{
+ dst->dev = dev;
+ if (dev)
+ dev_hold(dev);
+ dst->ops = ops;
+ dst_init_metrics(dst, dst_default_metrics.metrics, true);
+ dst->expires = 0UL;
+#ifdef CONFIG_XFRM
+ dst->xfrm = NULL;
+#endif
+ dst->input = dst_discard;
+ dst->output = dst_discard_out;
+ dst->error = 0;
+ dst->obsolete = initial_obsolete;
+ dst->header_len = 0;
+ dst->trailer_len = 0;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ dst->tclassid = 0;
+#endif
+ dst->lwtstate = NULL;
+ atomic_set(&dst->__refcnt, initial_ref);
+ dst->__use = 0;
+ dst->lastuse = jiffies;
+ dst->flags = flags;
+ if (!(flags & DST_NOCOUNT))
+ dst_entries_add(ops, 1);
+}
+EXPORT_SYMBOL(dst_init);
+
+void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
+ int initial_ref, int initial_obsolete, unsigned short flags)
+{
+ struct dst_entry *dst;
+
+ if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
+ if (ops->gc(ops))
+ return NULL;
+ }
+
+ dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
+ if (!dst)
+ return NULL;
+
+ dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags);
+
+ return dst;
+}
+EXPORT_SYMBOL(dst_alloc);
+
+struct dst_entry *dst_destroy(struct dst_entry * dst)
+{
+ struct dst_entry *child = NULL;
+
+ smp_rmb();
+
+#ifdef CONFIG_XFRM
+ if (dst->xfrm) {
+ struct xfrm_dst *xdst = (struct xfrm_dst *) dst;
+
+ child = xdst->child;
+ }
+#endif
+ if (!(dst->flags & DST_NOCOUNT))
+ dst_entries_add(dst->ops, -1);
+
+ if (dst->ops->destroy)
+ dst->ops->destroy(dst);
+ if (dst->dev)
+ dev_put(dst->dev);
+
+ lwtstate_put(dst->lwtstate);
+
+ if (dst->flags & DST_METADATA)
+ metadata_dst_free((struct metadata_dst *)dst);
+ else
+ kmem_cache_free(dst->ops->kmem_cachep, dst);
+
+ dst = child;
+ if (dst)
+ dst_release_immediate(dst);
+ return NULL;
+}
+EXPORT_SYMBOL(dst_destroy);
+
+static void dst_destroy_rcu(struct rcu_head *head)
+{
+ struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
+
+ dst = dst_destroy(dst);
+}
+
+/* Operations to mark dst as DEAD and clean up the net device referenced
+ * by dst:
+ * 1. put the dst under loopback interface and discard all tx/rx packets
+ * on this route.
+ * 2. release the net_device
+ * This function should be called when removing routes from the fib tree
+ * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to
+ * make the next dst_ops->check() fail.
+ */
+void dst_dev_put(struct dst_entry *dst)
+{
+ struct net_device *dev = dst->dev;
+
+ dst->obsolete = DST_OBSOLETE_DEAD;
+ if (dst->ops->ifdown)
+ dst->ops->ifdown(dst, dev, true);
+ dst->input = dst_discard;
+ dst->output = dst_discard_out;
+ dst->dev = dev_net(dst->dev)->loopback_dev;
+ dev_hold(dst->dev);
+ dev_put(dev);
+}
+EXPORT_SYMBOL(dst_dev_put);
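+
+/*
+ * Illustrative sketch (comment only): the typical teardown order when a
+ * route wrapping a dst is removed ahead of NETDEV_UNREGISTER.  "rt" is a
+ * placeholder for a protocol route structure embedding a dst_entry.
+ *
+ *	dst_dev_put(&rt->dst);	// mark DEAD, reparent onto loopback_dev
+ *	dst_release(&rt->dst);	// drop the owner's reference; the entry is
+ *				// freed after an RCU grace period
+ */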
+
+void dst_release(struct dst_entry *dst)
+{
+ if (dst) {
+ int newrefcnt;
+
+ newrefcnt = atomic_dec_return(&dst->__refcnt);
+ if (unlikely(newrefcnt < 0))
+ net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
+ __func__, dst, newrefcnt);
+ if (!newrefcnt)
+ call_rcu(&dst->rcu_head, dst_destroy_rcu);
+ }
+}
+EXPORT_SYMBOL(dst_release);
+
+void dst_release_immediate(struct dst_entry *dst)
+{
+ if (dst) {
+ int newrefcnt;
+
+ newrefcnt = atomic_dec_return(&dst->__refcnt);
+ if (unlikely(newrefcnt < 0))
+ net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
+ __func__, dst, newrefcnt);
+ if (!newrefcnt)
+ dst_destroy(dst);
+ }
+}
+EXPORT_SYMBOL(dst_release_immediate);
+
+u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
+{
+ struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC);
+
+ if (p) {
+ struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old);
+ unsigned long prev, new;
+
+ refcount_set(&p->refcnt, 1);
+ memcpy(p->metrics, old_p->metrics, sizeof(p->metrics));
+
+ new = (unsigned long) p;
+ prev = cmpxchg(&dst->_metrics, old, new);
+
+ if (prev != old) {
+ kfree(p);
+ p = (struct dst_metrics *)__DST_METRICS_PTR(prev);
+ if (prev & DST_METRICS_READ_ONLY)
+ p = NULL;
+ } else if (prev & DST_METRICS_REFCOUNTED) {
+ if (refcount_dec_and_test(&old_p->refcnt))
+ kfree(old_p);
+ }
+ }
+ BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0);
+ return (u32 *)p;
+}
+EXPORT_SYMBOL(dst_cow_metrics_generic);
+
+/* Caller asserts that dst_metrics_read_only(dst) is false. */
+void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
+{
+ unsigned long prev, new;
+
+ new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY;
+ prev = cmpxchg(&dst->_metrics, old, new);
+ if (prev == old)
+ kfree(__DST_METRICS_PTR(old));
+}
+EXPORT_SYMBOL(__dst_destroy_metrics_generic);
+
+static struct dst_ops md_dst_ops = {
+ .family = AF_UNSPEC,
+};
+
+static int dst_md_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ WARN_ONCE(1, "Attempting to call output on metadata dst\n");
+ kfree_skb(skb);
+ return 0;
+}
+
+static int dst_md_discard(struct sk_buff *skb)
+{
+ WARN_ONCE(1, "Attempting to call input on metadata dst\n");
+ kfree_skb(skb);
+ return 0;
+}
+
+static void __metadata_dst_init(struct metadata_dst *md_dst,
+				enum metadata_type type, u8 optslen)
+{
+ struct dst_entry *dst;
+
+ dst = &md_dst->dst;
+ dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE,
+ DST_METADATA | DST_NOCOUNT);
+
+ dst->input = dst_md_discard;
+ dst->output = dst_md_discard_out;
+
+ memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
+ md_dst->type = type;
+}
+
+struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
+ gfp_t flags)
+{
+ struct metadata_dst *md_dst;
+
+ md_dst = kmalloc(sizeof(*md_dst) + optslen, flags);
+ if (!md_dst)
+ return NULL;
+
+ __metadata_dst_init(md_dst, type, optslen);
+
+ return md_dst;
+}
+EXPORT_SYMBOL_GPL(metadata_dst_alloc);
+
+void metadata_dst_free(struct metadata_dst *md_dst)
+{
+#ifdef CONFIG_DST_CACHE
+ if (md_dst->type == METADATA_IP_TUNNEL)
+ dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
+#endif
+ kfree(md_dst);
+}
+EXPORT_SYMBOL_GPL(metadata_dst_free);
+
+struct metadata_dst __percpu *
+metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
+{
+ int cpu;
+ struct metadata_dst __percpu *md_dst;
+
+ md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen,
+ __alignof__(struct metadata_dst), flags);
+ if (!md_dst)
+ return NULL;
+
+ for_each_possible_cpu(cpu)
+ __metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen);
+
+ return md_dst;
+}
+EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);
+
+void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst)
+{
+#ifdef CONFIG_DST_CACHE
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu);
+
+ if (one_md_dst->type == METADATA_IP_TUNNEL)
+ dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache);
+ }
+#endif
+ free_percpu(md_dst);
+}
+EXPORT_SYMBOL_GPL(metadata_dst_free_percpu);
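+
+/*
+ * Illustrative sketch (comment only): allocating and freeing a metadata
+ * dst, e.g. for a collect-metadata tunnel path.  The option length and
+ * GFP flags are only an example.
+ *
+ *	struct metadata_dst *md;
+ *
+ *	md = metadata_dst_alloc(0, METADATA_IP_TUNNEL, GFP_KERNEL);
+ *	if (!md)
+ *		return -ENOMEM;
+ *	...fill md->u.tun_info and attach the dst to an skb...
+ *	metadata_dst_free(md);
+ */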
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
new file mode 100644
index 000000000..64cef9774
--- /dev/null
+++ b/net/core/dst_cache.c
@@ -0,0 +1,168 @@
+/*
+ * net/core/dst_cache.c - dst entry cache
+ *
+ * Copyright (c) 2016 Paolo Abeni <pabeni@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <net/dst_cache.h>
+#include <net/route.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ip6_fib.h>
+#endif
+#include <uapi/linux/in.h>
+
+struct dst_cache_pcpu {
+ unsigned long refresh_ts;
+ struct dst_entry *dst;
+ u32 cookie;
+ union {
+ struct in_addr in_saddr;
+ struct in6_addr in6_saddr;
+ };
+};
+
+static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache,
+ struct dst_entry *dst, u32 cookie)
+{
+ dst_release(dst_cache->dst);
+ if (dst)
+ dst_hold(dst);
+
+ dst_cache->cookie = cookie;
+ dst_cache->dst = dst;
+}
+
+static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,
+ struct dst_cache_pcpu *idst)
+{
+ struct dst_entry *dst;
+
+ dst = idst->dst;
+ if (!dst)
+ goto fail;
+
+	/* the cache already holds a dst reference; it can't go away */
+ dst_hold(dst);
+
+ if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) ||
+ (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) {
+ dst_cache_per_cpu_dst_set(idst, NULL, 0);
+ dst_release(dst);
+ goto fail;
+ }
+ return dst;
+
+fail:
+ idst->refresh_ts = jiffies;
+ return NULL;
+}
+
+struct dst_entry *dst_cache_get(struct dst_cache *dst_cache)
+{
+ if (!dst_cache->cache)
+ return NULL;
+
+ return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache));
+}
+EXPORT_SYMBOL_GPL(dst_cache_get);
+
+struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr)
+{
+ struct dst_cache_pcpu *idst;
+ struct dst_entry *dst;
+
+ if (!dst_cache->cache)
+ return NULL;
+
+ idst = this_cpu_ptr(dst_cache->cache);
+ dst = dst_cache_per_cpu_get(dst_cache, idst);
+ if (!dst)
+ return NULL;
+
+ *saddr = idst->in_saddr.s_addr;
+ return container_of(dst, struct rtable, dst);
+}
+EXPORT_SYMBOL_GPL(dst_cache_get_ip4);
+
+void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst,
+ __be32 saddr)
+{
+ struct dst_cache_pcpu *idst;
+
+ if (!dst_cache->cache)
+ return;
+
+ idst = this_cpu_ptr(dst_cache->cache);
+ dst_cache_per_cpu_dst_set(idst, dst, 0);
+ idst->in_saddr.s_addr = saddr;
+}
+EXPORT_SYMBOL_GPL(dst_cache_set_ip4);
+
+#if IS_ENABLED(CONFIG_IPV6)
+void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
+ const struct in6_addr *saddr)
+{
+ struct dst_cache_pcpu *idst;
+
+ if (!dst_cache->cache)
+ return;
+
+	idst = this_cpu_ptr(dst_cache->cache);
+	dst_cache_per_cpu_dst_set(idst, dst,
+				  rt6_get_cookie((struct rt6_info *)dst));
+ idst->in6_saddr = *saddr;
+}
+EXPORT_SYMBOL_GPL(dst_cache_set_ip6);
+
+struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache,
+ struct in6_addr *saddr)
+{
+ struct dst_cache_pcpu *idst;
+ struct dst_entry *dst;
+
+ if (!dst_cache->cache)
+ return NULL;
+
+ idst = this_cpu_ptr(dst_cache->cache);
+ dst = dst_cache_per_cpu_get(dst_cache, idst);
+ if (!dst)
+ return NULL;
+
+ *saddr = idst->in6_saddr;
+ return dst;
+}
+EXPORT_SYMBOL_GPL(dst_cache_get_ip6);
+#endif
+
+int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp)
+{
+ dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu,
+ gfp | __GFP_ZERO);
+ if (!dst_cache->cache)
+ return -ENOMEM;
+
+ dst_cache_reset(dst_cache);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dst_cache_init);
+
+void dst_cache_destroy(struct dst_cache *dst_cache)
+{
+ int i;
+
+ if (!dst_cache->cache)
+ return;
+
+ for_each_possible_cpu(i)
+ dst_release(per_cpu_ptr(dst_cache->cache, i)->dst);
+
+ free_percpu(dst_cache->cache);
+}
+EXPORT_SYMBOL_GPL(dst_cache_destroy);
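+
+/*
+ * Illustrative sketch (comment only): typical per-tunnel use of the cache
+ * above.  "t->dst_cache" stands for a dst_cache embedded in some tunnel
+ * state; the actual route lookup is elided.
+ *
+ *	if (dst_cache_init(&t->dst_cache, GFP_KERNEL))
+ *		return -ENOMEM;
+ *
+ *	// per-packet fast path
+ *	rt = dst_cache_get_ip4(&t->dst_cache, &saddr);
+ *	if (!rt) {
+ *		rt = ...full FIB lookup...;
+ *		dst_cache_set_ip4(&t->dst_cache, &rt->dst, saddr);
+ *	}
+ *
+ *	// teardown
+ *	dst_cache_destroy(&t->dst_cache);
+ */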
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
new file mode 100644
index 000000000..4db9512fe
--- /dev/null
+++ b/net/core/ethtool.c
@@ -0,0 +1,2916 @@
+/*
+ * net/core/ethtool.c - Ethtool ioctl handler
+ * Copyright (c) 2003 Matthew Wilcox <matthew@wil.cx>
+ *
+ * This file is where we call all the ethtool_ops commands to get
+ * the information ethtool needs.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/net_tstamp.h>
+#include <linux/phy.h>
+#include <linux/bitops.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/sfp.h>
+#include <linux/slab.h>
+#include <linux/rtnetlink.h>
+#include <linux/sched/signal.h>
+#include <linux/net.h>
+
+/*
+ * Some useful ethtool_ops methods that are device independent.
+ * If we find that all drivers want to do the same thing here,
+ * we can turn these into dev_() function calls.
+ */
+
+u32 ethtool_op_get_link(struct net_device *dev)
+{
+ return netif_carrier_ok(dev) ? 1 : 0;
+}
+EXPORT_SYMBOL(ethtool_op_get_link);
+
+int ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info)
+{
+ info->so_timestamping =
+ SOF_TIMESTAMPING_TX_SOFTWARE |
+ SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+ info->phc_index = -1;
+ return 0;
+}
+EXPORT_SYMBOL(ethtool_op_get_ts_info);
+
+/* Handlers for each ethtool command */
+
+#define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32)
+
+static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
+ [NETIF_F_SG_BIT] = "tx-scatter-gather",
+ [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4",
+ [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic",
+ [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6",
+ [NETIF_F_HIGHDMA_BIT] = "highdma",
+ [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist",
+ [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-hw-insert",
+
+ [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-hw-parse",
+ [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter",
+ [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert",
+ [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse",
+ [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter",
+ [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged",
+ [NETIF_F_GSO_BIT] = "tx-generic-segmentation",
+ [NETIF_F_LLTX_BIT] = "tx-lockless",
+ [NETIF_F_NETNS_LOCAL_BIT] = "netns-local",
+ [NETIF_F_GRO_BIT] = "rx-gro",
+ [NETIF_F_GRO_HW_BIT] = "rx-gro-hw",
+ [NETIF_F_LRO_BIT] = "rx-lro",
+
+ [NETIF_F_TSO_BIT] = "tx-tcp-segmentation",
+ [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust",
+ [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation",
+ [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation",
+ [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation",
+ [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation",
+ [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation",
+ [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation",
+ [NETIF_F_GSO_IPXIP4_BIT] = "tx-ipxip4-segmentation",
+ [NETIF_F_GSO_IPXIP6_BIT] = "tx-ipxip6-segmentation",
+ [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation",
+ [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
+ [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial",
+ [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation",
+ [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation",
+ [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation",
+
+ [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
+ [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp",
+ [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu",
+ [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter",
+ [NETIF_F_RXHASH_BIT] = "rx-hashing",
+ [NETIF_F_RXCSUM_BIT] = "rx-checksum",
+ [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy",
+ [NETIF_F_LOOPBACK_BIT] = "loopback",
+ [NETIF_F_RXFCS_BIT] = "rx-fcs",
+ [NETIF_F_RXALL_BIT] = "rx-all",
+ [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
+ [NETIF_F_HW_TC_BIT] = "hw-tc-offload",
+ [NETIF_F_HW_ESP_BIT] = "esp-hw-offload",
+ [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload",
+ [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload",
+ [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record",
+ [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload",
+ [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
+};
+
+static const char
+rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = {
+ [ETH_RSS_HASH_TOP_BIT] = "toeplitz",
+ [ETH_RSS_HASH_XOR_BIT] = "xor",
+ [ETH_RSS_HASH_CRC32_BIT] = "crc32",
+};
+
+static const char
+tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_ID_UNSPEC] = "Unspec",
+ [ETHTOOL_RX_COPYBREAK] = "rx-copybreak",
+ [ETHTOOL_TX_COPYBREAK] = "tx-copybreak",
+ [ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout",
+};
+
+static const char
+phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_ID_UNSPEC] = "Unspec",
+ [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift",
+};
+
+static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_gfeatures cmd = {
+ .cmd = ETHTOOL_GFEATURES,
+ .size = ETHTOOL_DEV_FEATURE_WORDS,
+ };
+ struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
+ u32 __user *sizeaddr;
+ u32 copy_size;
+ int i;
+
+ /* in case feature bits run out again */
+ BUILD_BUG_ON(ETHTOOL_DEV_FEATURE_WORDS * sizeof(u32) > sizeof(netdev_features_t));
+
+ for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) {
+ features[i].available = (u32)(dev->hw_features >> (32 * i));
+ features[i].requested = (u32)(dev->wanted_features >> (32 * i));
+ features[i].active = (u32)(dev->features >> (32 * i));
+ features[i].never_changed =
+ (u32)(NETIF_F_NEVER_CHANGE >> (32 * i));
+ }
+
+ sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size);
+ if (get_user(copy_size, sizeaddr))
+ return -EFAULT;
+
+ if (copy_size > ETHTOOL_DEV_FEATURE_WORDS)
+ copy_size = ETHTOOL_DEV_FEATURE_WORDS;
+
+ if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
+ return -EFAULT;
+ useraddr += sizeof(cmd);
+ if (copy_to_user(useraddr, features, copy_size * sizeof(*features)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_sfeatures cmd;
+ struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
+ netdev_features_t wanted = 0, valid = 0;
+ int i, ret = 0;
+
+ if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
+ return -EFAULT;
+ useraddr += sizeof(cmd);
+
+ if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS)
+ return -EINVAL;
+
+ if (copy_from_user(features, useraddr, sizeof(features)))
+ return -EFAULT;
+
+ for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) {
+ valid |= (netdev_features_t)features[i].valid << (32 * i);
+ wanted |= (netdev_features_t)features[i].requested << (32 * i);
+ }
+
+ if (valid & ~NETIF_F_ETHTOOL_BITS)
+ return -EINVAL;
+
+ if (valid & ~dev->hw_features) {
+ valid &= dev->hw_features;
+ ret |= ETHTOOL_F_UNSUPPORTED;
+ }
+
+ dev->wanted_features &= ~valid;
+ dev->wanted_features |= wanted & valid;
+ __netdev_update_features(dev);
+
+ if ((dev->wanted_features ^ dev->features) & valid)
+ ret |= ETHTOOL_F_WISH;
+
+ return ret;
+}
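+
+/*
+ * Illustrative sketch (comment only): the 32-bit word layout consumed by
+ * ethtool_set_features() above.  Feature bit N of netdev_features_t lands
+ * in block N / 32, bit N % 32 of the user-supplied array; NETIF_F_GRO_BIT
+ * is only an example and user space works with its own copy of the bit
+ * numbers.
+ *
+ *	struct ethtool_set_features_block blocks[ETHTOOL_DEV_FEATURE_WORDS] = {};
+ *
+ *	blocks[NETIF_F_GRO_BIT / 32].valid     |= 1U << (NETIF_F_GRO_BIT % 32);
+ *	blocks[NETIF_F_GRO_BIT / 32].requested |= 1U << (NETIF_F_GRO_BIT % 32);
+ *	// the blocks follow a struct ethtool_sfeatures header carrying
+ *	// .cmd = ETHTOOL_SFEATURES and .size = ETHTOOL_DEV_FEATURE_WORDS
+ */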
+
+static int __ethtool_get_sset_count(struct net_device *dev, int sset)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (sset == ETH_SS_FEATURES)
+ return ARRAY_SIZE(netdev_features_strings);
+
+ if (sset == ETH_SS_RSS_HASH_FUNCS)
+ return ARRAY_SIZE(rss_hash_func_strings);
+
+ if (sset == ETH_SS_TUNABLES)
+ return ARRAY_SIZE(tunable_strings);
+
+ if (sset == ETH_SS_PHY_TUNABLES)
+ return ARRAY_SIZE(phy_tunable_strings);
+
+ if (sset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats)
+ return phy_ethtool_get_sset_count(dev->phydev);
+
+ if (ops->get_sset_count && ops->get_strings)
+ return ops->get_sset_count(dev, sset);
+ else
+ return -EOPNOTSUPP;
+}
+
+static void __ethtool_get_strings(struct net_device *dev,
+ u32 stringset, u8 *data)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (stringset == ETH_SS_FEATURES)
+ memcpy(data, netdev_features_strings,
+ sizeof(netdev_features_strings));
+ else if (stringset == ETH_SS_RSS_HASH_FUNCS)
+ memcpy(data, rss_hash_func_strings,
+ sizeof(rss_hash_func_strings));
+ else if (stringset == ETH_SS_TUNABLES)
+ memcpy(data, tunable_strings, sizeof(tunable_strings));
+ else if (stringset == ETH_SS_PHY_TUNABLES)
+ memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings));
+ else if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats)
+ phy_ethtool_get_strings(dev->phydev, data);
+ else
+ /* ops->get_strings is valid because checked earlier */
+ ops->get_strings(dev, stringset, data);
+}
+
+static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd)
+{
+ /* feature masks of legacy discrete ethtool ops */
+
+ switch (eth_cmd) {
+ case ETHTOOL_GTXCSUM:
+ case ETHTOOL_STXCSUM:
+ return NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC;
+ case ETHTOOL_GRXCSUM:
+ case ETHTOOL_SRXCSUM:
+ return NETIF_F_RXCSUM;
+ case ETHTOOL_GSG:
+ case ETHTOOL_SSG:
+ return NETIF_F_SG;
+ case ETHTOOL_GTSO:
+ case ETHTOOL_STSO:
+ return NETIF_F_ALL_TSO;
+ case ETHTOOL_GGSO:
+ case ETHTOOL_SGSO:
+ return NETIF_F_GSO;
+ case ETHTOOL_GGRO:
+ case ETHTOOL_SGRO:
+ return NETIF_F_GRO;
+ default:
+ BUG();
+ }
+}
+
+static int ethtool_get_one_feature(struct net_device *dev,
+ char __user *useraddr, u32 ethcmd)
+{
+ netdev_features_t mask = ethtool_get_feature_mask(ethcmd);
+ struct ethtool_value edata = {
+ .cmd = ethcmd,
+ .data = !!(dev->features & mask),
+ };
+
+ if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_set_one_feature(struct net_device *dev,
+ void __user *useraddr, u32 ethcmd)
+{
+ struct ethtool_value edata;
+ netdev_features_t mask;
+
+ if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ return -EFAULT;
+
+ mask = ethtool_get_feature_mask(ethcmd);
+ mask &= dev->hw_features;
+ if (!mask)
+ return -EOPNOTSUPP;
+
+ if (edata.data)
+ dev->wanted_features |= mask;
+ else
+ dev->wanted_features &= ~mask;
+
+ __netdev_update_features(dev);
+
+ return 0;
+}
+
+#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \
+ ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH)
+#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \
+ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \
+ NETIF_F_RXHASH)
+
+static u32 __ethtool_get_flags(struct net_device *dev)
+{
+ u32 flags = 0;
+
+ if (dev->features & NETIF_F_LRO)
+ flags |= ETH_FLAG_LRO;
+ if (dev->features & NETIF_F_HW_VLAN_CTAG_RX)
+ flags |= ETH_FLAG_RXVLAN;
+ if (dev->features & NETIF_F_HW_VLAN_CTAG_TX)
+ flags |= ETH_FLAG_TXVLAN;
+ if (dev->features & NETIF_F_NTUPLE)
+ flags |= ETH_FLAG_NTUPLE;
+ if (dev->features & NETIF_F_RXHASH)
+ flags |= ETH_FLAG_RXHASH;
+
+ return flags;
+}
+
+static int __ethtool_set_flags(struct net_device *dev, u32 data)
+{
+ netdev_features_t features = 0, changed;
+
+ if (data & ~ETH_ALL_FLAGS)
+ return -EINVAL;
+
+ if (data & ETH_FLAG_LRO)
+ features |= NETIF_F_LRO;
+ if (data & ETH_FLAG_RXVLAN)
+ features |= NETIF_F_HW_VLAN_CTAG_RX;
+ if (data & ETH_FLAG_TXVLAN)
+ features |= NETIF_F_HW_VLAN_CTAG_TX;
+ if (data & ETH_FLAG_NTUPLE)
+ features |= NETIF_F_NTUPLE;
+ if (data & ETH_FLAG_RXHASH)
+ features |= NETIF_F_RXHASH;
+
+ /* allow changing only bits set in hw_features */
+ changed = (features ^ dev->features) & ETH_ALL_FEATURES;
+ if (changed & ~dev->hw_features)
+ return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP;
+
+ dev->wanted_features =
+ (dev->wanted_features & ~changed) | (features & changed);
+
+ __netdev_update_features(dev);
+
+ return 0;
+}
+
+/* Given two link masks, AND them together and save the result in dst. */
+void ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst,
+ struct ethtool_link_ksettings *src)
+{
+ unsigned int size = BITS_TO_LONGS(__ETHTOOL_LINK_MODE_MASK_NBITS);
+ unsigned int idx = 0;
+
+ for (; idx < size; idx++) {
+ dst->link_modes.supported[idx] &=
+ src->link_modes.supported[idx];
+ dst->link_modes.advertising[idx] &=
+ src->link_modes.advertising[idx];
+ }
+}
+EXPORT_SYMBOL(ethtool_intersect_link_masks);
+
+void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst,
+ u32 legacy_u32)
+{
+ bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS);
+ dst[0] = legacy_u32;
+}
+EXPORT_SYMBOL(ethtool_convert_legacy_u32_to_link_mode);
+
+/* return false if src had higher bits set. lower bits always updated. */
+bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
+ const unsigned long *src)
+{
+ bool retval = true;
+
+ /* TODO: following test will soon always be true */
+ if (__ETHTOOL_LINK_MODE_MASK_NBITS > 32) {
+ __ETHTOOL_DECLARE_LINK_MODE_MASK(ext);
+
+ bitmap_zero(ext, __ETHTOOL_LINK_MODE_MASK_NBITS);
+ bitmap_fill(ext, 32);
+ bitmap_complement(ext, ext, __ETHTOOL_LINK_MODE_MASK_NBITS);
+ if (bitmap_intersects(ext, src,
+ __ETHTOOL_LINK_MODE_MASK_NBITS)) {
+ /* src mask goes beyond bit 31 */
+ retval = false;
+ }
+ }
+ *legacy_u32 = src[0];
+ return retval;
+}
+EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32);
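+
+/*
+ * Illustrative sketch (comment only): round-tripping a legacy advertising
+ * mask through the two conversion helpers above.
+ *
+ *	__ETHTOOL_DECLARE_LINK_MODE_MASK(modes);
+ *	u32 legacy = ADVERTISED_1000baseT_Full | ADVERTISED_Autoneg;
+ *	u32 back;
+ *
+ *	ethtool_convert_legacy_u32_to_link_mode(modes, legacy);
+ *	ethtool_convert_link_mode_to_legacy_u32(&back, modes);
+ *	// back == legacy; the second call would return false only if
+ *	// "modes" carried bits above 31
+ */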
+
+/* return false if legacy contained non-0 deprecated fields
+ * maxtxpkt/maxrxpkt. rest of ksettings always updated
+ */
+static bool
+convert_legacy_settings_to_link_ksettings(
+ struct ethtool_link_ksettings *link_ksettings,
+ const struct ethtool_cmd *legacy_settings)
+{
+ bool retval = true;
+
+ memset(link_ksettings, 0, sizeof(*link_ksettings));
+
+	/* This is used to tell users that the driver is still using these
+ * deprecated legacy fields, and they should not use
+ * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS
+ */
+ if (legacy_settings->maxtxpkt ||
+ legacy_settings->maxrxpkt)
+ retval = false;
+
+ ethtool_convert_legacy_u32_to_link_mode(
+ link_ksettings->link_modes.supported,
+ legacy_settings->supported);
+ ethtool_convert_legacy_u32_to_link_mode(
+ link_ksettings->link_modes.advertising,
+ legacy_settings->advertising);
+ ethtool_convert_legacy_u32_to_link_mode(
+ link_ksettings->link_modes.lp_advertising,
+ legacy_settings->lp_advertising);
+ link_ksettings->base.speed
+ = ethtool_cmd_speed(legacy_settings);
+ link_ksettings->base.duplex
+ = legacy_settings->duplex;
+ link_ksettings->base.port
+ = legacy_settings->port;
+ link_ksettings->base.phy_address
+ = legacy_settings->phy_address;
+ link_ksettings->base.autoneg
+ = legacy_settings->autoneg;
+ link_ksettings->base.mdio_support
+ = legacy_settings->mdio_support;
+ link_ksettings->base.eth_tp_mdix
+ = legacy_settings->eth_tp_mdix;
+ link_ksettings->base.eth_tp_mdix_ctrl
+ = legacy_settings->eth_tp_mdix_ctrl;
+ return retval;
+}
+
+/* return false if ksettings link modes had higher bits
+ * set. legacy_settings always updated (best effort)
+ */
+static bool
+convert_link_ksettings_to_legacy_settings(
+ struct ethtool_cmd *legacy_settings,
+ const struct ethtool_link_ksettings *link_ksettings)
+{
+ bool retval = true;
+
+ memset(legacy_settings, 0, sizeof(*legacy_settings));
+ /* this also clears the deprecated fields in legacy structure:
+ * __u8 transceiver;
+ * __u32 maxtxpkt;
+ * __u32 maxrxpkt;
+ */
+
+ retval &= ethtool_convert_link_mode_to_legacy_u32(
+ &legacy_settings->supported,
+ link_ksettings->link_modes.supported);
+ retval &= ethtool_convert_link_mode_to_legacy_u32(
+ &legacy_settings->advertising,
+ link_ksettings->link_modes.advertising);
+ retval &= ethtool_convert_link_mode_to_legacy_u32(
+ &legacy_settings->lp_advertising,
+ link_ksettings->link_modes.lp_advertising);
+ ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed);
+ legacy_settings->duplex
+ = link_ksettings->base.duplex;
+ legacy_settings->port
+ = link_ksettings->base.port;
+ legacy_settings->phy_address
+ = link_ksettings->base.phy_address;
+ legacy_settings->autoneg
+ = link_ksettings->base.autoneg;
+ legacy_settings->mdio_support
+ = link_ksettings->base.mdio_support;
+ legacy_settings->eth_tp_mdix
+ = link_ksettings->base.eth_tp_mdix;
+ legacy_settings->eth_tp_mdix_ctrl
+ = link_ksettings->base.eth_tp_mdix_ctrl;
+ legacy_settings->transceiver
+ = link_ksettings->base.transceiver;
+ return retval;
+}
+
+/* number of 32-bit words to store the user's link mode bitmaps */
+#define __ETHTOOL_LINK_MODE_MASK_NU32 \
+ DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32)
+
+/* layout of the struct passed from/to userland */
+struct ethtool_link_usettings {
+ struct ethtool_link_settings base;
+ struct {
+ __u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32];
+ __u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32];
+ __u32 lp_advertising[__ETHTOOL_LINK_MODE_MASK_NU32];
+ } link_modes;
+};
+
+/* Internal kernel helper to query a device ethtool_link_settings.
+ *
+ * Backward compatibility note: for compatibility with legacy drivers
+ * that implement only the ethtool_cmd API, this has to work with both
+ * drivers implementing get_link_ksettings API and drivers
+ * implementing get_settings API. When drivers implement get_settings
+ * and report ethtool_cmd deprecated fields
+ * (transceiver/maxrxpkt/maxtxpkt), these fields are silently ignored
+ * because the resulting struct ethtool_link_settings does not report them.
+ */
+int __ethtool_get_link_ksettings(struct net_device *dev,
+ struct ethtool_link_ksettings *link_ksettings)
+{
+ int err;
+ struct ethtool_cmd cmd;
+
+ ASSERT_RTNL();
+
+ if (dev->ethtool_ops->get_link_ksettings) {
+ memset(link_ksettings, 0, sizeof(*link_ksettings));
+ return dev->ethtool_ops->get_link_ksettings(dev,
+ link_ksettings);
+ }
+
+ /* driver doesn't support %ethtool_link_ksettings API. revert to
+ * legacy %ethtool_cmd API, unless it's not supported either.
+ * TODO: remove when ethtool_ops::get_settings disappears internally
+ */
+ if (!dev->ethtool_ops->get_settings)
+ return -EOPNOTSUPP;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.cmd = ETHTOOL_GSET;
+ err = dev->ethtool_ops->get_settings(dev, &cmd);
+ if (err < 0)
+ return err;
+
+ /* we ignore deprecated fields transceiver/maxrxpkt/maxtxpkt
+ */
+ convert_legacy_settings_to_link_ksettings(link_ksettings, &cmd);
+ return err;
+}
+EXPORT_SYMBOL(__ethtool_get_link_ksettings);
+
+/* convert ethtool_link_usettings in user space to a kernel internal
+ * ethtool_link_ksettings. return 0 on success, errno on error.
+ */
+static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to,
+ const void __user *from)
+{
+ struct ethtool_link_usettings link_usettings;
+
+ if (copy_from_user(&link_usettings, from, sizeof(link_usettings)))
+ return -EFAULT;
+
+ memcpy(&to->base, &link_usettings.base, sizeof(to->base));
+ bitmap_from_arr32(to->link_modes.supported,
+ link_usettings.link_modes.supported,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+ bitmap_from_arr32(to->link_modes.advertising,
+ link_usettings.link_modes.advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+ bitmap_from_arr32(to->link_modes.lp_advertising,
+ link_usettings.link_modes.lp_advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+ return 0;
+}
+
+/* convert a kernel internal ethtool_link_ksettings to
+ * ethtool_link_usettings in user space. return 0 on success, errno on
+ * error.
+ */
+static int
+store_link_ksettings_for_user(void __user *to,
+ const struct ethtool_link_ksettings *from)
+{
+ struct ethtool_link_usettings link_usettings;
+
+ memcpy(&link_usettings, from, sizeof(link_usettings));
+ bitmap_to_arr32(link_usettings.link_modes.supported,
+ from->link_modes.supported,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+ bitmap_to_arr32(link_usettings.link_modes.advertising,
+ from->link_modes.advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+ bitmap_to_arr32(link_usettings.link_modes.lp_advertising,
+ from->link_modes.lp_advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+ if (copy_to_user(to, &link_usettings, sizeof(link_usettings)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/* Query device for its ethtool_link_settings.
+ *
+ * Backward compatibility note: this function must fail when driver
+ * does not implement ethtool::get_link_ksettings, even if legacy
+ * ethtool_ops::get_settings is implemented. This tells new versions
+ * of ethtool that they should use the legacy API %ETHTOOL_GSET for
+ * this driver, so that they can correctly access the ethtool_cmd
+ * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
+ * implements ethtool_ops::get_settings anymore.
+ */
+static int ethtool_get_link_ksettings(struct net_device *dev,
+ void __user *useraddr)
+{
+ int err = 0;
+ struct ethtool_link_ksettings link_ksettings;
+
+ ASSERT_RTNL();
+
+ if (!dev->ethtool_ops->get_link_ksettings)
+ return -EOPNOTSUPP;
+
+ /* handle bitmap nbits handshake */
+ if (copy_from_user(&link_ksettings.base, useraddr,
+ sizeof(link_ksettings.base)))
+ return -EFAULT;
+
+ if (__ETHTOOL_LINK_MODE_MASK_NU32
+ != link_ksettings.base.link_mode_masks_nwords) {
+ /* wrong link mode nbits requested */
+ memset(&link_ksettings, 0, sizeof(link_ksettings));
+ link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS;
+ /* send back number of words required as negative val */
+ compiletime_assert(__ETHTOOL_LINK_MODE_MASK_NU32 <= S8_MAX,
+ "need too many bits for link modes!");
+ link_ksettings.base.link_mode_masks_nwords
+ = -((s8)__ETHTOOL_LINK_MODE_MASK_NU32);
+
+ /* copy the base fields back to user, not the link
+ * mode bitmaps
+ */
+ if (copy_to_user(useraddr, &link_ksettings.base,
+ sizeof(link_ksettings.base)))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ /* handshake successful: user/kernel agree on
+ * link_mode_masks_nwords
+ */
+
+ memset(&link_ksettings, 0, sizeof(link_ksettings));
+ err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings);
+ if (err < 0)
+ return err;
+
+ /* make sure we tell the right values to user */
+ link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS;
+ link_ksettings.base.link_mode_masks_nwords
+ = __ETHTOOL_LINK_MODE_MASK_NU32;
+
+ return store_link_ksettings_for_user(useraddr, &link_ksettings);
+}
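+
+/*
+ * Illustrative sketch (comment only): the nwords handshake above as seen
+ * from user space.  struct ethtool_link_settings, struct ifreq and
+ * SIOCETHTOOL come from the uapi/libc headers; error handling is elided
+ * and the buffer is sized per the S8_MAX bound asserted above.
+ *
+ *	struct {
+ *		struct ethtool_link_settings base;
+ *		__u32 link_mode_data[3 * SCHAR_MAX];
+ *	} ecmd = { .base.cmd = ETHTOOL_GLINKSETTINGS };
+ *	struct ifreq ifr = {};
+ *
+ *	ifr.ifr_data = (void *)&ecmd;
+ *	strcpy(ifr.ifr_name, "eth0");
+ *	ioctl(fd, SIOCETHTOOL, &ifr);	// pass 1: nwords == 0
+ *	ecmd.base.link_mode_masks_nwords =
+ *		-ecmd.base.link_mode_masks_nwords;
+ *	ioctl(fd, SIOCETHTOOL, &ifr);	// pass 2: full settings returned
+ */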
+
+/* Update device ethtool_link_settings.
+ *
+ * Backward compatibility note: this function must fail when driver
+ * does not implement ethtool::set_link_ksettings, even if legacy
+ * ethtool_ops::set_settings is implemented. This tells new versions
+ * of ethtool that they should use the legacy API %ETHTOOL_SSET for
+ * this driver, so that they can correctly update the ethtool_cmd
+ * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
+ * implements ethtool_ops::get_settings anymore.
+ */
+static int ethtool_set_link_ksettings(struct net_device *dev,
+ void __user *useraddr)
+{
+ int err;
+ struct ethtool_link_ksettings link_ksettings;
+
+ ASSERT_RTNL();
+
+ if (!dev->ethtool_ops->set_link_ksettings)
+ return -EOPNOTSUPP;
+
+ /* make sure nbits field has expected value */
+ if (copy_from_user(&link_ksettings.base, useraddr,
+ sizeof(link_ksettings.base)))
+ return -EFAULT;
+
+ if (__ETHTOOL_LINK_MODE_MASK_NU32
+ != link_ksettings.base.link_mode_masks_nwords)
+ return -EINVAL;
+
+ /* copy the whole structure, now that we know it has expected
+ * format
+ */
+ err = load_link_ksettings_from_user(&link_ksettings, useraddr);
+ if (err)
+ return err;
+
+ /* re-check nwords field, just in case */
+ if (__ETHTOOL_LINK_MODE_MASK_NU32
+ != link_ksettings.base.link_mode_masks_nwords)
+ return -EINVAL;
+
+ return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
+}
+
+/* Query device for its ethtool_cmd settings.
+ *
+ * Backward compatibility note: for compatibility with legacy ethtool,
+ * this has to work with both drivers implementing get_link_ksettings
+ * API and drivers implementing get_settings API. When drivers
+ * implement get_link_ksettings and report higher link mode bits, a
+ * kernel warning is logged once (with name of 1st driver/device) to
+ * recommend user to upgrade ethtool, but the command is successful
+ * (only the lower link mode bits reported back to user).
+ */
+static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_cmd cmd;
+
+ ASSERT_RTNL();
+
+ if (dev->ethtool_ops->get_link_ksettings) {
+ /* First, use link_ksettings API if it is supported */
+ int err;
+ struct ethtool_link_ksettings link_ksettings;
+
+ memset(&link_ksettings, 0, sizeof(link_ksettings));
+ err = dev->ethtool_ops->get_link_ksettings(dev,
+ &link_ksettings);
+ if (err < 0)
+ return err;
+ convert_link_ksettings_to_legacy_settings(&cmd,
+ &link_ksettings);
+
+ /* send a sensible cmd tag back to user */
+ cmd.cmd = ETHTOOL_GSET;
+ } else {
+ /* driver doesn't support %ethtool_link_ksettings
+ * API. revert to legacy %ethtool_cmd API, unless it's
+ * not supported either.
+ */
+ int err;
+
+ if (!dev->ethtool_ops->get_settings)
+ return -EOPNOTSUPP;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.cmd = ETHTOOL_GSET;
+ err = dev->ethtool_ops->get_settings(dev, &cmd);
+ if (err < 0)
+ return err;
+ }
+
+ if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/* Update device link settings with given ethtool_cmd.
+ *
+ * Backward compatibility note: for compatibility with legacy ethtool,
+ * this has to work with both drivers implementing set_link_ksettings
+ * API and drivers implementing set_settings API. When drivers
+ * implement set_link_ksettings and user's request updates deprecated
+ * ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel
+ * warning is logged once (with name of 1st driver/device) to
+ * recommend user to upgrade ethtool, and the request is rejected.
+ */
+static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_cmd cmd;
+
+ ASSERT_RTNL();
+
+ if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
+ return -EFAULT;
+
+ /* first, try new %ethtool_link_ksettings API. */
+ if (dev->ethtool_ops->set_link_ksettings) {
+ struct ethtool_link_ksettings link_ksettings;
+
+ if (!convert_legacy_settings_to_link_ksettings(&link_ksettings,
+ &cmd))
+ return -EINVAL;
+
+ link_ksettings.base.cmd = ETHTOOL_SLINKSETTINGS;
+ link_ksettings.base.link_mode_masks_nwords
+ = __ETHTOOL_LINK_MODE_MASK_NU32;
+ return dev->ethtool_ops->set_link_ksettings(dev,
+ &link_ksettings);
+ }
+
+ /* legacy %ethtool_cmd API */
+
+ /* TODO: return -EOPNOTSUPP when ethtool_ops::get_settings
+ * disappears internally
+ */
+
+ if (!dev->ethtool_ops->set_settings)
+ return -EOPNOTSUPP;
+
+ return dev->ethtool_ops->set_settings(dev, &cmd);
+}
+
+static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_drvinfo info;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ memset(&info, 0, sizeof(info));
+ info.cmd = ETHTOOL_GDRVINFO;
+ if (ops->get_drvinfo) {
+ ops->get_drvinfo(dev, &info);
+ } else if (dev->dev.parent && dev->dev.parent->driver) {
+ strlcpy(info.bus_info, dev_name(dev->dev.parent),
+ sizeof(info.bus_info));
+ strlcpy(info.driver, dev->dev.parent->driver->name,
+ sizeof(info.driver));
+ } else {
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * this method of obtaining string set info is deprecated;
+ * Use ETHTOOL_GSSET_INFO instead.
+ */
+ if (ops->get_sset_count) {
+ int rc;
+
+ rc = ops->get_sset_count(dev, ETH_SS_TEST);
+ if (rc >= 0)
+ info.testinfo_len = rc;
+ rc = ops->get_sset_count(dev, ETH_SS_STATS);
+ if (rc >= 0)
+ info.n_stats = rc;
+ rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS);
+ if (rc >= 0)
+ info.n_priv_flags = rc;
+ }
+ if (ops->get_regs_len) {
+ int ret = ops->get_regs_len(dev);
+
+ if (ret > 0)
+ info.regdump_len = ret;
+ }
+
+ if (ops->get_eeprom_len)
+ info.eedump_len = ops->get_eeprom_len(dev);
+
+ if (copy_to_user(useraddr, &info, sizeof(info)))
+ return -EFAULT;
+ return 0;
+}
+
+static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_sset_info info;
+ u64 sset_mask;
+ int i, idx = 0, n_bits = 0, ret, rc;
+ u32 *info_buf = NULL;
+
+ if (copy_from_user(&info, useraddr, sizeof(info)))
+ return -EFAULT;
+
+ /* store copy of mask, because we zero struct later on */
+ sset_mask = info.sset_mask;
+ if (!sset_mask)
+ return 0;
+
+ /* calculate size of return buffer */
+ n_bits = hweight64(sset_mask);
+
+ memset(&info, 0, sizeof(info));
+ info.cmd = ETHTOOL_GSSET_INFO;
+
+ info_buf = kcalloc(n_bits, sizeof(u32), GFP_USER);
+ if (!info_buf)
+ return -ENOMEM;
+
+ /*
+ * fill return buffer based on input bitmask and successful
+ * get_sset_count return
+ */
+ for (i = 0; i < 64; i++) {
+ if (!(sset_mask & (1ULL << i)))
+ continue;
+
+ rc = __ethtool_get_sset_count(dev, i);
+ if (rc >= 0) {
+ info.sset_mask |= (1ULL << i);
+ info_buf[idx++] = rc;
+ }
+ }
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &info, sizeof(info)))
+ goto out;
+
+ useraddr += offsetof(struct ethtool_sset_info, data);
+ if (copy_to_user(useraddr, info_buf, idx * sizeof(u32)))
+ goto out;
+
+ ret = 0;
+
+out:
+ kfree(info_buf);
+ return ret;
+}
+
+static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
+ u32 cmd, void __user *useraddr)
+{
+ struct ethtool_rxnfc info;
+ size_t info_size = sizeof(info);
+ int rc;
+
+ if (!dev->ethtool_ops->set_rxnfc)
+ return -EOPNOTSUPP;
+
+ /* struct ethtool_rxnfc was originally defined for
+ * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data
+ * members. User-space might still be using that
+ * definition. */
+ if (cmd == ETHTOOL_SRXFH)
+ info_size = (offsetof(struct ethtool_rxnfc, data) +
+ sizeof(info.data));
+
+ if (copy_from_user(&info, useraddr, info_size))
+ return -EFAULT;
+
+ rc = dev->ethtool_ops->set_rxnfc(dev, &info);
+ if (rc)
+ return rc;
+
+ if (cmd == ETHTOOL_SRXCLSRLINS &&
+ copy_to_user(useraddr, &info, info_size))
+ return -EFAULT;
+
+ return 0;
+}
+
+static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
+ u32 cmd, void __user *useraddr)
+{
+ struct ethtool_rxnfc info;
+ size_t info_size = sizeof(info);
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ int ret;
+ void *rule_buf = NULL;
+
+ if (!ops->get_rxnfc)
+ return -EOPNOTSUPP;
+
+ /* struct ethtool_rxnfc was originally defined for
+ * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data
+ * members. User-space might still be using that
+ * definition. */
+ if (cmd == ETHTOOL_GRXFH)
+ info_size = (offsetof(struct ethtool_rxnfc, data) +
+ sizeof(info.data));
+
+ if (copy_from_user(&info, useraddr, info_size))
+ return -EFAULT;
+
+ /* If FLOW_RSS was requested then user-space must be using the
+ * new definition, as FLOW_RSS is newer.
+ */
+ if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) {
+ info_size = sizeof(info);
+ if (copy_from_user(&info, useraddr, info_size))
+ return -EFAULT;
+ /* Since malicious users may modify the original data,
+ * we need to check whether FLOW_RSS is still requested.
+ */
+ if (!(info.flow_type & FLOW_RSS))
+ return -EINVAL;
+ }
+
+ if (info.cmd != cmd)
+ return -EINVAL;
+
+ if (info.cmd == ETHTOOL_GRXCLSRLALL) {
+ if (info.rule_cnt > 0) {
+ if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32))
+ rule_buf = kcalloc(info.rule_cnt, sizeof(u32),
+ GFP_USER);
+ if (!rule_buf)
+ return -ENOMEM;
+ }
+ }
+
+ ret = ops->get_rxnfc(dev, &info, rule_buf);
+ if (ret < 0)
+ goto err_out;
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &info, info_size))
+ goto err_out;
+
+ if (rule_buf) {
+ useraddr += offsetof(struct ethtool_rxnfc, rule_locs);
+ if (copy_to_user(useraddr, rule_buf,
+ info.rule_cnt * sizeof(u32)))
+ goto err_out;
+ }
+ ret = 0;
+
+err_out:
+ kfree(rule_buf);
+
+ return ret;
+}
+
+static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr,
+ struct ethtool_rxnfc *rx_rings,
+ u32 size)
+{
+ int i;
+
+ if (copy_from_user(indir, useraddr, size * sizeof(indir[0])))
+ return -EFAULT;
+
+ /* Validate ring indices */
+ for (i = 0; i < size; i++)
+ if (indir[i] >= rx_rings->data)
+ return -EINVAL;
+
+ return 0;
+}
+
+u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly;
+
+void netdev_rss_key_fill(void *buffer, size_t len)
+{
+ BUG_ON(len > sizeof(netdev_rss_key));
+ net_get_random_once(netdev_rss_key, sizeof(netdev_rss_key));
+ memcpy(buffer, netdev_rss_key, len);
+}
+EXPORT_SYMBOL(netdev_rss_key_fill);
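+
+/*
+ * Illustrative sketch (comment only): a driver that needs a default RSS
+ * hash key can fill it from the host-wide key above; the 40-byte length
+ * here is just an example, the real length is driver specific.
+ *
+ *	u8 rss_key[40];
+ *
+ *	netdev_rss_key_fill(rss_key, sizeof(rss_key));
+ */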
+
+static int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max)
+{
+ u32 dev_size, current_max = 0;
+ u32 *indir;
+ int ret;
+
+ if (!dev->ethtool_ops->get_rxfh_indir_size ||
+ !dev->ethtool_ops->get_rxfh)
+ return -EOPNOTSUPP;
+ dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
+ if (dev_size == 0)
+ return -EOPNOTSUPP;
+
+ indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
+ if (!indir)
+ return -ENOMEM;
+
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL);
+ if (ret)
+ goto out;
+
+ while (dev_size--)
+ current_max = max(current_max, indir[dev_size]);
+
+ *max = current_max;
+
+out:
+ kfree(indir);
+ return ret;
+}
+
+static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
+ void __user *useraddr)
+{
+ u32 user_size, dev_size;
+ u32 *indir;
+ int ret;
+
+ if (!dev->ethtool_ops->get_rxfh_indir_size ||
+ !dev->ethtool_ops->get_rxfh)
+ return -EOPNOTSUPP;
+ dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
+ if (dev_size == 0)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&user_size,
+ useraddr + offsetof(struct ethtool_rxfh_indir, size),
+ sizeof(user_size)))
+ return -EFAULT;
+
+ if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size),
+ &dev_size, sizeof(dev_size)))
+ return -EFAULT;
+
+ /* If the user buffer size is 0, this is just a query for the
+ * device table size. Otherwise, if it's smaller than the
+ * device table size it's an error.
+ */
+ if (user_size < dev_size)
+ return user_size == 0 ? 0 : -EINVAL;
+
+ indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
+ if (!indir)
+ return -ENOMEM;
+
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL);
+ if (ret)
+ goto out;
+
+ if (copy_to_user(useraddr +
+ offsetof(struct ethtool_rxfh_indir, ring_index[0]),
+ indir, dev_size * sizeof(indir[0])))
+ ret = -EFAULT;
+
+out:
+ kfree(indir);
+ return ret;
+}
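+
+/*
+ * Illustrative sketch (comment only): the two-step table read implied by
+ * the size check above, from the user side.  The cmd/size/ring_index[]
+ * layout of struct ethtool_rxfh_indir comes from the uapi header.
+ *
+ *	struct ethtool_rxfh_indir query = { .cmd = ETHTOOL_GRXFHINDIR };
+ *
+ *	// pass 1: .size == 0, the kernel only reports the device table size
+ *	//         (wrap &query in a struct ifreq and issue SIOCETHTOOL)
+ *	// pass 2: allocate sizeof(query) + query.size * sizeof(__u32) bytes,
+ *	//         set .size = query.size and re-issue ETHTOOL_GRXFHINDIR to
+ *	//         receive the full ring_index[] table
+ */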
+
+static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_rxnfc rx_rings;
+ u32 user_size, dev_size, i;
+ u32 *indir;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ int ret;
+ u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]);
+
+ if (!ops->get_rxfh_indir_size || !ops->set_rxfh ||
+ !ops->get_rxnfc)
+ return -EOPNOTSUPP;
+
+ dev_size = ops->get_rxfh_indir_size(dev);
+ if (dev_size == 0)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&user_size,
+ useraddr + offsetof(struct ethtool_rxfh_indir, size),
+ sizeof(user_size)))
+ return -EFAULT;
+
+ if (user_size != 0 && user_size != dev_size)
+ return -EINVAL;
+
+ indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
+ if (!indir)
+ return -ENOMEM;
+
+ rx_rings.cmd = ETHTOOL_GRXRINGS;
+ ret = ops->get_rxnfc(dev, &rx_rings, NULL);
+ if (ret)
+ goto out;
+
+ if (user_size == 0) {
+ for (i = 0; i < dev_size; i++)
+ indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+ } else {
+ ret = ethtool_copy_validate_indir(indir,
+ useraddr + ringidx_offset,
+ &rx_rings,
+ dev_size);
+ if (ret)
+ goto out;
+ }
+
+ ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE);
+ if (ret)
+ goto out;
+
+ /* indicate whether rxfh was set to default */
+ if (user_size == 0)
+ dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+ else
+ dev->priv_flags |= IFF_RXFH_CONFIGURED;
+
+out:
+ kfree(indir);
+ return ret;
+}
+
+static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ u32 user_indir_size, user_key_size;
+ u32 dev_indir_size = 0, dev_key_size = 0;
+ struct ethtool_rxfh rxfh;
+ u32 total_size;
+ u32 indir_bytes;
+ u32 *indir = NULL;
+ u8 dev_hfunc = 0;
+ u8 *hkey = NULL;
+ u8 *rss_config;
+
+ if (!ops->get_rxfh)
+ return -EOPNOTSUPP;
+
+ if (ops->get_rxfh_indir_size)
+ dev_indir_size = ops->get_rxfh_indir_size(dev);
+ if (ops->get_rxfh_key_size)
+ dev_key_size = ops->get_rxfh_key_size(dev);
+
+ if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
+ return -EFAULT;
+ user_indir_size = rxfh.indir_size;
+ user_key_size = rxfh.key_size;
+
+ /* Check that reserved fields are 0 for now */
+ if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32)
+ return -EINVAL;
+	/* Most drivers don't handle rss_context, so check it's 0 as well */
+ if (rxfh.rss_context && !ops->get_rxfh_context)
+ return -EOPNOTSUPP;
+
+ rxfh.indir_size = dev_indir_size;
+ rxfh.key_size = dev_key_size;
+ if (copy_to_user(useraddr, &rxfh, sizeof(rxfh)))
+ return -EFAULT;
+
+ if ((user_indir_size && (user_indir_size != dev_indir_size)) ||
+ (user_key_size && (user_key_size != dev_key_size)))
+ return -EINVAL;
+
+ indir_bytes = user_indir_size * sizeof(indir[0]);
+ total_size = indir_bytes + user_key_size;
+ rss_config = kzalloc(total_size, GFP_USER);
+ if (!rss_config)
+ return -ENOMEM;
+
+ if (user_indir_size)
+ indir = (u32 *)rss_config;
+
+ if (user_key_size)
+ hkey = rss_config + indir_bytes;
+
+ if (rxfh.rss_context)
+ ret = dev->ethtool_ops->get_rxfh_context(dev, indir, hkey,
+ &dev_hfunc,
+ rxfh.rss_context);
+ else
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc);
+ if (ret)
+ goto out;
+
+ if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc),
+ &dev_hfunc, sizeof(rxfh.hfunc))) {
+ ret = -EFAULT;
+ } else if (copy_to_user(useraddr +
+ offsetof(struct ethtool_rxfh, rss_config[0]),
+ rss_config, total_size)) {
+ ret = -EFAULT;
+ }
+out:
+ kfree(rss_config);
+
+ return ret;
+}
+
+static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxnfc rx_rings;
+ struct ethtool_rxfh rxfh;
+ u32 dev_indir_size = 0, dev_key_size = 0, i;
+ u32 *indir = NULL, indir_bytes = 0;
+ u8 *hkey = NULL;
+ u8 *rss_config;
+ u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]);
+ bool delete = false;
+
+ if (!ops->get_rxnfc || !ops->set_rxfh)
+ return -EOPNOTSUPP;
+
+ if (ops->get_rxfh_indir_size)
+ dev_indir_size = ops->get_rxfh_indir_size(dev);
+ if (ops->get_rxfh_key_size)
+ dev_key_size = ops->get_rxfh_key_size(dev);
+
+ if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
+ return -EFAULT;
+
+ /* Check that reserved fields are 0 for now */
+ if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32)
+ return -EINVAL;
+	/* Most drivers don't handle rss_context, so check it's 0 as well */
+ if (rxfh.rss_context && !ops->set_rxfh_context)
+ return -EOPNOTSUPP;
+
+ /* If either indir, hash key or function is valid, proceed further.
+ * Must request at least one change: indir size, hash key or function.
+ */
+ if ((rxfh.indir_size &&
+ rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE &&
+ rxfh.indir_size != dev_indir_size) ||
+ (rxfh.key_size && (rxfh.key_size != dev_key_size)) ||
+ (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE &&
+ rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE))
+ return -EINVAL;
+
+ if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
+ indir_bytes = dev_indir_size * sizeof(indir[0]);
+
+ rss_config = kzalloc(indir_bytes + rxfh.key_size, GFP_USER);
+ if (!rss_config)
+ return -ENOMEM;
+
+ rx_rings.cmd = ETHTOOL_GRXRINGS;
+ ret = ops->get_rxnfc(dev, &rx_rings, NULL);
+ if (ret)
+ goto out;
+
+ /* rxfh.indir_size == 0 means reset the indir table to default (master
+ * context) or delete the context (other RSS contexts).
+ * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged.
+ */
+ if (rxfh.indir_size &&
+ rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) {
+ indir = (u32 *)rss_config;
+ ret = ethtool_copy_validate_indir(indir,
+ useraddr + rss_cfg_offset,
+ &rx_rings,
+ rxfh.indir_size);
+ if (ret)
+ goto out;
+ } else if (rxfh.indir_size == 0) {
+ if (rxfh.rss_context == 0) {
+ indir = (u32 *)rss_config;
+ for (i = 0; i < dev_indir_size; i++)
+ indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+ } else {
+ delete = true;
+ }
+ }
+
+ if (rxfh.key_size) {
+ hkey = rss_config + indir_bytes;
+ if (copy_from_user(hkey,
+ useraddr + rss_cfg_offset + indir_bytes,
+ rxfh.key_size)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+
+ if (rxfh.rss_context)
+ ret = ops->set_rxfh_context(dev, indir, hkey, rxfh.hfunc,
+ &rxfh.rss_context, delete);
+ else
+ ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc);
+ if (ret)
+ goto out;
+
+ if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_context),
+ &rxfh.rss_context, sizeof(rxfh.rss_context)))
+ ret = -EFAULT;
+
+ if (!rxfh.rss_context) {
+ /* indicate whether rxfh was set to default */
+ if (rxfh.indir_size == 0)
+ dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+ else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
+ dev->priv_flags |= IFF_RXFH_CONFIGURED;
+ }
+
+out:
+ kfree(rss_config);
+ return ret;
+}
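+
+/*
+ * Illustrative sketch (comment only): the three indir_size requests a user
+ * can send to the handler above (field layout of struct ethtool_rxfh is
+ * from the uapi header):
+ *
+ *	rxfh.indir_size = dev_indir_size;		// replace the table
+ *	rxfh.indir_size = 0;				// reset to default, or
+ *							// delete a non-zero
+ *							// rss_context
+ *	rxfh.indir_size = ETH_RXFH_INDIR_NO_CHANGE;	// keep the table, only
+ *							// change key/hfunc
+ */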
+
+static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_regs regs;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void *regbuf;
+ int reglen, ret;
+
+ if (!ops->get_regs || !ops->get_regs_len)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&regs, useraddr, sizeof(regs)))
+ return -EFAULT;
+
+ reglen = ops->get_regs_len(dev);
+ if (reglen <= 0)
+ return reglen;
+
+ if (regs.len > reglen)
+ regs.len = reglen;
+
+ regbuf = NULL;
+ if (reglen) {
+ regbuf = vzalloc(reglen);
+ if (!regbuf)
+ return -ENOMEM;
+ }
+
+ if (regs.len < reglen)
+ reglen = regs.len;
+
+ ops->get_regs(dev, &regs, regbuf);
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &regs, sizeof(regs)))
+ goto out;
+ useraddr += offsetof(struct ethtool_regs, data);
+ if (copy_to_user(useraddr, regbuf, reglen))
+ goto out;
+ ret = 0;
+
+ out:
+ vfree(regbuf);
+ return ret;
+}
+
+static int ethtool_reset(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_value reset;
+ int ret;
+
+ if (!dev->ethtool_ops->reset)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&reset, useraddr, sizeof(reset)))
+ return -EFAULT;
+
+ ret = dev->ethtool_ops->reset(dev, &reset.data);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(useraddr, &reset, sizeof(reset)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_get_wol(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_wolinfo wol;
+
+ if (!dev->ethtool_ops->get_wol)
+ return -EOPNOTSUPP;
+
+ memset(&wol, 0, sizeof(struct ethtool_wolinfo));
+ wol.cmd = ETHTOOL_GWOL;
+ dev->ethtool_ops->get_wol(dev, &wol);
+
+ if (copy_to_user(useraddr, &wol, sizeof(wol)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_set_wol(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_wolinfo wol;
+ int ret;
+
+ if (!dev->ethtool_ops->set_wol)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&wol, useraddr, sizeof(wol)))
+ return -EFAULT;
+
+ ret = dev->ethtool_ops->set_wol(dev, &wol);
+ if (ret)
+ return ret;
+
+ dev->wol_enabled = !!wol.wolopts;
+
+ return 0;
+}
+
+static int ethtool_get_eee(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_eee edata;
+ int rc;
+
+ if (!dev->ethtool_ops->get_eee)
+ return -EOPNOTSUPP;
+
+ memset(&edata, 0, sizeof(struct ethtool_eee));
+ edata.cmd = ETHTOOL_GEEE;
+ rc = dev->ethtool_ops->get_eee(dev, &edata);
+
+ if (rc)
+ return rc;
+
+ if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ethtool_set_eee(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_eee edata;
+
+ if (!dev->ethtool_ops->set_eee)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ return -EFAULT;
+
+ return dev->ethtool_ops->set_eee(dev, &edata);
+}
+
+static int ethtool_nway_reset(struct net_device *dev)
+{
+ if (!dev->ethtool_ops->nway_reset)
+ return -EOPNOTSUPP;
+
+ return dev->ethtool_ops->nway_reset(dev);
+}
+
+static int ethtool_get_link(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_value edata = { .cmd = ETHTOOL_GLINK };
+
+ if (!dev->ethtool_ops->get_link)
+ return -EOPNOTSUPP;
+
+ edata.data = netif_running(dev) && dev->ethtool_ops->get_link(dev);
+
+ if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_get_any_eeprom(struct net_device *dev, void __user *useraddr,
+ int (*getter)(struct net_device *,
+ struct ethtool_eeprom *, u8 *),
+ u32 total_len)
+{
+ struct ethtool_eeprom eeprom;
+ void __user *userbuf = useraddr + sizeof(eeprom);
+ u32 bytes_remaining;
+ u8 *data;
+ int ret = 0;
+
+ if (copy_from_user(&eeprom, useraddr, sizeof(eeprom)))
+ return -EFAULT;
+
+ /* Check for wrap and zero */
+ if (eeprom.offset + eeprom.len <= eeprom.offset)
+ return -EINVAL;
+
+ /* Check for exceeding total eeprom len */
+ if (eeprom.offset + eeprom.len > total_len)
+ return -EINVAL;
+
+ data = kzalloc(PAGE_SIZE, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+
+ bytes_remaining = eeprom.len;
+ while (bytes_remaining > 0) {
+ eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE);
+
+ ret = getter(dev, &eeprom, data);
+ if (ret)
+ break;
+ if (copy_to_user(userbuf, data, eeprom.len)) {
+ ret = -EFAULT;
+ break;
+ }
+ userbuf += eeprom.len;
+ eeprom.offset += eeprom.len;
+ bytes_remaining -= eeprom.len;
+ }
+
+ eeprom.len = userbuf - (useraddr + sizeof(eeprom));
+ eeprom.offset -= eeprom.len;
+ if (copy_to_user(useraddr, &eeprom, sizeof(eeprom)))
+ ret = -EFAULT;
+
+ kfree(data);
+ return ret;
+}
+
+static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (!ops->get_eeprom || !ops->get_eeprom_len ||
+ !ops->get_eeprom_len(dev))
+ return -EOPNOTSUPP;
+
+ return ethtool_get_any_eeprom(dev, useraddr, ops->get_eeprom,
+ ops->get_eeprom_len(dev));
+}
+
+static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_eeprom eeprom;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void __user *userbuf = useraddr + sizeof(eeprom);
+ u32 bytes_remaining;
+ u8 *data;
+ int ret = 0;
+
+ if (!ops->set_eeprom || !ops->get_eeprom_len ||
+ !ops->get_eeprom_len(dev))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&eeprom, useraddr, sizeof(eeprom)))
+ return -EFAULT;
+
+ /* Check for wrap and zero */
+ if (eeprom.offset + eeprom.len <= eeprom.offset)
+ return -EINVAL;
+
+ /* Check for exceeding total eeprom len */
+ if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev))
+ return -EINVAL;
+
+ data = kzalloc(PAGE_SIZE, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+
+ bytes_remaining = eeprom.len;
+ while (bytes_remaining > 0) {
+ eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE);
+
+ if (copy_from_user(data, userbuf, eeprom.len)) {
+ ret = -EFAULT;
+ break;
+ }
+ ret = ops->set_eeprom(dev, &eeprom, data);
+ if (ret)
+ break;
+ userbuf += eeprom.len;
+ eeprom.offset += eeprom.len;
+ bytes_remaining -= eeprom.len;
+ }
+
+ kfree(data);
+ return ret;
+}
+
+static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE };
+
+ if (!dev->ethtool_ops->get_coalesce)
+ return -EOPNOTSUPP;
+
+ dev->ethtool_ops->get_coalesce(dev, &coalesce);
+
+ if (copy_to_user(useraddr, &coalesce, sizeof(coalesce)))
+ return -EFAULT;
+ return 0;
+}
+
+static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_coalesce coalesce;
+
+ if (!dev->ethtool_ops->set_coalesce)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&coalesce, useraddr, sizeof(coalesce)))
+ return -EFAULT;
+
+ return dev->ethtool_ops->set_coalesce(dev, &coalesce);
+}
+
+static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_ringparam ringparam = { .cmd = ETHTOOL_GRINGPARAM };
+
+ if (!dev->ethtool_ops->get_ringparam)
+ return -EOPNOTSUPP;
+
+ dev->ethtool_ops->get_ringparam(dev, &ringparam);
+
+ if (copy_to_user(useraddr, &ringparam, sizeof(ringparam)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM };
+
+ if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&ringparam, useraddr, sizeof(ringparam)))
+ return -EFAULT;
+
+ dev->ethtool_ops->get_ringparam(dev, &max);
+
+ /* ensure new ring parameters are within the maximums */
+ if (ringparam.rx_pending > max.rx_max_pending ||
+ ringparam.rx_mini_pending > max.rx_mini_max_pending ||
+ ringparam.rx_jumbo_pending > max.rx_jumbo_max_pending ||
+ ringparam.tx_pending > max.tx_max_pending)
+ return -EINVAL;
+
+ return dev->ethtool_ops->set_ringparam(dev, &ringparam);
+}
+
+static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
+
+ if (!dev->ethtool_ops->get_channels)
+ return -EOPNOTSUPP;
+
+ dev->ethtool_ops->get_channels(dev, &channels);
+
+ if (copy_to_user(useraddr, &channels, sizeof(channels)))
+ return -EFAULT;
+ return 0;
+}
+
+static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_channels channels, max = { .cmd = ETHTOOL_GCHANNELS };
+ u32 max_rx_in_use = 0;
+
+ if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&channels, useraddr, sizeof(channels)))
+ return -EFAULT;
+
+ dev->ethtool_ops->get_channels(dev, &max);
+
+ /* ensure new counts are within the maximums */
+ if ((channels.rx_count > max.max_rx) ||
+ (channels.tx_count > max.max_tx) ||
+ (channels.combined_count > max.max_combined) ||
+ (channels.other_count > max.max_other))
+ return -EINVAL;
+
+ /* ensure the new Rx count fits within the configured Rx flow
+ * indirection table settings */
+ if (netif_is_rxfh_configured(dev) &&
+ !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) &&
+ (channels.combined_count + channels.rx_count) <= max_rx_in_use)
+ return -EINVAL;
+
+ return dev->ethtool_ops->set_channels(dev, &channels);
+}
+
+static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM };
+
+ if (!dev->ethtool_ops->get_pauseparam)
+ return -EOPNOTSUPP;
+
+ dev->ethtool_ops->get_pauseparam(dev, &pauseparam);
+
+ if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_pauseparam pauseparam;
+
+ if (!dev->ethtool_ops->set_pauseparam)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam)))
+ return -EFAULT;
+
+ return dev->ethtool_ops->set_pauseparam(dev, &pauseparam);
+}
+
+static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_test test;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ u64 *data;
+ int ret, test_len;
+
+ if (!ops->self_test || !ops->get_sset_count)
+ return -EOPNOTSUPP;
+
+ test_len = ops->get_sset_count(dev, ETH_SS_TEST);
+ if (test_len < 0)
+ return test_len;
+ WARN_ON(test_len == 0);
+
+ if (copy_from_user(&test, useraddr, sizeof(test)))
+ return -EFAULT;
+
+ test.len = test_len;
+ data = kcalloc(test_len, sizeof(u64), GFP_USER);
+ if (!data)
+ return -ENOMEM;
+
+ ops->self_test(dev, &test, data);
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &test, sizeof(test)))
+ goto out;
+ useraddr += sizeof(test);
+ if (copy_to_user(useraddr, data, test.len * sizeof(u64)))
+ goto out;
+ ret = 0;
+
+ out:
+ kfree(data);
+ return ret;
+}
+
+static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_gstrings gstrings;
+ u8 *data;
+ int ret;
+
+ if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))
+ return -EFAULT;
+
+ ret = __ethtool_get_sset_count(dev, gstrings.string_set);
+ if (ret < 0)
+ return ret;
+ if (ret > S32_MAX / ETH_GSTRING_LEN)
+ return -ENOMEM;
+ WARN_ON_ONCE(!ret);
+
+ gstrings.len = ret;
+
+ if (gstrings.len) {
+ data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN));
+ if (!data)
+ return -ENOMEM;
+
+ __ethtool_get_strings(dev, gstrings.string_set, data);
+ } else {
+ data = NULL;
+ }
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
+ goto out;
+ useraddr += sizeof(gstrings);
+ if (gstrings.len &&
+ copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
+ goto out;
+ ret = 0;
+
+out:
+ vfree(data);
+ return ret;
+}
+
+static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_value id;
+ static bool busy;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ int rc;
+
+ if (!ops->set_phys_id)
+ return -EOPNOTSUPP;
+
+ if (busy)
+ return -EBUSY;
+
+ if (copy_from_user(&id, useraddr, sizeof(id)))
+ return -EFAULT;
+
+ rc = ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE);
+ if (rc < 0)
+ return rc;
+
+ /* Drop the RTNL lock while waiting, but prevent reentry or
+ * removal of the device.
+ */
+ busy = true;
+ dev_hold(dev);
+ rtnl_unlock();
+
+ if (rc == 0) {
+ /* Driver will handle this itself */
+ schedule_timeout_interruptible(
+ id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT);
+ } else {
+ /* Driver expects to be called at twice the frequency in rc */
+ int n = rc * 2, i, interval = HZ / n;
+
+ /* Count down seconds */
+ do {
+ /* Count down iterations per second */
+ i = n;
+ do {
+ rtnl_lock();
+ rc = ops->set_phys_id(dev,
+ (i & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON);
+ rtnl_unlock();
+ if (rc)
+ break;
+ schedule_timeout_interruptible(interval);
+ } while (!signal_pending(current) && --i != 0);
+ } while (!signal_pending(current) &&
+ (id.data == 0 || --id.data != 0));
+ }
+
+ rtnl_lock();
+ dev_put(dev);
+ busy = false;
+
+ (void) ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE);
+ return rc;
+}
+
+static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_stats stats;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ u64 *data;
+ int ret, n_stats;
+
+ if (!ops->get_ethtool_stats || !ops->get_sset_count)
+ return -EOPNOTSUPP;
+
+ n_stats = ops->get_sset_count(dev, ETH_SS_STATS);
+ if (n_stats < 0)
+ return n_stats;
+ if (n_stats > S32_MAX / sizeof(u64))
+ return -ENOMEM;
+ WARN_ON_ONCE(!n_stats);
+ if (copy_from_user(&stats, useraddr, sizeof(stats)))
+ return -EFAULT;
+
+ stats.n_stats = n_stats;
+
+ if (n_stats) {
+ data = vzalloc(array_size(n_stats, sizeof(u64)));
+ if (!data)
+ return -ENOMEM;
+ ops->get_ethtool_stats(dev, &stats, data);
+ } else {
+ data = NULL;
+ }
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &stats, sizeof(stats)))
+ goto out;
+ useraddr += sizeof(stats);
+ if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64)))
+ goto out;
+ ret = 0;
+
+ out:
+ vfree(data);
+ return ret;
+}
+
+static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct phy_device *phydev = dev->phydev;
+ struct ethtool_stats stats;
+ u64 *data;
+ int ret, n_stats;
+
+ if (!phydev && (!ops->get_ethtool_phy_stats || !ops->get_sset_count))
+ return -EOPNOTSUPP;
+
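+	/* Prefer the generic PHY helpers for statistics when the MAC driver
+	 * does not implement get_ethtool_phy_stats itself.
+	 */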
+ if (dev->phydev && !ops->get_ethtool_phy_stats)
+ n_stats = phy_ethtool_get_sset_count(dev->phydev);
+ else
+ n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
+ if (n_stats < 0)
+ return n_stats;
+ if (n_stats > S32_MAX / sizeof(u64))
+ return -ENOMEM;
+ WARN_ON_ONCE(!n_stats);
+
+ if (copy_from_user(&stats, useraddr, sizeof(stats)))
+ return -EFAULT;
+
+ stats.n_stats = n_stats;
+
+ if (n_stats) {
+ data = vzalloc(array_size(n_stats, sizeof(u64)));
+ if (!data)
+ return -ENOMEM;
+
+ if (dev->phydev && !ops->get_ethtool_phy_stats) {
+ ret = phy_ethtool_get_stats(dev->phydev, &stats, data);
+ if (ret < 0)
+ goto out;
+ } else {
+ ops->get_ethtool_phy_stats(dev, &stats, data);
+ }
+ } else {
+ data = NULL;
+ }
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &stats, sizeof(stats)))
+ goto out;
+ useraddr += sizeof(stats);
+ if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64)))
+ goto out;
+ ret = 0;
+
+ out:
+ vfree(data);
+ return ret;
+}
+
+static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_perm_addr epaddr;
+
+ if (copy_from_user(&epaddr, useraddr, sizeof(epaddr)))
+ return -EFAULT;
+
+ if (epaddr.size < dev->addr_len)
+ return -ETOOSMALL;
+ epaddr.size = dev->addr_len;
+
+ if (copy_to_user(useraddr, &epaddr, sizeof(epaddr)))
+ return -EFAULT;
+ useraddr += sizeof(epaddr);
+ if (copy_to_user(useraddr, dev->perm_addr, epaddr.size))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_get_value(struct net_device *dev, char __user *useraddr,
+ u32 cmd, u32 (*actor)(struct net_device *))
+{
+ struct ethtool_value edata = { .cmd = cmd };
+
+ if (!actor)
+ return -EOPNOTSUPP;
+
+ edata.data = actor(dev);
+
+ if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_set_value_void(struct net_device *dev, char __user *useraddr,
+ void (*actor)(struct net_device *, u32))
+{
+ struct ethtool_value edata;
+
+ if (!actor)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ return -EFAULT;
+
+ actor(dev, edata.data);
+ return 0;
+}
+
+static int ethtool_set_value(struct net_device *dev, char __user *useraddr,
+ int (*actor)(struct net_device *, u32))
+{
+ struct ethtool_value edata;
+
+ if (!actor)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ return -EFAULT;
+
+ return actor(dev, edata.data);
+}
+
+static noinline_for_stack int ethtool_flash_device(struct net_device *dev,
+ char __user *useraddr)
+{
+ struct ethtool_flash efl;
+
+ if (copy_from_user(&efl, useraddr, sizeof(efl)))
+ return -EFAULT;
+
+ if (!dev->ethtool_ops->flash_device)
+ return -EOPNOTSUPP;
+
+ efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0;
+
+ return dev->ethtool_ops->flash_device(dev, &efl);
+}
+
+static int ethtool_set_dump(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_dump dump;
+
+ if (!dev->ethtool_ops->set_dump)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&dump, useraddr, sizeof(dump)))
+ return -EFAULT;
+
+ return dev->ethtool_ops->set_dump(dev, &dump);
+}
+
+static int ethtool_get_dump_flag(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ struct ethtool_dump dump;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (!ops->get_dump_flag)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&dump, useraddr, sizeof(dump)))
+ return -EFAULT;
+
+ ret = ops->get_dump_flag(dev, &dump);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(useraddr, &dump, sizeof(dump)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_get_dump_data(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ __u32 len;
+ struct ethtool_dump dump, tmp;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void *data = NULL;
+
+ if (!ops->get_dump_data || !ops->get_dump_flag)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&dump, useraddr, sizeof(dump)))
+ return -EFAULT;
+
+ memset(&tmp, 0, sizeof(tmp));
+ tmp.cmd = ETHTOOL_GET_DUMP_FLAG;
+ ret = ops->get_dump_flag(dev, &tmp);
+ if (ret)
+ return ret;
+
+ len = min(tmp.len, dump.len);
+ if (!len)
+ return -EFAULT;
+
+ /* Don't ever let the driver think there's more space available
+ * than it requested with .get_dump_flag().
+ */
+ dump.len = len;
+
+ /* Always allocate enough space to hold the whole thing so that the
+ * driver does not need to check the length and bother with partial
+ * dumping.
+ */
+ data = vzalloc(tmp.len);
+ if (!data)
+ return -ENOMEM;
+ ret = ops->get_dump_data(dev, &dump, data);
+ if (ret)
+ goto out;
+
+ /* There are two sane possibilities:
+ * 1. The driver's .get_dump_data() does not touch dump.len.
+ * 2. Or it may set dump.len to how much it really writes, which
+ * should be tmp.len (or len if it can do a partial dump).
+ * In any case respond to userspace with the actual length of data
+ * it's receiving.
+ */
+ WARN_ON(dump.len != len && dump.len != tmp.len);
+ dump.len = len;
+
+ if (copy_to_user(useraddr, &dump, sizeof(dump))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ useraddr += offsetof(struct ethtool_dump, data);
+ if (copy_to_user(useraddr, data, len))
+ ret = -EFAULT;
+out:
+ vfree(data);
+ return ret;
+}
+
+static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr)
+{
+ int err = 0;
+ struct ethtool_ts_info info;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct phy_device *phydev = dev->phydev;
+
+ memset(&info, 0, sizeof(info));
+ info.cmd = ETHTOOL_GET_TS_INFO;
+
+ if (phydev && phydev->drv && phydev->drv->ts_info) {
+ err = phydev->drv->ts_info(phydev, &info);
+ } else if (ops->get_ts_info) {
+ err = ops->get_ts_info(dev, &info);
+ } else {
+ info.so_timestamping =
+ SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+ info.phc_index = -1;
+ }
+
+ if (err)
+ return err;
+
+ if (copy_to_user(useraddr, &info, sizeof(info)))
+ err = -EFAULT;
+
+ return err;
+}
+
+static int __ethtool_get_module_info(struct net_device *dev,
+ struct ethtool_modinfo *modinfo)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct phy_device *phydev = dev->phydev;
+
+ if (dev->sfp_bus)
+ return sfp_get_module_info(dev->sfp_bus, modinfo);
+
+ if (phydev && phydev->drv && phydev->drv->module_info)
+ return phydev->drv->module_info(phydev, modinfo);
+
+ if (ops->get_module_info)
+ return ops->get_module_info(dev, modinfo);
+
+ return -EOPNOTSUPP;
+}
+
+static int ethtool_get_module_info(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ struct ethtool_modinfo modinfo;
+
+ if (copy_from_user(&modinfo, useraddr, sizeof(modinfo)))
+ return -EFAULT;
+
+ ret = __ethtool_get_module_info(dev, &modinfo);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(useraddr, &modinfo, sizeof(modinfo)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int __ethtool_get_module_eeprom(struct net_device *dev,
+ struct ethtool_eeprom *ee, u8 *data)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct phy_device *phydev = dev->phydev;
+
+ if (dev->sfp_bus)
+ return sfp_get_module_eeprom(dev->sfp_bus, ee, data);
+
+ if (phydev && phydev->drv && phydev->drv->module_eeprom)
+ return phydev->drv->module_eeprom(phydev, ee, data);
+
+ if (ops->get_module_eeprom)
+ return ops->get_module_eeprom(dev, ee, data);
+
+ return -EOPNOTSUPP;
+}
+
+static int ethtool_get_module_eeprom(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ struct ethtool_modinfo modinfo;
+
+ ret = __ethtool_get_module_info(dev, &modinfo);
+ if (ret)
+ return ret;
+
+ return ethtool_get_any_eeprom(dev, useraddr,
+ __ethtool_get_module_eeprom,
+ modinfo.eeprom_len);
+}
+
+static int ethtool_tunable_valid(const struct ethtool_tunable *tuna)
+{
+ switch (tuna->id) {
+ case ETHTOOL_RX_COPYBREAK:
+ case ETHTOOL_TX_COPYBREAK:
+ if (tuna->len != sizeof(u32) ||
+ tuna->type_id != ETHTOOL_TUNABLE_U32)
+ return -EINVAL;
+ break;
+ case ETHTOOL_PFC_PREVENTION_TOUT:
+ if (tuna->len != sizeof(u16) ||
+ tuna->type_id != ETHTOOL_TUNABLE_U16)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr)
+{
+ int ret;
+ struct ethtool_tunable tuna;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void *data;
+
+ if (!ops->get_tunable)
+ return -EOPNOTSUPP;
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ data = kzalloc(tuna.len, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+ ret = ops->get_tunable(dev, &tuna, data);
+ if (ret)
+ goto out;
+ useraddr += sizeof(tuna);
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, data, tuna.len))
+ goto out;
+ ret = 0;
+
+out:
+ kfree(data);
+ return ret;
+}
+
+static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr)
+{
+ int ret;
+ struct ethtool_tunable tuna;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void *data;
+
+ if (!ops->set_tunable)
+ return -EOPNOTSUPP;
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ useraddr += sizeof(tuna);
+ data = memdup_user(useraddr, tuna.len);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+ ret = ops->set_tunable(dev, &tuna, data);
+
+ kfree(data);
+ return ret;
+}
+
+static noinline_for_stack int
+ethtool_get_per_queue_coalesce(struct net_device *dev,
+ void __user *useraddr,
+ struct ethtool_per_queue_op *per_queue_opt)
+{
+ u32 bit;
+ int ret;
+ DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE);
+
+ if (!dev->ethtool_ops->get_per_queue_coalesce)
+ return -EOPNOTSUPP;
+
+ useraddr += sizeof(*per_queue_opt);
+
+ bitmap_from_arr32(queue_mask, per_queue_opt->queue_mask,
+ MAX_NUM_QUEUE);
+
+ for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) {
+ struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE };
+
+ ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, &coalesce);
+ if (ret != 0)
+ return ret;
+ if (copy_to_user(useraddr, &coalesce, sizeof(coalesce)))
+ return -EFAULT;
+ useraddr += sizeof(coalesce);
+ }
+
+ return 0;
+}
+
+static noinline_for_stack int
+ethtool_set_per_queue_coalesce(struct net_device *dev,
+ void __user *useraddr,
+ struct ethtool_per_queue_op *per_queue_opt)
+{
+ u32 bit;
+ int i, ret = 0;
+ int n_queue;
+ struct ethtool_coalesce *backup = NULL, *tmp = NULL;
+ DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE);
+
+ if ((!dev->ethtool_ops->set_per_queue_coalesce) ||
+ (!dev->ethtool_ops->get_per_queue_coalesce))
+ return -EOPNOTSUPP;
+
+ useraddr += sizeof(*per_queue_opt);
+
+ bitmap_from_arr32(queue_mask, per_queue_opt->queue_mask, MAX_NUM_QUEUE);
+ n_queue = bitmap_weight(queue_mask, MAX_NUM_QUEUE);
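+	/* Save the current settings of every affected queue first so that a
+	 * failure part-way through can roll back the queues already changed.
+	 */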
+ tmp = backup = kmalloc_array(n_queue, sizeof(*backup), GFP_KERNEL);
+ if (!backup)
+ return -ENOMEM;
+
+ for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) {
+ struct ethtool_coalesce coalesce;
+
+ ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, tmp);
+ if (ret != 0)
+ goto roll_back;
+
+ tmp++;
+
+ if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) {
+ ret = -EFAULT;
+ goto roll_back;
+ }
+
+ ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit, &coalesce);
+ if (ret != 0)
+ goto roll_back;
+
+ useraddr += sizeof(coalesce);
+ }
+
+roll_back:
+ if (ret != 0) {
+ tmp = backup;
+ for_each_set_bit(i, queue_mask, bit) {
+ dev->ethtool_ops->set_per_queue_coalesce(dev, i, tmp);
+ tmp++;
+ }
+ }
+ kfree(backup);
+
+ return ret;
+}
+
+static int noinline_for_stack ethtool_set_per_queue(struct net_device *dev,
+ void __user *useraddr, u32 sub_cmd)
+{
+ struct ethtool_per_queue_op per_queue_opt;
+
+ if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt)))
+ return -EFAULT;
+
+ if (per_queue_opt.sub_command != sub_cmd)
+ return -EINVAL;
+
+ switch (per_queue_opt.sub_command) {
+ case ETHTOOL_GCOALESCE:
+ return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt);
+ case ETHTOOL_SCOALESCE:
+ return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt);
+ default:
+ return -EOPNOTSUPP;
+	}
+}
+
+static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna)
+{
+ switch (tuna->id) {
+ case ETHTOOL_PHY_DOWNSHIFT:
+ if (tuna->len != sizeof(u8) ||
+ tuna->type_id != ETHTOOL_TUNABLE_U8)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int get_phy_tunable(struct net_device *dev, void __user *useraddr)
+{
+ int ret;
+ struct ethtool_tunable tuna;
+ struct phy_device *phydev = dev->phydev;
+ void *data;
+
+ if (!(phydev && phydev->drv && phydev->drv->get_tunable))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_phy_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ data = kzalloc(tuna.len, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+ mutex_lock(&phydev->lock);
+ ret = phydev->drv->get_tunable(phydev, &tuna, data);
+ mutex_unlock(&phydev->lock);
+ if (ret)
+ goto out;
+ useraddr += sizeof(tuna);
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, data, tuna.len))
+ goto out;
+ ret = 0;
+
+out:
+ kfree(data);
+ return ret;
+}
+
+static int set_phy_tunable(struct net_device *dev, void __user *useraddr)
+{
+ int ret;
+ struct ethtool_tunable tuna;
+ struct phy_device *phydev = dev->phydev;
+ void *data;
+
+ if (!(phydev && phydev->drv && phydev->drv->set_tunable))
+ return -EOPNOTSUPP;
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_phy_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ useraddr += sizeof(tuna);
+ data = memdup_user(useraddr, tuna.len);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+ mutex_lock(&phydev->lock);
+ ret = phydev->drv->set_tunable(phydev, &tuna, data);
+ mutex_unlock(&phydev->lock);
+
+ kfree(data);
+ return ret;
+}
+
+static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_fecparam fecparam = { ETHTOOL_GFECPARAM };
+ int rc;
+
+ if (!dev->ethtool_ops->get_fecparam)
+ return -EOPNOTSUPP;
+
+ rc = dev->ethtool_ops->get_fecparam(dev, &fecparam);
+ if (rc)
+ return rc;
+
+ if (copy_to_user(useraddr, &fecparam, sizeof(fecparam)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_fecparam fecparam;
+
+ if (!dev->ethtool_ops->set_fecparam)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&fecparam, useraddr, sizeof(fecparam)))
+ return -EFAULT;
+
+ return dev->ethtool_ops->set_fecparam(dev, &fecparam);
+}
+
+/* The main entry point in this file. Called from net/core/dev_ioctl.c */
+
+int dev_ethtool(struct net *net, struct ifreq *ifr)
+{
+ struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
+ void __user *useraddr = ifr->ifr_data;
+ u32 ethcmd, sub_cmd;
+ int rc;
+ netdev_features_t old_features;
+
+ if (!dev || !netif_device_present(dev))
+ return -ENODEV;
+
+ if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
+ return -EFAULT;
+
+ if (ethcmd == ETHTOOL_PERQUEUE) {
+ if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd)))
+ return -EFAULT;
+ } else {
+ sub_cmd = ethcmd;
+ }
+ /* Allow some commands to be done by anyone */
+ switch (sub_cmd) {
+ case ETHTOOL_GSET:
+ case ETHTOOL_GDRVINFO:
+ case ETHTOOL_GMSGLVL:
+ case ETHTOOL_GLINK:
+ case ETHTOOL_GCOALESCE:
+ case ETHTOOL_GRINGPARAM:
+ case ETHTOOL_GPAUSEPARAM:
+ case ETHTOOL_GRXCSUM:
+ case ETHTOOL_GTXCSUM:
+ case ETHTOOL_GSG:
+ case ETHTOOL_GSSET_INFO:
+ case ETHTOOL_GSTRINGS:
+ case ETHTOOL_GSTATS:
+ case ETHTOOL_GPHYSTATS:
+ case ETHTOOL_GTSO:
+ case ETHTOOL_GPERMADDR:
+ case ETHTOOL_GUFO:
+ case ETHTOOL_GGSO:
+ case ETHTOOL_GGRO:
+ case ETHTOOL_GFLAGS:
+ case ETHTOOL_GPFLAGS:
+ case ETHTOOL_GRXFH:
+ case ETHTOOL_GRXRINGS:
+ case ETHTOOL_GRXCLSRLCNT:
+ case ETHTOOL_GRXCLSRULE:
+ case ETHTOOL_GRXCLSRLALL:
+ case ETHTOOL_GRXFHINDIR:
+ case ETHTOOL_GRSSH:
+ case ETHTOOL_GFEATURES:
+ case ETHTOOL_GCHANNELS:
+ case ETHTOOL_GET_TS_INFO:
+ case ETHTOOL_GEEE:
+ case ETHTOOL_GTUNABLE:
+ case ETHTOOL_PHY_GTUNABLE:
+ case ETHTOOL_GLINKSETTINGS:
+ case ETHTOOL_GFECPARAM:
+ break;
+ default:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ }
+
+ if (dev->ethtool_ops->begin) {
+ rc = dev->ethtool_ops->begin(dev);
+ if (rc < 0)
+ return rc;
+ }
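+	/* Snapshot the feature flags so that a change made by the command
+	 * below triggers a features-change notification at the end.
+	 */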
+ old_features = dev->features;
+
+ switch (ethcmd) {
+ case ETHTOOL_GSET:
+ rc = ethtool_get_settings(dev, useraddr);
+ break;
+ case ETHTOOL_SSET:
+ rc = ethtool_set_settings(dev, useraddr);
+ break;
+ case ETHTOOL_GDRVINFO:
+ rc = ethtool_get_drvinfo(dev, useraddr);
+ break;
+ case ETHTOOL_GREGS:
+ rc = ethtool_get_regs(dev, useraddr);
+ break;
+ case ETHTOOL_GWOL:
+ rc = ethtool_get_wol(dev, useraddr);
+ break;
+ case ETHTOOL_SWOL:
+ rc = ethtool_set_wol(dev, useraddr);
+ break;
+ case ETHTOOL_GMSGLVL:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ dev->ethtool_ops->get_msglevel);
+ break;
+ case ETHTOOL_SMSGLVL:
+ rc = ethtool_set_value_void(dev, useraddr,
+ dev->ethtool_ops->set_msglevel);
+ break;
+ case ETHTOOL_GEEE:
+ rc = ethtool_get_eee(dev, useraddr);
+ break;
+ case ETHTOOL_SEEE:
+ rc = ethtool_set_eee(dev, useraddr);
+ break;
+ case ETHTOOL_NWAY_RST:
+ rc = ethtool_nway_reset(dev);
+ break;
+ case ETHTOOL_GLINK:
+ rc = ethtool_get_link(dev, useraddr);
+ break;
+ case ETHTOOL_GEEPROM:
+ rc = ethtool_get_eeprom(dev, useraddr);
+ break;
+ case ETHTOOL_SEEPROM:
+ rc = ethtool_set_eeprom(dev, useraddr);
+ break;
+ case ETHTOOL_GCOALESCE:
+ rc = ethtool_get_coalesce(dev, useraddr);
+ break;
+ case ETHTOOL_SCOALESCE:
+ rc = ethtool_set_coalesce(dev, useraddr);
+ break;
+ case ETHTOOL_GRINGPARAM:
+ rc = ethtool_get_ringparam(dev, useraddr);
+ break;
+ case ETHTOOL_SRINGPARAM:
+ rc = ethtool_set_ringparam(dev, useraddr);
+ break;
+ case ETHTOOL_GPAUSEPARAM:
+ rc = ethtool_get_pauseparam(dev, useraddr);
+ break;
+ case ETHTOOL_SPAUSEPARAM:
+ rc = ethtool_set_pauseparam(dev, useraddr);
+ break;
+ case ETHTOOL_TEST:
+ rc = ethtool_self_test(dev, useraddr);
+ break;
+ case ETHTOOL_GSTRINGS:
+ rc = ethtool_get_strings(dev, useraddr);
+ break;
+ case ETHTOOL_PHYS_ID:
+ rc = ethtool_phys_id(dev, useraddr);
+ break;
+ case ETHTOOL_GSTATS:
+ rc = ethtool_get_stats(dev, useraddr);
+ break;
+ case ETHTOOL_GPERMADDR:
+ rc = ethtool_get_perm_addr(dev, useraddr);
+ break;
+ case ETHTOOL_GFLAGS:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ __ethtool_get_flags);
+ break;
+ case ETHTOOL_SFLAGS:
+ rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags);
+ break;
+ case ETHTOOL_GPFLAGS:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ dev->ethtool_ops->get_priv_flags);
+ break;
+ case ETHTOOL_SPFLAGS:
+ rc = ethtool_set_value(dev, useraddr,
+ dev->ethtool_ops->set_priv_flags);
+ break;
+ case ETHTOOL_GRXFH:
+ case ETHTOOL_GRXRINGS:
+ case ETHTOOL_GRXCLSRLCNT:
+ case ETHTOOL_GRXCLSRULE:
+ case ETHTOOL_GRXCLSRLALL:
+ rc = ethtool_get_rxnfc(dev, ethcmd, useraddr);
+ break;
+ case ETHTOOL_SRXFH:
+ case ETHTOOL_SRXCLSRLDEL:
+ case ETHTOOL_SRXCLSRLINS:
+ rc = ethtool_set_rxnfc(dev, ethcmd, useraddr);
+ break;
+ case ETHTOOL_FLASHDEV:
+ rc = ethtool_flash_device(dev, useraddr);
+ break;
+ case ETHTOOL_RESET:
+ rc = ethtool_reset(dev, useraddr);
+ break;
+ case ETHTOOL_GSSET_INFO:
+ rc = ethtool_get_sset_info(dev, useraddr);
+ break;
+ case ETHTOOL_GRXFHINDIR:
+ rc = ethtool_get_rxfh_indir(dev, useraddr);
+ break;
+ case ETHTOOL_SRXFHINDIR:
+ rc = ethtool_set_rxfh_indir(dev, useraddr);
+ break;
+ case ETHTOOL_GRSSH:
+ rc = ethtool_get_rxfh(dev, useraddr);
+ break;
+ case ETHTOOL_SRSSH:
+ rc = ethtool_set_rxfh(dev, useraddr);
+ break;
+ case ETHTOOL_GFEATURES:
+ rc = ethtool_get_features(dev, useraddr);
+ break;
+ case ETHTOOL_SFEATURES:
+ rc = ethtool_set_features(dev, useraddr);
+ break;
+ case ETHTOOL_GTXCSUM:
+ case ETHTOOL_GRXCSUM:
+ case ETHTOOL_GSG:
+ case ETHTOOL_GTSO:
+ case ETHTOOL_GGSO:
+ case ETHTOOL_GGRO:
+ rc = ethtool_get_one_feature(dev, useraddr, ethcmd);
+ break;
+ case ETHTOOL_STXCSUM:
+ case ETHTOOL_SRXCSUM:
+ case ETHTOOL_SSG:
+ case ETHTOOL_STSO:
+ case ETHTOOL_SGSO:
+ case ETHTOOL_SGRO:
+ rc = ethtool_set_one_feature(dev, useraddr, ethcmd);
+ break;
+ case ETHTOOL_GCHANNELS:
+ rc = ethtool_get_channels(dev, useraddr);
+ break;
+ case ETHTOOL_SCHANNELS:
+ rc = ethtool_set_channels(dev, useraddr);
+ break;
+ case ETHTOOL_SET_DUMP:
+ rc = ethtool_set_dump(dev, useraddr);
+ break;
+ case ETHTOOL_GET_DUMP_FLAG:
+ rc = ethtool_get_dump_flag(dev, useraddr);
+ break;
+ case ETHTOOL_GET_DUMP_DATA:
+ rc = ethtool_get_dump_data(dev, useraddr);
+ break;
+ case ETHTOOL_GET_TS_INFO:
+ rc = ethtool_get_ts_info(dev, useraddr);
+ break;
+ case ETHTOOL_GMODULEINFO:
+ rc = ethtool_get_module_info(dev, useraddr);
+ break;
+ case ETHTOOL_GMODULEEEPROM:
+ rc = ethtool_get_module_eeprom(dev, useraddr);
+ break;
+ case ETHTOOL_GTUNABLE:
+ rc = ethtool_get_tunable(dev, useraddr);
+ break;
+ case ETHTOOL_STUNABLE:
+ rc = ethtool_set_tunable(dev, useraddr);
+ break;
+ case ETHTOOL_GPHYSTATS:
+ rc = ethtool_get_phy_stats(dev, useraddr);
+ break;
+ case ETHTOOL_PERQUEUE:
+ rc = ethtool_set_per_queue(dev, useraddr, sub_cmd);
+ break;
+ case ETHTOOL_GLINKSETTINGS:
+ rc = ethtool_get_link_ksettings(dev, useraddr);
+ break;
+ case ETHTOOL_SLINKSETTINGS:
+ rc = ethtool_set_link_ksettings(dev, useraddr);
+ break;
+ case ETHTOOL_PHY_GTUNABLE:
+ rc = get_phy_tunable(dev, useraddr);
+ break;
+ case ETHTOOL_PHY_STUNABLE:
+ rc = set_phy_tunable(dev, useraddr);
+ break;
+ case ETHTOOL_GFECPARAM:
+ rc = ethtool_get_fecparam(dev, useraddr);
+ break;
+ case ETHTOOL_SFECPARAM:
+ rc = ethtool_set_fecparam(dev, useraddr);
+ break;
+ default:
+ rc = -EOPNOTSUPP;
+ }
+
+ if (dev->ethtool_ops->complete)
+ dev->ethtool_ops->complete(dev);
+
+ if (old_features != dev->features)
+ netdev_features_change(dev);
+
+ return rc;
+}
diff --git a/net/core/failover.c b/net/core/failover.c
new file mode 100644
index 000000000..b5cd3c727
--- /dev/null
+++ b/net/core/failover.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Intel Corporation. */
+
+/* A common module to handle registrations and notifications for paravirtual
+ * drivers to enable accelerated datapath and support VF live migration.
+ *
+ * The notifier and event handling code is based on netvsc driver.
+ */
+
+#include <linux/module.h>
+#include <linux/etherdevice.h>
+#include <uapi/linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_vlan.h>
+#include <net/failover.h>
+
+static LIST_HEAD(failover_list);
+static DEFINE_SPINLOCK(failover_lock);
+
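+/* Look up a registered failover master whose permanent MAC address matches
+ * @mac and return it, with its ops in @ops. Expects RTNL to be held by the
+ * caller (the pointers are rtnl_dereference()d).
+ */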
+static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops)
+{
+ struct net_device *failover_dev;
+ struct failover *failover;
+
+ spin_lock(&failover_lock);
+ list_for_each_entry(failover, &failover_list, list) {
+ failover_dev = rtnl_dereference(failover->failover_dev);
+ if (ether_addr_equal(failover_dev->perm_addr, mac)) {
+ *ops = rtnl_dereference(failover->ops);
+ spin_unlock(&failover_lock);
+ return failover_dev;
+ }
+ }
+ spin_unlock(&failover_lock);
+ return NULL;
+}
+
+/**
+ * failover_slave_register - Register a slave netdev
+ *
+ * @slave_dev: slave netdev that is being registered
+ *
+ * Registers a slave device to a failover instance. Only ethernet devices
+ * are supported.
+ */
+static int failover_slave_register(struct net_device *slave_dev)
+{
+ struct netdev_lag_upper_info lag_upper_info;
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+ int err;
+
+ if (slave_dev->type != ARPHRD_ETHER)
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (fops && fops->slave_pre_register &&
+ fops->slave_pre_register(slave_dev, failover_dev))
+ goto done;
+
+ err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame,
+ failover_dev);
+ if (err) {
+ netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n",
+ err);
+ goto done;
+ }
+
+ lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP;
+ err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL,
+ &lag_upper_info, NULL);
+ if (err) {
+ netdev_err(slave_dev, "can not set failover device %s (err = %d)\n",
+ failover_dev->name, err);
+ goto err_upper_link;
+ }
+
+ slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK);
+
+ if (fops && fops->slave_register &&
+ !fops->slave_register(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+ netdev_upper_dev_unlink(slave_dev, failover_dev);
+ slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK);
+err_upper_link:
+ netdev_rx_handler_unregister(slave_dev);
+done:
+ return NOTIFY_DONE;
+}
+
+/**
+ * failover_slave_unregister - Unregister a slave netdev
+ *
+ * @slave_dev: slave netdev that is being unregistered
+ *
+ * Unregisters a slave device from a failover instance.
+ */
+int failover_slave_unregister(struct net_device *slave_dev)
+{
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+
+ if (!netif_is_failover_slave(slave_dev))
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (fops && fops->slave_pre_unregister &&
+ fops->slave_pre_unregister(slave_dev, failover_dev))
+ goto done;
+
+ netdev_rx_handler_unregister(slave_dev);
+ netdev_upper_dev_unlink(slave_dev, failover_dev);
+ slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK);
+
+ if (fops && fops->slave_unregister &&
+ !fops->slave_unregister(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+done:
+ return NOTIFY_DONE;
+}
+EXPORT_SYMBOL_GPL(failover_slave_unregister);
+
+static int failover_slave_link_change(struct net_device *slave_dev)
+{
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+
+ if (!netif_is_failover_slave(slave_dev))
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (!netif_running(failover_dev))
+ goto done;
+
+ if (fops && fops->slave_link_change &&
+ !fops->slave_link_change(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+done:
+ return NOTIFY_DONE;
+}
+
+static int failover_slave_name_change(struct net_device *slave_dev)
+{
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+
+ if (!netif_is_failover_slave(slave_dev))
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (!netif_running(failover_dev))
+ goto done;
+
+ if (fops && fops->slave_name_change &&
+ !fops->slave_name_change(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+done:
+ return NOTIFY_DONE;
+}
+
+static int
+failover_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+
+ /* Skip parent events */
+ if (netif_is_failover(event_dev))
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ return failover_slave_register(event_dev);
+ case NETDEV_UNREGISTER:
+ return failover_slave_unregister(event_dev);
+ case NETDEV_UP:
+ case NETDEV_DOWN:
+ case NETDEV_CHANGE:
+ return failover_slave_link_change(event_dev);
+ case NETDEV_CHANGENAME:
+ return failover_slave_name_change(event_dev);
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+static struct notifier_block failover_notifier = {
+ .notifier_call = failover_event,
+};
+
+static void
+failover_existing_slave_register(struct net_device *failover_dev)
+{
+ struct net *net = dev_net(failover_dev);
+ struct net_device *dev;
+
+ rtnl_lock();
+ for_each_netdev(net, dev) {
+ if (netif_is_failover(dev))
+ continue;
+ if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr))
+ failover_slave_register(dev);
+ }
+ rtnl_unlock();
+}
+
+/**
+ * failover_register - Register a failover instance
+ *
+ * @dev: failover netdev
+ * @ops: failover ops
+ *
+ * Allocate and register a failover instance for a failover netdev. ops
+ * provides handlers for slave device register/unregister/link change/
+ * name change events.
+ *
+ * Return: pointer to failover instance
+ */
+struct failover *failover_register(struct net_device *dev,
+ struct failover_ops *ops)
+{
+ struct failover *failover;
+
+ if (dev->type != ARPHRD_ETHER)
+ return ERR_PTR(-EINVAL);
+
+ failover = kzalloc(sizeof(*failover), GFP_KERNEL);
+ if (!failover)
+ return ERR_PTR(-ENOMEM);
+
+ rcu_assign_pointer(failover->ops, ops);
+ dev_hold(dev);
+ dev->priv_flags |= IFF_FAILOVER;
+ rcu_assign_pointer(failover->failover_dev, dev);
+
+ spin_lock(&failover_lock);
+ list_add_tail(&failover->list, &failover_list);
+ spin_unlock(&failover_lock);
+
+ netdev_info(dev, "failover master:%s registered\n", dev->name);
+
+ failover_existing_slave_register(dev);
+
+ return failover;
+}
+EXPORT_SYMBOL_GPL(failover_register);
+
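+/* Illustrative usage sketch (not part of this file): a paravirtual driver
+ * such as net_failover registers its upper device roughly as follows, where
+ * my_failover_ops is a hypothetical ops table provided by that driver:
+ *
+ *	struct failover *fo;
+ *
+ *	fo = failover_register(netdev, &my_failover_ops);
+ *	if (IS_ERR(fo))
+ *		return PTR_ERR(fo);
+ *	...
+ *	failover_unregister(fo);
+ */
+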
+/**
+ * failover_unregister - Unregister a failover instance
+ *
+ * @failover: pointer to failover instance
+ *
+ * Unregisters and frees a failover instance.
+ */
+void failover_unregister(struct failover *failover)
+{
+ struct net_device *failover_dev;
+
+ failover_dev = rcu_dereference(failover->failover_dev);
+
+ netdev_info(failover_dev, "failover master:%s unregistered\n",
+ failover_dev->name);
+
+ failover_dev->priv_flags &= ~IFF_FAILOVER;
+ dev_put(failover_dev);
+
+ spin_lock(&failover_lock);
+ list_del(&failover->list);
+ spin_unlock(&failover_lock);
+
+ kfree(failover);
+}
+EXPORT_SYMBOL_GPL(failover_unregister);
+
+static __init int
+failover_init(void)
+{
+ register_netdevice_notifier(&failover_notifier);
+
+ return 0;
+}
+module_init(failover_init);
+
+static __exit
+void failover_exit(void)
+{
+ unregister_netdevice_notifier(&failover_notifier);
+}
+module_exit(failover_exit);
+
+MODULE_DESCRIPTION("Generic failover infrastructure/interface");
+MODULE_LICENSE("GPL v2");
diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c
new file mode 100644
index 000000000..13a40b831
--- /dev/null
+++ b/net/core/fib_notifier.c
@@ -0,0 +1,189 @@
+#include <linux/rtnetlink.h>
+#include <linux/notifier.h>
+#include <linux/rcupdate.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/net_namespace.h>
+#include <net/fib_notifier.h>
+
+static ATOMIC_NOTIFIER_HEAD(fib_chain);
+
+int call_fib_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ int err;
+
+ info->net = net;
+ err = nb->notifier_call(nb, event_type, info);
+ return notifier_to_errno(err);
+}
+EXPORT_SYMBOL(call_fib_notifier);
+
+int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ int err;
+
+ info->net = net;
+ err = atomic_notifier_call_chain(&fib_chain, event_type, info);
+ return notifier_to_errno(err);
+}
+EXPORT_SYMBOL(call_fib_notifiers);
+
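+/* Sum the FIB sequence counters of every registered notifier family in every
+ * network namespace; a change in this sum means the FIB was modified while a
+ * dump was in flight.
+ */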
+static unsigned int fib_seq_sum(void)
+{
+ struct fib_notifier_ops *ops;
+ unsigned int fib_seq = 0;
+ struct net *net;
+
+ rtnl_lock();
+ down_read(&net_rwsem);
+ for_each_net(net) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) {
+ if (!try_module_get(ops->owner))
+ continue;
+ fib_seq += ops->fib_seq_read(net);
+ module_put(ops->owner);
+ }
+ rcu_read_unlock();
+ }
+ up_read(&net_rwsem);
+ rtnl_unlock();
+
+ return fib_seq;
+}
+
+static int fib_net_dump(struct net *net, struct notifier_block *nb)
+{
+ struct fib_notifier_ops *ops;
+
+ list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) {
+ int err;
+
+ if (!try_module_get(ops->owner))
+ continue;
+ err = ops->fib_dump(net, nb);
+ module_put(ops->owner);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static bool fib_dump_is_consistent(struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb),
+ unsigned int fib_seq)
+{
+ atomic_notifier_chain_register(&fib_chain, nb);
+ if (fib_seq == fib_seq_sum())
+ return true;
+ atomic_notifier_chain_unregister(&fib_chain, nb);
+ if (cb)
+ cb(nb);
+ return false;
+}
+
+#define FIB_DUMP_MAX_RETRIES 5
+int register_fib_notifier(struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb))
+{
+ int retries = 0;
+ int err;
+
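+	/* Replay the existing FIB entries to the new notifier, then keep it
+	 * registered only if the sequence counters show no FIB change raced
+	 * with the dump; otherwise retry a bounded number of times.
+	 */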
+ do {
+ unsigned int fib_seq = fib_seq_sum();
+ struct net *net;
+
+ rcu_read_lock();
+ for_each_net_rcu(net) {
+ err = fib_net_dump(net, nb);
+ if (err)
+ goto err_fib_net_dump;
+ }
+ rcu_read_unlock();
+
+ if (fib_dump_is_consistent(nb, cb, fib_seq))
+ return 0;
+ } while (++retries < FIB_DUMP_MAX_RETRIES);
+
+ return -EBUSY;
+
+err_fib_net_dump:
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(register_fib_notifier);
+
+int unregister_fib_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&fib_chain, nb);
+}
+EXPORT_SYMBOL(unregister_fib_notifier);
+
+static int __fib_notifier_ops_register(struct fib_notifier_ops *ops,
+ struct net *net)
+{
+ struct fib_notifier_ops *o;
+
+ list_for_each_entry(o, &net->fib_notifier_ops, list)
+ if (ops->family == o->family)
+ return -EEXIST;
+ list_add_tail_rcu(&ops->list, &net->fib_notifier_ops);
+ return 0;
+}
+
+struct fib_notifier_ops *
+fib_notifier_ops_register(const struct fib_notifier_ops *tmpl, struct net *net)
+{
+ struct fib_notifier_ops *ops;
+ int err;
+
+ ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL);
+ if (!ops)
+ return ERR_PTR(-ENOMEM);
+
+ err = __fib_notifier_ops_register(ops, net);
+ if (err)
+ goto err_register;
+
+ return ops;
+
+err_register:
+ kfree(ops);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(fib_notifier_ops_register);
+
+void fib_notifier_ops_unregister(struct fib_notifier_ops *ops)
+{
+ list_del_rcu(&ops->list);
+ kfree_rcu(ops, rcu);
+}
+EXPORT_SYMBOL(fib_notifier_ops_unregister);
+
+static int __net_init fib_notifier_net_init(struct net *net)
+{
+ INIT_LIST_HEAD(&net->fib_notifier_ops);
+ return 0;
+}
+
+static void __net_exit fib_notifier_net_exit(struct net *net)
+{
+ WARN_ON_ONCE(!list_empty(&net->fib_notifier_ops));
+}
+
+static struct pernet_operations fib_notifier_net_ops = {
+ .init = fib_notifier_net_init,
+ .exit = fib_notifier_net_exit,
+};
+
+static int __init fib_notifier_init(void)
+{
+ return register_pernet_subsys(&fib_notifier_net_ops);
+}
+
+subsys_initcall(fib_notifier_init);
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
new file mode 100644
index 000000000..46a13ed15
--- /dev/null
+++ b/net/core/fib_rules.c
@@ -0,0 +1,1235 @@
+/*
+ * net/core/fib_rules.c Generic Routing Rules
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/fib_rules.h>
+#include <net/ip_tunnels.h>
+
+static const struct fib_kuid_range fib_kuid_range_unset = {
+ KUIDT_INIT(0),
+ KUIDT_INIT(~0),
+};
+
+bool fib_rule_matchall(const struct fib_rule *rule)
+{
+ if (rule->iifindex || rule->oifindex || rule->mark || rule->tun_id ||
+ rule->flags)
+ return false;
+ if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1)
+ return false;
+ if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) ||
+ !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end))
+ return false;
+ if (fib_rule_port_range_set(&rule->sport_range))
+ return false;
+ if (fib_rule_port_range_set(&rule->dport_range))
+ return false;
+ return true;
+}
+EXPORT_SYMBOL_GPL(fib_rule_matchall);
+
+int fib_default_rule_add(struct fib_rules_ops *ops,
+ u32 pref, u32 table, u32 flags)
+{
+ struct fib_rule *r;
+
+ r = kzalloc(ops->rule_size, GFP_KERNEL);
+ if (r == NULL)
+ return -ENOMEM;
+
+ refcount_set(&r->refcnt, 1);
+ r->action = FR_ACT_TO_TBL;
+ r->pref = pref;
+ r->table = table;
+ r->flags = flags;
+ r->proto = RTPROT_KERNEL;
+ r->fr_net = ops->fro_net;
+ r->uid_range = fib_kuid_range_unset;
+
+ r->suppress_prefixlen = -1;
+ r->suppress_ifgroup = -1;
+
+	/* The lock is not required here, the list is unreachable
+	 * at the moment this function is called */
+ list_add_tail(&r->list, &ops->rules_list);
+ return 0;
+}
+EXPORT_SYMBOL(fib_default_rule_add);
+
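+/* Choose a default preference one below that of the second rule in the list
+ * (i.e. just ahead of it), or 0 when no such rule exists or its preference
+ * is already 0.
+ */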
+static u32 fib_default_rule_pref(struct fib_rules_ops *ops)
+{
+ struct list_head *pos;
+ struct fib_rule *rule;
+
+ if (!list_empty(&ops->rules_list)) {
+ pos = ops->rules_list.next;
+ if (pos->next != &ops->rules_list) {
+ rule = list_entry(pos->next, struct fib_rule, list);
+ if (rule->pref)
+ return rule->pref - 1;
+ }
+ }
+
+ return 0;
+}
+
+static void notify_rule_change(int event, struct fib_rule *rule,
+ struct fib_rules_ops *ops, struct nlmsghdr *nlh,
+ u32 pid);
+
+static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family)
+{
+ struct fib_rules_ops *ops;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ops, &net->rules_ops, list) {
+ if (ops->family == family) {
+ if (!try_module_get(ops->owner))
+ ops = NULL;
+ rcu_read_unlock();
+ return ops;
+ }
+ }
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+static void rules_ops_put(struct fib_rules_ops *ops)
+{
+ if (ops)
+ module_put(ops->owner);
+}
+
+static void flush_route_cache(struct fib_rules_ops *ops)
+{
+ if (ops->flush_cache)
+ ops->flush_cache(ops);
+}
+
+static int __fib_rules_register(struct fib_rules_ops *ops)
+{
+ int err = -EEXIST;
+ struct fib_rules_ops *o;
+ struct net *net;
+
+ net = ops->fro_net;
+
+ if (ops->rule_size < sizeof(struct fib_rule))
+ return -EINVAL;
+
+ if (ops->match == NULL || ops->configure == NULL ||
+ ops->compare == NULL || ops->fill == NULL ||
+ ops->action == NULL)
+ return -EINVAL;
+
+ spin_lock(&net->rules_mod_lock);
+ list_for_each_entry(o, &net->rules_ops, list)
+ if (ops->family == o->family)
+ goto errout;
+
+ list_add_tail_rcu(&ops->list, &net->rules_ops);
+ err = 0;
+errout:
+ spin_unlock(&net->rules_mod_lock);
+
+ return err;
+}
+
+struct fib_rules_ops *
+fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net)
+{
+ struct fib_rules_ops *ops;
+ int err;
+
+ ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL);
+ if (ops == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&ops->rules_list);
+ ops->fro_net = net;
+
+ err = __fib_rules_register(ops);
+ if (err) {
+ kfree(ops);
+ ops = ERR_PTR(err);
+ }
+
+ return ops;
+}
+EXPORT_SYMBOL_GPL(fib_rules_register);
+
+static void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
+{
+ struct fib_rule *rule, *tmp;
+
+ list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) {
+ list_del_rcu(&rule->list);
+ if (ops->delete)
+ ops->delete(rule);
+ fib_rule_put(rule);
+ }
+}
+
+void fib_rules_unregister(struct fib_rules_ops *ops)
+{
+ struct net *net = ops->fro_net;
+
+ spin_lock(&net->rules_mod_lock);
+ list_del_rcu(&ops->list);
+ spin_unlock(&net->rules_mod_lock);
+
+ fib_rules_cleanup_ops(ops);
+ kfree_rcu(ops, rcu);
+}
+EXPORT_SYMBOL_GPL(fib_rules_unregister);
+
+static int uid_range_set(struct fib_kuid_range *range)
+{
+ return uid_valid(range->start) && uid_valid(range->end);
+}
+
+static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb)
+{
+ struct fib_rule_uid_range *in;
+ struct fib_kuid_range out;
+
+ in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]);
+
+ out.start = make_kuid(current_user_ns(), in->start);
+ out.end = make_kuid(current_user_ns(), in->end);
+
+ return out;
+}
+
+static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range)
+{
+ struct fib_rule_uid_range out = {
+ from_kuid_munged(current_user_ns(), range->start),
+ from_kuid_munged(current_user_ns(), range->end)
+ };
+
+ return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out);
+}
+
+static int nla_get_port_range(struct nlattr *pattr,
+ struct fib_rule_port_range *port_range)
+{
+ const struct fib_rule_port_range *pr = nla_data(pattr);
+
+ if (!fib_rule_port_range_valid(pr))
+ return -EINVAL;
+
+ port_range->start = pr->start;
+ port_range->end = pr->end;
+
+ return 0;
+}
+
+static int nla_put_port_range(struct sk_buff *skb, int attrtype,
+ struct fib_rule_port_range *range)
+{
+ return nla_put(skb, attrtype, sizeof(*range), range);
+}
+
+static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
+ struct flowi *fl, int flags,
+ struct fib_lookup_arg *arg)
+{
+ int ret = 0;
+
+ if (rule->iifindex && (rule->iifindex != fl->flowi_iif))
+ goto out;
+
+ if (rule->oifindex && (rule->oifindex != fl->flowi_oif))
+ goto out;
+
+ if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)
+ goto out;
+
+ if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id))
+ goto out;
+
+ if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg))
+ goto out;
+
+ if (uid_lt(fl->flowi_uid, rule->uid_range.start) ||
+ uid_gt(fl->flowi_uid, rule->uid_range.end))
+ goto out;
+
+ ret = ops->match(rule, fl, flags);
+out:
+ return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
+}
+
+int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl,
+ int flags, struct fib_lookup_arg *arg)
+{
+ struct fib_rule *rule;
+ int err;
+
+ rcu_read_lock();
+
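+	/* Walk the rules in preference order; a matching FR_ACT_GOTO rule
+	 * jumps straight to its resolved target and evaluation resumes there
+	 * via the 'jumped' label.
+	 */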
+ list_for_each_entry_rcu(rule, &ops->rules_list, list) {
+jumped:
+ if (!fib_rule_match(rule, ops, fl, flags, arg))
+ continue;
+
+ if (rule->action == FR_ACT_GOTO) {
+ struct fib_rule *target;
+
+ target = rcu_dereference(rule->ctarget);
+ if (target == NULL) {
+ continue;
+ } else {
+ rule = target;
+ goto jumped;
+ }
+ } else if (rule->action == FR_ACT_NOP)
+ continue;
+ else
+ err = ops->action(rule, fl, flags, arg);
+
+ if (!err && ops->suppress && ops->suppress(rule, arg))
+ continue;
+
+ if (err != -EAGAIN) {
+ if ((arg->flags & FIB_LOOKUP_NOREF) ||
+ likely(refcount_inc_not_zero(&rule->refcnt))) {
+ arg->rule = rule;
+ goto out;
+ }
+ break;
+ }
+ }
+
+ err = -ESRCH;
+out:
+ rcu_read_unlock();
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(fib_rules_lookup);
+
+static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type,
+ struct fib_rule *rule, int family)
+{
+ struct fib_rule_notifier_info info = {
+ .info.family = family,
+ .rule = rule,
+ };
+
+ return call_fib_notifier(nb, net, event_type, &info.info);
+}
+
+static int call_fib_rule_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct fib_rule *rule,
+ struct fib_rules_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_rule_notifier_info info = {
+ .info.family = ops->family,
+ .info.extack = extack,
+ .rule = rule,
+ };
+
+ ops->fib_rules_seq++;
+ return call_fib_notifiers(net, event_type, &info.info);
+}
+
+/* Called with rcu_read_lock() */
+int fib_rules_dump(struct net *net, struct notifier_block *nb, int family)
+{
+ struct fib_rules_ops *ops;
+ struct fib_rule *rule;
+
+ ops = lookup_rules_ops(net, family);
+ if (!ops)
+ return -EAFNOSUPPORT;
+ list_for_each_entry_rcu(rule, &ops->rules_list, list)
+ call_fib_rule_notifier(nb, net, FIB_EVENT_RULE_ADD, rule,
+ family);
+ rules_ops_put(ops);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fib_rules_dump);
+
+unsigned int fib_rules_seq_read(struct net *net, int family)
+{
+ unsigned int fib_rules_seq;
+ struct fib_rules_ops *ops;
+
+ ASSERT_RTNL();
+
+ ops = lookup_rules_ops(net, family);
+ if (!ops)
+ return 0;
+ fib_rules_seq = ops->fib_rules_seq;
+ rules_ops_put(ops);
+
+ return fib_rules_seq;
+}
+EXPORT_SYMBOL_GPL(fib_rules_seq_read);
+
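+/* Find an existing rule matching only the attributes actually specified in
+ * the request; unset attributes act as wildcards, unlike rule_exists() which
+ * requires an exact match on every field.
+ */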
+static struct fib_rule *rule_find(struct fib_rules_ops *ops,
+ struct fib_rule_hdr *frh,
+ struct nlattr **tb,
+ struct fib_rule *rule,
+ bool user_priority)
+{
+ struct fib_rule *r;
+
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (rule->action && r->action != rule->action)
+ continue;
+
+ if (rule->table && r->table != rule->table)
+ continue;
+
+ if (user_priority && r->pref != rule->pref)
+ continue;
+
+ if (rule->iifname[0] &&
+ memcmp(r->iifname, rule->iifname, IFNAMSIZ))
+ continue;
+
+ if (rule->oifname[0] &&
+ memcmp(r->oifname, rule->oifname, IFNAMSIZ))
+ continue;
+
+ if (rule->mark && r->mark != rule->mark)
+ continue;
+
+ if (rule->suppress_ifgroup != -1 &&
+ r->suppress_ifgroup != rule->suppress_ifgroup)
+ continue;
+
+ if (rule->suppress_prefixlen != -1 &&
+ r->suppress_prefixlen != rule->suppress_prefixlen)
+ continue;
+
+ if (rule->mark_mask && r->mark_mask != rule->mark_mask)
+ continue;
+
+ if (rule->tun_id && r->tun_id != rule->tun_id)
+ continue;
+
+ if (r->fr_net != rule->fr_net)
+ continue;
+
+ if (rule->l3mdev && r->l3mdev != rule->l3mdev)
+ continue;
+
+ if (uid_range_set(&rule->uid_range) &&
+ (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
+ !uid_eq(r->uid_range.end, rule->uid_range.end)))
+ continue;
+
+ if (rule->ip_proto && r->ip_proto != rule->ip_proto)
+ continue;
+
+ if (rule->proto && r->proto != rule->proto)
+ continue;
+
+ if (fib_rule_port_range_set(&rule->sport_range) &&
+ !fib_rule_port_range_compare(&r->sport_range,
+ &rule->sport_range))
+ continue;
+
+ if (fib_rule_port_range_set(&rule->dport_range) &&
+ !fib_rule_port_range_compare(&r->dport_range,
+ &rule->dport_range))
+ continue;
+
+ if (!ops->compare(r, frh, tb))
+ continue;
+ return r;
+ }
+
+ return NULL;
+}
+
+#ifdef CONFIG_NET_L3_MASTER_DEV
+static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
+ struct netlink_ext_ack *extack)
+{
+ nlrule->l3mdev = nla_get_u8(nla);
+ if (nlrule->l3mdev != 1) {
+ NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute");
+ return -1;
+ }
+
+ return 0;
+}
+#else
+static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
+ struct netlink_ext_ack *extack)
+{
+ NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel");
+ return -1;
+}
+#endif
+
+static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ struct fib_rules_ops *ops,
+ struct nlattr *tb[],
+ struct fib_rule **rule,
+ bool *user_priority)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fib_rule_hdr *frh = nlmsg_data(nlh);
+ struct fib_rule *nlrule = NULL;
+ int err = -EINVAL;
+
+ if (frh->src_len)
+ if (!tb[FRA_SRC] ||
+ frh->src_len > (ops->addr_size * 8) ||
+ nla_len(tb[FRA_SRC]) != ops->addr_size) {
+ NL_SET_ERR_MSG(extack, "Invalid source address");
+ goto errout;
+ }
+
+ if (frh->dst_len)
+ if (!tb[FRA_DST] ||
+ frh->dst_len > (ops->addr_size * 8) ||
+ nla_len(tb[FRA_DST]) != ops->addr_size) {
+ NL_SET_ERR_MSG(extack, "Invalid dst address");
+ goto errout;
+ }
+
+ nlrule = kzalloc(ops->rule_size, GFP_KERNEL);
+ if (!nlrule) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ refcount_set(&nlrule->refcnt, 1);
+ nlrule->fr_net = net;
+
+ if (tb[FRA_PRIORITY]) {
+ nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]);
+ *user_priority = true;
+ } else {
+ nlrule->pref = fib_default_rule_pref(ops);
+ }
+
+ nlrule->proto = tb[FRA_PROTOCOL] ?
+ nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC;
+
+ if (tb[FRA_IIFNAME]) {
+ struct net_device *dev;
+
+ nlrule->iifindex = -1;
+ nla_strlcpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
+ dev = __dev_get_by_name(net, nlrule->iifname);
+ if (dev)
+ nlrule->iifindex = dev->ifindex;
+ }
+
+ if (tb[FRA_OIFNAME]) {
+ struct net_device *dev;
+
+ nlrule->oifindex = -1;
+ nla_strlcpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
+ dev = __dev_get_by_name(net, nlrule->oifname);
+ if (dev)
+ nlrule->oifindex = dev->ifindex;
+ }
+
+ if (tb[FRA_FWMARK]) {
+ nlrule->mark = nla_get_u32(tb[FRA_FWMARK]);
+ if (nlrule->mark)
+ /* compatibility: if the mark value is non-zero all bits
+ * are compared unless a mask is explicitly specified.
+ */
+ nlrule->mark_mask = 0xFFFFFFFF;
+ }
+
+ if (tb[FRA_FWMASK])
+ nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
+
+ if (tb[FRA_TUN_ID])
+ nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
+
+ err = -EINVAL;
+ if (tb[FRA_L3MDEV] &&
+ fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0)
+ goto errout_free;
+
+ nlrule->action = frh->action;
+ nlrule->flags = frh->flags;
+ nlrule->table = frh_get_table(frh, tb);
+ if (tb[FRA_SUPPRESS_PREFIXLEN])
+ nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
+ else
+ nlrule->suppress_prefixlen = -1;
+
+ if (tb[FRA_SUPPRESS_IFGROUP])
+ nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
+ else
+ nlrule->suppress_ifgroup = -1;
+
+ if (tb[FRA_GOTO]) {
+ if (nlrule->action != FR_ACT_GOTO) {
+ NL_SET_ERR_MSG(extack, "Unexpected goto");
+ goto errout_free;
+ }
+
+ nlrule->target = nla_get_u32(tb[FRA_GOTO]);
+ /* Backward jumps are prohibited to avoid endless loops */
+ if (nlrule->target <= nlrule->pref) {
+ NL_SET_ERR_MSG(extack, "Backward goto not supported");
+ goto errout_free;
+ }
+ } else if (nlrule->action == FR_ACT_GOTO) {
+ NL_SET_ERR_MSG(extack, "Missing goto target for action goto");
+ goto errout_free;
+ }
+
+ if (nlrule->l3mdev && nlrule->table) {
+ NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive");
+ goto errout_free;
+ }
+
+ if (tb[FRA_UID_RANGE]) {
+ if (current_user_ns() != net->user_ns) {
+ err = -EPERM;
+ NL_SET_ERR_MSG(extack, "No permission to set uid");
+ goto errout_free;
+ }
+
+ nlrule->uid_range = nla_get_kuid_range(tb);
+
+ if (!uid_range_set(&nlrule->uid_range) ||
+ !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) {
+ NL_SET_ERR_MSG(extack, "Invalid uid range");
+ goto errout_free;
+ }
+ } else {
+ nlrule->uid_range = fib_kuid_range_unset;
+ }
+
+ if (tb[FRA_IP_PROTO])
+ nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]);
+
+ if (tb[FRA_SPORT_RANGE]) {
+ err = nla_get_port_range(tb[FRA_SPORT_RANGE],
+ &nlrule->sport_range);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Invalid sport range");
+ goto errout_free;
+ }
+ }
+
+ if (tb[FRA_DPORT_RANGE]) {
+ err = nla_get_port_range(tb[FRA_DPORT_RANGE],
+ &nlrule->dport_range);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Invalid dport range");
+ goto errout_free;
+ }
+ }
+
+ *rule = nlrule;
+
+ return 0;
+
+errout_free:
+ kfree(nlrule);
+errout:
+ return err;
+}
+
+static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
+ struct nlattr **tb, struct fib_rule *rule)
+{
+ struct fib_rule *r;
+
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->action != rule->action)
+ continue;
+
+ if (r->table != rule->table)
+ continue;
+
+ if (r->pref != rule->pref)
+ continue;
+
+ if (memcmp(r->iifname, rule->iifname, IFNAMSIZ))
+ continue;
+
+ if (memcmp(r->oifname, rule->oifname, IFNAMSIZ))
+ continue;
+
+ if (r->mark != rule->mark)
+ continue;
+
+ if (r->suppress_ifgroup != rule->suppress_ifgroup)
+ continue;
+
+ if (r->suppress_prefixlen != rule->suppress_prefixlen)
+ continue;
+
+ if (r->mark_mask != rule->mark_mask)
+ continue;
+
+ if (r->tun_id != rule->tun_id)
+ continue;
+
+ if (r->fr_net != rule->fr_net)
+ continue;
+
+ if (r->l3mdev != rule->l3mdev)
+ continue;
+
+ if (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
+ !uid_eq(r->uid_range.end, rule->uid_range.end))
+ continue;
+
+ if (r->ip_proto != rule->ip_proto)
+ continue;
+
+ if (r->proto != rule->proto)
+ continue;
+
+ if (!fib_rule_port_range_compare(&r->sport_range,
+ &rule->sport_range))
+ continue;
+
+ if (!fib_rule_port_range_compare(&r->dport_range,
+ &rule->dport_range))
+ continue;
+
+ if (!ops->compare(r, frh, tb))
+ continue;
+ return 1;
+ }
+ return 0;
+}
+
+int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fib_rule_hdr *frh = nlmsg_data(nlh);
+ struct fib_rules_ops *ops = NULL;
+ struct fib_rule *rule = NULL, *r, *last = NULL;
+ struct nlattr *tb[FRA_MAX + 1];
+ int err = -EINVAL, unresolved = 0;
+ bool user_priority = false;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ NL_SET_ERR_MSG(extack, "Invalid msg length");
+ goto errout;
+ }
+
+ ops = lookup_rules_ops(net, frh->family);
+ if (!ops) {
+ err = -EAFNOSUPPORT;
+ NL_SET_ERR_MSG(extack, "Rule family not supported");
+ goto errout;
+ }
+
+ err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Error parsing msg");
+ goto errout;
+ }
+
+ err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority);
+ if (err)
+ goto errout;
+
+ if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
+ rule_exists(ops, frh, tb, rule)) {
+ err = -EEXIST;
+ goto errout_free;
+ }
+
+ err = ops->configure(rule, skb, frh, tb, extack);
+ if (err < 0)
+ goto errout_free;
+
+ err = call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops,
+ extack);
+ if (err < 0)
+ goto errout_free;
+
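+	/* Resolve the goto target, i.e. the rule with the matching preference. */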
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->pref == rule->target) {
+ RCU_INIT_POINTER(rule->ctarget, r);
+ break;
+ }
+ }
+
+ if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
+ unresolved = 1;
+
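+	/* Insert the new rule so that the list stays sorted by preference. */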
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->pref > rule->pref)
+ break;
+ last = r;
+ }
+
+ if (last)
+ list_add_rcu(&rule->list, &last->list);
+ else
+ list_add_rcu(&rule->list, &ops->rules_list);
+
+ if (ops->unresolved_rules) {
+ /*
+		 * There are unresolved goto rules in the list; check whether
+		 * any of them point to this new rule.
+ */
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->action == FR_ACT_GOTO &&
+ r->target == rule->pref &&
+ rtnl_dereference(r->ctarget) == NULL) {
+ rcu_assign_pointer(r->ctarget, rule);
+ if (--ops->unresolved_rules == 0)
+ break;
+ }
+ }
+ }
+
+ if (rule->action == FR_ACT_GOTO)
+ ops->nr_goto_rules++;
+
+ if (unresolved)
+ ops->unresolved_rules++;
+
+ if (rule->tun_id)
+ ip_tunnel_need_metadata();
+
+ notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
+ flush_route_cache(ops);
+ rules_ops_put(ops);
+ return 0;
+
+errout_free:
+ kfree(rule);
+errout:
+ rules_ops_put(ops);
+ return err;
+}
+EXPORT_SYMBOL_GPL(fib_nl_newrule);
+
+int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fib_rule_hdr *frh = nlmsg_data(nlh);
+ struct fib_rules_ops *ops = NULL;
+ struct fib_rule *rule = NULL, *r, *nlrule = NULL;
+ struct nlattr *tb[FRA_MAX+1];
+ int err = -EINVAL;
+ bool user_priority = false;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ NL_SET_ERR_MSG(extack, "Invalid msg length");
+ goto errout;
+ }
+
+ ops = lookup_rules_ops(net, frh->family);
+ if (ops == NULL) {
+ err = -EAFNOSUPPORT;
+ NL_SET_ERR_MSG(extack, "Rule family not supported");
+ goto errout;
+ }
+
+ err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Error parsing msg");
+ goto errout;
+ }
+
+ err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority);
+ if (err)
+ goto errout;
+
+ rule = rule_find(ops, frh, tb, nlrule, user_priority);
+ if (!rule) {
+ err = -ENOENT;
+ goto errout;
+ }
+
+ if (rule->flags & FIB_RULE_PERMANENT) {
+ err = -EPERM;
+ goto errout;
+ }
+
+ if (ops->delete) {
+ err = ops->delete(rule);
+ if (err)
+ goto errout;
+ }
+
+ if (rule->tun_id)
+ ip_tunnel_unneed_metadata();
+
+ list_del_rcu(&rule->list);
+
+ if (rule->action == FR_ACT_GOTO) {
+ ops->nr_goto_rules--;
+ if (rtnl_dereference(rule->ctarget) == NULL)
+ ops->unresolved_rules--;
+ }
+
+	/*
+	 * Check if the deleted rule is the target of any goto rule. If so,
+	 * retarget those rules to the next rule with the same preference,
+	 * or mark them unresolved. As this operation can be expensive, it
+	 * is only performed if goto rules other than the one currently
+	 * being deleted have actually been added.
+	 */
+ if (ops->nr_goto_rules > 0) {
+ struct fib_rule *n;
+
+ n = list_next_entry(rule, list);
+ if (&n->list == &ops->rules_list || n->pref != rule->pref)
+ n = NULL;
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (rtnl_dereference(r->ctarget) != rule)
+ continue;
+ rcu_assign_pointer(r->ctarget, n);
+ if (!n)
+ ops->unresolved_rules++;
+ }
+ }
+
+ call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops,
+ NULL);
+ notify_rule_change(RTM_DELRULE, rule, ops, nlh,
+ NETLINK_CB(skb).portid);
+ fib_rule_put(rule);
+ flush_route_cache(ops);
+ rules_ops_put(ops);
+ kfree(nlrule);
+ return 0;
+
+errout:
+ kfree(nlrule);
+ rules_ops_put(ops);
+ return err;
+}
+EXPORT_SYMBOL_GPL(fib_nl_delrule);
+
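+/* Upper bound on the netlink payload needed to dump a single rule. */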
+static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ struct fib_rule *rule)
+{
+ size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr))
+ + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */
+ + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */
+ + nla_total_size(4) /* FRA_PRIORITY */
+ + nla_total_size(4) /* FRA_TABLE */
+ + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */
+ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
+ + nla_total_size(4) /* FRA_FWMARK */
+ + nla_total_size(4) /* FRA_FWMASK */
+ + nla_total_size_64bit(8) /* FRA_TUN_ID */
+ + nla_total_size(sizeof(struct fib_kuid_range))
+ + nla_total_size(1) /* FRA_PROTOCOL */
+ + nla_total_size(1) /* FRA_IP_PROTO */
+ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */
+ + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */
+
+ if (ops->nlmsg_payload)
+ payload += ops->nlmsg_payload(rule);
+
+ return payload;
+}
+
+static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
+ u32 pid, u32 seq, int type, int flags,
+ struct fib_rules_ops *ops)
+{
+ struct nlmsghdr *nlh;
+ struct fib_rule_hdr *frh;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*frh), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ frh = nlmsg_data(nlh);
+ frh->family = ops->family;
+ frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT;
+ if (nla_put_u32(skb, FRA_TABLE, rule->table))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen))
+ goto nla_put_failure;
+ frh->res1 = 0;
+ frh->res2 = 0;
+ frh->action = rule->action;
+ frh->flags = rule->flags;
+
+ if (nla_put_u8(skb, FRA_PROTOCOL, rule->proto))
+ goto nla_put_failure;
+
+ if (rule->action == FR_ACT_GOTO &&
+ rcu_access_pointer(rule->ctarget) == NULL)
+ frh->flags |= FIB_RULE_UNRESOLVED;
+
+ if (rule->iifname[0]) {
+ if (nla_put_string(skb, FRA_IIFNAME, rule->iifname))
+ goto nla_put_failure;
+ if (rule->iifindex == -1)
+ frh->flags |= FIB_RULE_IIF_DETACHED;
+ }
+
+ if (rule->oifname[0]) {
+ if (nla_put_string(skb, FRA_OIFNAME, rule->oifname))
+ goto nla_put_failure;
+ if (rule->oifindex == -1)
+ frh->flags |= FIB_RULE_OIF_DETACHED;
+ }
+
+ if ((rule->pref &&
+ nla_put_u32(skb, FRA_PRIORITY, rule->pref)) ||
+ (rule->mark &&
+ nla_put_u32(skb, FRA_FWMARK, rule->mark)) ||
+ ((rule->mark_mask || rule->mark) &&
+ nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) ||
+ (rule->target &&
+ nla_put_u32(skb, FRA_GOTO, rule->target)) ||
+ (rule->tun_id &&
+ nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) ||
+ (rule->l3mdev &&
+ nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) ||
+ (uid_range_set(&rule->uid_range) &&
+ nla_put_uid_range(skb, &rule->uid_range)) ||
+ (fib_rule_port_range_set(&rule->sport_range) &&
+ nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) ||
+ (fib_rule_port_range_set(&rule->dport_range) &&
+ nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) ||
+ (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto)))
+ goto nla_put_failure;
+
+ if (rule->suppress_ifgroup != -1) {
+ if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup))
+ goto nla_put_failure;
+ }
+
+ if (ops->fill(rule, skb, frh) < 0)
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
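+/* Dump the rules of a single fib_rules_ops, resuming at index cb->args[1]. */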
+static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb,
+ struct fib_rules_ops *ops)
+{
+ int idx = 0;
+ struct fib_rule *rule;
+ int err = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(rule, &ops->rules_list, list) {
+ if (idx < cb->args[1])
+ goto skip;
+
+ err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, RTM_NEWRULE,
+ NLM_F_MULTI, ops);
+ if (err)
+ break;
+skip:
+ idx++;
+ }
+ rcu_read_unlock();
+ cb->args[1] = idx;
+ rules_ops_put(ops);
+
+ return err;
+}
+
+static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fib_rules_ops *ops;
+ int idx = 0, family;
+
+ family = rtnl_msg_family(cb->nlh);
+ if (family != AF_UNSPEC) {
+ /* Protocol specific dump request */
+ ops = lookup_rules_ops(net, family);
+ if (ops == NULL)
+ return -EAFNOSUPPORT;
+
+ dump_rules(skb, cb, ops);
+
+ return skb->len;
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ops, &net->rules_ops, list) {
+ if (idx < cb->args[0] || !try_module_get(ops->owner))
+ goto skip;
+
+ if (dump_rules(skb, cb, ops) < 0)
+ break;
+
+ cb->args[1] = 0;
+skip:
+ idx++;
+ }
+ rcu_read_unlock();
+ cb->args[0] = idx;
+
+ return skb->len;
+}
+
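+/* Notify listeners in the ops' netlink group about a rule change. */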
+static void notify_rule_change(int event, struct fib_rule *rule,
+ struct fib_rules_ops *ops, struct nlmsghdr *nlh,
+ u32 pid)
+{
+ struct net *net;
+ struct sk_buff *skb;
+ int err = -ENOMEM;
+
+ net = ops->fro_net;
+ skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL);
+ if (skb == NULL)
+ goto errout;
+
+ err = fib_nl_fill_rule(skb, rule, pid, nlh->nlmsg_seq, event, 0, ops);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in fib_rule_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, ops->nlgroup, err);
+}
+
+static void attach_rules(struct list_head *rules, struct net_device *dev)
+{
+ struct fib_rule *rule;
+
+ list_for_each_entry(rule, rules, list) {
+ if (rule->iifindex == -1 &&
+ strcmp(dev->name, rule->iifname) == 0)
+ rule->iifindex = dev->ifindex;
+ if (rule->oifindex == -1 &&
+ strcmp(dev->name, rule->oifname) == 0)
+ rule->oifindex = dev->ifindex;
+ }
+}
+
+static void detach_rules(struct list_head *rules, struct net_device *dev)
+{
+ struct fib_rule *rule;
+
+ list_for_each_entry(rule, rules, list) {
+ if (rule->iifindex == dev->ifindex)
+ rule->iifindex = -1;
+ if (rule->oifindex == dev->ifindex)
+ rule->oifindex = -1;
+ }
+}
+
+
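+/* Netdevice notifier: keep the cached iifindex/oifindex of rules in sync
+ * with device register/rename/unregister events.
+ */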
+static int fib_rules_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ struct fib_rules_ops *ops;
+
+ ASSERT_RTNL();
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ list_for_each_entry(ops, &net->rules_ops, list)
+ attach_rules(&ops->rules_list, dev);
+ break;
+
+ case NETDEV_CHANGENAME:
+ list_for_each_entry(ops, &net->rules_ops, list) {
+ detach_rules(&ops->rules_list, dev);
+ attach_rules(&ops->rules_list, dev);
+ }
+ break;
+
+ case NETDEV_UNREGISTER:
+ list_for_each_entry(ops, &net->rules_ops, list)
+ detach_rules(&ops->rules_list, dev);
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block fib_rules_notifier = {
+ .notifier_call = fib_rules_event,
+};
+
+static int __net_init fib_rules_net_init(struct net *net)
+{
+ INIT_LIST_HEAD(&net->rules_ops);
+ spin_lock_init(&net->rules_mod_lock);
+ return 0;
+}
+
+static void __net_exit fib_rules_net_exit(struct net *net)
+{
+ WARN_ON_ONCE(!list_empty(&net->rules_ops));
+}
+
+static struct pernet_operations fib_rules_net_ops = {
+ .init = fib_rules_net_init,
+ .exit = fib_rules_net_exit,
+};
+
+static int __init fib_rules_init(void)
+{
+ int err;
+ rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, 0);
+
+ err = register_pernet_subsys(&fib_rules_net_ops);
+ if (err < 0)
+ goto fail;
+
+ err = register_netdevice_notifier(&fib_rules_notifier);
+ if (err < 0)
+ goto fail_unregister;
+
+ return 0;
+
+fail_unregister:
+ unregister_pernet_subsys(&fib_rules_net_ops);
+fail:
+ rtnl_unregister(PF_UNSPEC, RTM_NEWRULE);
+ rtnl_unregister(PF_UNSPEC, RTM_DELRULE);
+ rtnl_unregister(PF_UNSPEC, RTM_GETRULE);
+ return err;
+}
+
+subsys_initcall(fib_rules_init);
diff --git a/net/core/filter.c b/net/core/filter.c
new file mode 100644
index 000000000..c1310c9d1
--- /dev/null
+++ b/net/core/filter.c
@@ -0,0 +1,7333 @@
+/*
+ * Linux Socket Filter - Kernel level socket filtering
+ *
+ * Based on the design of the Berkeley Packet Filter. The new
+ * internal format has been designed by PLUMgrid:
+ *
+ * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
+ *
+ * Authors:
+ *
+ * Jay Schulist <jschlst@samba.org>
+ * Alexei Starovoitov <ast@plumgrid.com>
+ * Daniel Borkmann <dborkman@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Andi Kleen - Fix a few bad bugs and races.
+ * Kris Katterjohn - Added many additional checks in bpf_check_classic()
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/sock_diag.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_packet.h>
+#include <linux/if_arp.h>
+#include <linux/gfp.h>
+#include <net/inet_common.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/netlink.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/flow_dissector.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/uaccess.h>
+#include <asm/unaligned.h>
+#include <asm/cmpxchg.h>
+#include <linux/filter.h>
+#include <linux/ratelimit.h>
+#include <linux/seccomp.h>
+#include <linux/if_vlan.h>
+#include <linux/bpf.h>
+#include <net/sch_generic.h>
+#include <net/cls_cgroup.h>
+#include <net/dst_metadata.h>
+#include <net/dst.h>
+#include <net/sock_reuseport.h>
+#include <net/busy_poll.h>
+#include <net/tcp.h>
+#include <net/xfrm.h>
+#include <linux/bpf_trace.h>
+#include <net/xdp_sock.h>
+#include <linux/inetdevice.h>
+#include <net/ip_fib.h>
+#include <net/flow.h>
+#include <net/arp.h>
+#include <net/ipv6.h>
+#include <linux/seg6_local.h>
+#include <net/seg6.h>
+#include <net/seg6_local.h>
+
+/**
+ * sk_filter_trim_cap - run a packet through a socket filter
+ * @sk: sock associated with &sk_buff
+ * @skb: buffer to filter
+ * @cap: limit on how short the eBPF program may trim the packet
+ *
+ * Run the eBPF program and then trim skb->data to the size returned by
+ * the program. If pkt_len is 0 we toss the packet. If skb->len is smaller
+ * than pkt_len we keep the whole skb->data. This is the socket-level
+ * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
+ * be accepted or -EPERM if the packet should be tossed.
+ *
+ */
+int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
+{
+ int err;
+ struct sk_filter *filter;
+
+ /*
+ * If the skb was allocated from pfmemalloc reserves, only
+ * allow SOCK_MEMALLOC sockets to use it as this socket is
+ * helping free memory
+ */
+ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
+ return -ENOMEM;
+ }
+ err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
+ if (err)
+ return err;
+
+ err = security_sock_rcv_skb(sk, skb);
+ if (err)
+ return err;
+
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
+ if (filter) {
+ struct sock *save_sk = skb->sk;
+ unsigned int pkt_len;
+
+ skb->sk = sk;
+ pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
+ skb->sk = save_sk;
+ err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
+ }
+ rcu_read_unlock();
+
+ return err;
+}
+EXPORT_SYMBOL(sk_filter_trim_cap);
+
+BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
+{
+ return skb_get_poff(skb);
+}
+
+BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
+{
+ struct nlattr *nla;
+
+ if (skb_is_nonlinear(skb))
+ return 0;
+
+ if (skb->len < sizeof(struct nlattr))
+ return 0;
+
+ if (a > skb->len - sizeof(struct nlattr))
+ return 0;
+
+ nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
+ if (nla)
+ return (void *) nla - (void *) skb->data;
+
+ return 0;
+}
+
+BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
+{
+ struct nlattr *nla;
+
+ if (skb_is_nonlinear(skb))
+ return 0;
+
+ if (skb->len < sizeof(struct nlattr))
+ return 0;
+
+ if (a > skb->len - sizeof(struct nlattr))
+ return 0;
+
+ nla = (struct nlattr *) &skb->data[a];
+ if (nla->nla_len > skb->len - a)
+ return 0;
+
+ nla = nla_find_nested(nla, x);
+ if (nla)
+ return (void *) nla - (void *) skb->data;
+
+ return 0;
+}
+
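+/* Packet-load helpers backing classic BPF LD_ABS/LD_IND instructions;
+ * emitted as calls by the cBPF-to-eBPF conversion below.
+ */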
+BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *,
+ data, int, headlen, int, offset)
+{
+ u8 tmp, *ptr;
+ const int len = sizeof(tmp);
+
+ if (offset >= 0) {
+ if (headlen - offset >= len)
+ return *(u8 *)(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return tmp;
+ } else {
+ ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
+ if (likely(ptr))
+ return *(u8 *)ptr;
+ }
+
+ return -EFAULT;
+}
+
+BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
+ int, offset)
+{
+ return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len,
+ offset);
+}
+
+BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
+ data, int, headlen, int, offset)
+{
+ u16 tmp, *ptr;
+ const int len = sizeof(tmp);
+
+ if (offset >= 0) {
+ if (headlen - offset >= len)
+ return get_unaligned_be16(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return be16_to_cpu(tmp);
+ } else {
+ ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
+ if (likely(ptr))
+ return get_unaligned_be16(ptr);
+ }
+
+ return -EFAULT;
+}
+
+BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
+ int, offset)
+{
+ return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len,
+ offset);
+}
+
+BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
+ data, int, headlen, int, offset)
+{
+ u32 tmp, *ptr;
+ const int len = sizeof(tmp);
+
+ if (likely(offset >= 0)) {
+ if (headlen - offset >= len)
+ return get_unaligned_be32(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return be32_to_cpu(tmp);
+ } else {
+ ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
+ if (likely(ptr))
+ return get_unaligned_be32(ptr);
+ }
+
+ return -EFAULT;
+}
+
+BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
+ int, offset)
+{
+ return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len,
+ offset);
+}
+
+BPF_CALL_0(bpf_get_raw_cpu_id)
+{
+ return raw_smp_processor_id();
+}
+
+static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
+ .func = bpf_get_raw_cpu_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
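+/* Emit eBPF loads of common sk_buff fields (mark, pkt_type, queue_mapping,
+ * vlan) into dst_reg.
+ */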
+static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
+ struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (skb_field) {
+ case SKF_AD_MARK:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, mark));
+ break;
+
+ case SKF_AD_PKTTYPE:
+ *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
+ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
+#ifdef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
+#endif
+ break;
+
+ case SKF_AD_QUEUE:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sk_buff, queue_mapping));
+ break;
+
+ case SKF_AD_VLAN_TAG:
+ case SKF_AD_VLAN_TAG_PRESENT:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
+ BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
+
+ /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sk_buff, vlan_tci));
+ if (skb_field == SKF_AD_VLAN_TAG) {
+ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg,
+ ~VLAN_TAG_PRESENT);
+ } else {
+ /* dst_reg >>= 12 */
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12);
+ /* dst_reg &= 1 */
+ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
+ }
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static bool convert_bpf_extensions(struct sock_filter *fp,
+ struct bpf_insn **insnp)
+{
+ struct bpf_insn *insn = *insnp;
+ u32 cnt;
+
+ switch (fp->k) {
+ case SKF_AD_OFF + SKF_AD_PROTOCOL:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
+
+ /* A = *(u16 *) (CTX + offsetof(protocol)) */
+ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, protocol));
+ /* A = ntohs(A) [emitting a nop or swap16] */
+ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
+ break;
+
+ case SKF_AD_OFF + SKF_AD_PKTTYPE:
+ cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
+ break;
+
+ case SKF_AD_OFF + SKF_AD_IFINDEX:
+ case SKF_AD_OFF + SKF_AD_HATYPE:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
+ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
+ BPF_REG_TMP, BPF_REG_CTX,
+ offsetof(struct sk_buff, dev));
+ /* if (tmp != 0) goto pc + 1 */
+ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
+ *insn++ = BPF_EXIT_INSN();
+ if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
+ offsetof(struct net_device, ifindex));
+ else
+ *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
+ offsetof(struct net_device, type));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_MARK:
+ cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
+ break;
+
+ case SKF_AD_OFF + SKF_AD_RXHASH:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
+
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, hash));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_QUEUE:
+ cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
+ break;
+
+ case SKF_AD_OFF + SKF_AD_VLAN_TAG:
+ cnt = convert_skb_access(SKF_AD_VLAN_TAG,
+ BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
+ break;
+
+ case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
+ cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
+ BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
+ break;
+
+ case SKF_AD_OFF + SKF_AD_VLAN_TPID:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
+
+ /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
+ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, vlan_proto));
+ /* A = ntohs(A) [emitting a nop or swap16] */
+ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
+ break;
+
+ case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
+ case SKF_AD_OFF + SKF_AD_NLATTR:
+ case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
+ case SKF_AD_OFF + SKF_AD_CPU:
+ case SKF_AD_OFF + SKF_AD_RANDOM:
+ /* arg1 = CTX */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
+ /* arg2 = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
+ /* arg3 = X */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
+ /* Emit call(arg1=CTX, arg2=A, arg3=X) */
+ switch (fp->k) {
+ case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
+ *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset);
+ break;
+ case SKF_AD_OFF + SKF_AD_NLATTR:
+ *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr);
+ break;
+ case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
+ *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest);
+ break;
+ case SKF_AD_OFF + SKF_AD_CPU:
+ *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id);
+ break;
+ case SKF_AD_OFF + SKF_AD_RANDOM:
+ *insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
+ bpf_user_rnd_init_once();
+ break;
+ }
+ break;
+
+ case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
+ /* A ^= X */
+ *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
+ break;
+
+ default:
+ /* This is just a dummy call to avoid letting the compiler
+ * evict __bpf_call_base() as an optimization. Placed here
+ * where no-one bothers.
+ */
+ BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
+ return false;
+ }
+
+ *insnp = insn;
+ return true;
+}
+
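+/* Convert a classic LD_ABS/LD_IND load into an inline fast path for linear
+ * data plus a slow-path call to the load helpers above.
+ */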
+static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
+{
+ const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS);
+ int size = bpf_size_to_bytes(BPF_SIZE(fp->code));
+ bool endian = BPF_SIZE(fp->code) == BPF_H ||
+ BPF_SIZE(fp->code) == BPF_W;
+ bool indirect = BPF_MODE(fp->code) == BPF_IND;
+ const int ip_align = NET_IP_ALIGN;
+ struct bpf_insn *insn = *insnp;
+ int offset = fp->k;
+
+ if (!indirect &&
+ ((unaligned_ok && offset >= 0) ||
+ (!unaligned_ok && offset >= 0 &&
+ offset + ip_align >= 0 &&
+ offset + ip_align % size == 0))) {
+ bool ldx_off_ok = offset <= S16_MAX;
+
+ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
+ *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
+ *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP,
+ size, 2 + endian + (!ldx_off_ok * 2));
+ if (ldx_off_ok) {
+ *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
+ BPF_REG_D, offset);
+ } else {
+ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset);
+ *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
+ BPF_REG_TMP, 0);
+ }
+ if (endian)
+ *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
+ *insn++ = BPF_JMP_A(8);
+ }
+
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D);
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H);
+ if (!indirect) {
+ *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset);
+ } else {
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X);
+ if (fp->k)
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset);
+ }
+
+ switch (BPF_SIZE(fp->code)) {
+ case BPF_B:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8);
+ break;
+ case BPF_H:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16);
+ break;
+ case BPF_W:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32);
+ break;
+ default:
+ return false;
+ }
+
+ *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2);
+ *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+ *insn = BPF_EXIT_INSN();
+
+ *insnp = insn;
+ return true;
+}
+
+/**
+ * bpf_convert_filter - convert filter program
+ * @prog: the user passed filter program
+ * @len: the length of the user passed filter program
+ * @new_prog: allocated 'struct bpf_prog' or NULL
+ * @new_len: pointer to store length of converted program
+ * @seen_ld_abs: bool whether we've seen ld_abs/ind
+ *
+ * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
+ * style extended BPF (eBPF).
+ * Conversion workflow:
+ *
+ * 1) First pass for calculating the new program length:
+ * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs)
+ *
+ *    2) 2nd call to remap, done in two passes: the 1st pass finds new
+ *       jump offsets, the 2nd pass does the remapping:
+ * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs)
+ */
+static int bpf_convert_filter(struct sock_filter *prog, int len,
+ struct bpf_prog *new_prog, int *new_len,
+ bool *seen_ld_abs)
+{
+ int new_flen = 0, pass = 0, target, i, stack_off;
+ struct bpf_insn *new_insn, *first_insn = NULL;
+ struct sock_filter *fp;
+ int *addrs = NULL;
+ u8 bpf_src;
+
+ BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
+ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
+
+ if (len <= 0 || len > BPF_MAXINSNS)
+ return -EINVAL;
+
+ if (new_prog) {
+ first_insn = new_prog->insnsi;
+ addrs = kcalloc(len, sizeof(*addrs),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!addrs)
+ return -ENOMEM;
+ }
+
+do_pass:
+ new_insn = first_insn;
+ fp = prog;
+
+ /* Classic BPF related prologue emission. */
+ if (new_prog) {
+ /* Classic BPF expects A and X to be reset first. These need
+ * to be guaranteed to be the first two instructions.
+ */
+ *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+ *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
+
+		/* All programs must keep CTX in the callee-saved BPF_REG_CTX.
+		 * In the eBPF case this is done by the compiler; here we need
+		 * to do it ourselves. The initial CTX is present in BPF_REG_ARG1.
+ */
+ *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
+ if (*seen_ld_abs) {
+ /* For packet access in classic BPF, cache skb->data
+ * in callee-saved BPF R8 and skb->len - skb->data_len
+ * (headlen) in BPF R9. Since classic BPF is read-only
+ * on CTX, we only need to cache it once.
+ */
+ *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+ BPF_REG_D, BPF_REG_CTX,
+ offsetof(struct sk_buff, data));
+ *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX,
+ offsetof(struct sk_buff, len));
+ *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX,
+ offsetof(struct sk_buff, data_len));
+ *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP);
+ }
+ } else {
+ new_insn += 3;
+ }
+
+ for (i = 0; i < len; fp++, i++) {
+ struct bpf_insn tmp_insns[32] = { };
+ struct bpf_insn *insn = tmp_insns;
+
+ if (addrs)
+ addrs[i] = new_insn - first_insn;
+
+ switch (fp->code) {
+ /* All arithmetic insns and skb loads map as-is. */
+ case BPF_ALU | BPF_ADD | BPF_X:
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_X:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_X:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_X:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_LSH | BPF_X:
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_X:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_X:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_X:
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_X:
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_X:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ case BPF_ALU | BPF_NEG:
+ case BPF_LD | BPF_ABS | BPF_W:
+ case BPF_LD | BPF_ABS | BPF_H:
+ case BPF_LD | BPF_ABS | BPF_B:
+ case BPF_LD | BPF_IND | BPF_W:
+ case BPF_LD | BPF_IND | BPF_H:
+ case BPF_LD | BPF_IND | BPF_B:
+ /* Check for overloaded BPF extension and
+ * directly convert it if found, otherwise
+ * just move on with mapping.
+ */
+ if (BPF_CLASS(fp->code) == BPF_LD &&
+ BPF_MODE(fp->code) == BPF_ABS &&
+ convert_bpf_extensions(fp, &insn))
+ break;
+ if (BPF_CLASS(fp->code) == BPF_LD &&
+ convert_bpf_ld_abs(fp, &insn)) {
+ *seen_ld_abs = true;
+ break;
+ }
+
+ if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
+ fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
+ *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X);
+ /* Error with exception code on div/mod by 0.
+				 * For cBPF programs, div/mod by 0 always meant a return value of 0.
+ */
+ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2);
+ *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+ *insn++ = BPF_EXIT_INSN();
+ }
+
+ *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
+ break;
+
+ /* Jump transformation cannot use BPF block macros
+ * everywhere as offset calculation and target updates
+ * require a bit more work than the rest, i.e. jump
+ * opcodes map as-is, but offsets need adjustment.
+ */
+
+#define BPF_EMIT_JMP \
+ do { \
+ const s32 off_min = S16_MIN, off_max = S16_MAX; \
+ s32 off; \
+ \
+ if (target >= len || target < 0) \
+ goto err; \
+ off = addrs ? addrs[target] - addrs[i] - 1 : 0; \
+ /* Adjust pc relative offset for 2nd or 3rd insn. */ \
+ off -= insn - tmp_insns; \
+ /* Reject anything not fitting into insn->off. */ \
+ if (off < off_min || off > off_max) \
+ goto err; \
+ insn->off = off; \
+ } while (0)
+
+ case BPF_JMP | BPF_JA:
+ target = i + fp->k + 1;
+ insn->code = fp->code;
+ BPF_EMIT_JMP;
+ break;
+
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
+ /* BPF immediates are signed, zero extend
+ * immediate into tmp register and use it
+ * in compare insn.
+ */
+ *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);
+
+ insn->dst_reg = BPF_REG_A;
+ insn->src_reg = BPF_REG_TMP;
+ bpf_src = BPF_X;
+ } else {
+ insn->dst_reg = BPF_REG_A;
+ insn->imm = fp->k;
+ bpf_src = BPF_SRC(fp->code);
+ insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;
+ }
+
+ /* Common case where 'jump_false' is next insn. */
+ if (fp->jf == 0) {
+ insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
+ target = i + fp->jt + 1;
+ BPF_EMIT_JMP;
+ break;
+ }
+
+ /* Convert some jumps when 'jump_true' is next insn. */
+ if (fp->jt == 0) {
+ switch (BPF_OP(fp->code)) {
+ case BPF_JEQ:
+ insn->code = BPF_JMP | BPF_JNE | bpf_src;
+ break;
+ case BPF_JGT:
+ insn->code = BPF_JMP | BPF_JLE | bpf_src;
+ break;
+ case BPF_JGE:
+ insn->code = BPF_JMP | BPF_JLT | bpf_src;
+ break;
+ default:
+ goto jmp_rest;
+ }
+
+ target = i + fp->jf + 1;
+ BPF_EMIT_JMP;
+ break;
+ }
+jmp_rest:
+ /* Other jumps are mapped into two insns: Jxx and JA. */
+ target = i + fp->jt + 1;
+ insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
+ BPF_EMIT_JMP;
+ insn++;
+
+ insn->code = BPF_JMP | BPF_JA;
+ target = i + fp->jf + 1;
+ BPF_EMIT_JMP;
+ break;
+
+		/* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
+ case BPF_LDX | BPF_MSH | BPF_B: {
+ struct sock_filter tmp = {
+ .code = BPF_LD | BPF_ABS | BPF_B,
+ .k = fp->k,
+ };
+
+ *seen_ld_abs = true;
+
+ /* X = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
+ /* A = BPF_R0 = *(u8 *) (skb->data + K) */
+ convert_bpf_ld_abs(&tmp, &insn);
+ insn++;
+ /* A &= 0xf */
+ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
+ /* A <<= 2 */
+ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
+ /* tmp = X */
+ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X);
+ /* X = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
+ /* A = tmp */
+ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
+ break;
+ }
+		/* RET_K is remapped into 2 insns. RET_A case doesn't need an
+ * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
+ */
+ case BPF_RET | BPF_A:
+ case BPF_RET | BPF_K:
+ if (BPF_RVAL(fp->code) == BPF_K)
+ *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
+ 0, fp->k);
+ *insn = BPF_EXIT_INSN();
+ break;
+
+ /* Store to stack. */
+ case BPF_ST:
+ case BPF_STX:
+ stack_off = fp->k * 4 + 4;
+ *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
+ BPF_ST ? BPF_REG_A : BPF_REG_X,
+ -stack_off);
+ /* check_load_and_stores() verifies that classic BPF can
+ * load from stack only after write, so tracking
+ * stack_depth for ST|STX insns is enough
+ */
+ if (new_prog && new_prog->aux->stack_depth < stack_off)
+ new_prog->aux->stack_depth = stack_off;
+ break;
+
+ /* Load from stack. */
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ stack_off = fp->k * 4 + 4;
+ *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, BPF_REG_FP,
+ -stack_off);
+ break;
+
+ /* A = K or X = K */
+ case BPF_LD | BPF_IMM:
+ case BPF_LDX | BPF_IMM:
+ *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, fp->k);
+ break;
+
+ /* X = A */
+ case BPF_MISC | BPF_TAX:
+ *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
+ break;
+
+ /* A = X */
+ case BPF_MISC | BPF_TXA:
+ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
+ break;
+
+ /* A = skb->len or X = skb->len */
+ case BPF_LD | BPF_W | BPF_LEN:
+ case BPF_LDX | BPF_W | BPF_LEN:
+ *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
+ offsetof(struct sk_buff, len));
+ break;
+
+ /* Access seccomp_data fields. */
+ case BPF_LDX | BPF_ABS | BPF_W:
+ /* A = *(u32 *) (ctx + K) */
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
+ break;
+
+ /* Unknown instruction. */
+ default:
+ goto err;
+ }
+
+ insn++;
+ if (new_prog)
+ memcpy(new_insn, tmp_insns,
+ sizeof(*insn) * (insn - tmp_insns));
+ new_insn += insn - tmp_insns;
+ }
+
+ if (!new_prog) {
+ /* Only calculating new length. */
+ *new_len = new_insn - first_insn;
+ if (*seen_ld_abs)
+ *new_len += 4; /* Prologue bits. */
+ return 0;
+ }
+
+ pass++;
+ if (new_flen != new_insn - first_insn) {
+ new_flen = new_insn - first_insn;
+ if (pass > 2)
+ goto err;
+ goto do_pass;
+ }
+
+ kfree(addrs);
+ BUG_ON(*new_len != new_flen);
+ return 0;
+err:
+ kfree(addrs);
+ return -EINVAL;
+}
+
+/* Security:
+ *
+ * As we don't want to clear the mem[] array for each packet going through
+ * __bpf_prog_run(), we check that the filter loaded by the user never tries
+ * to read a cell that was not previously written, and we check all branches
+ * to be sure a malicious user doesn't try to abuse us.
+ */
+static int check_load_and_stores(const struct sock_filter *filter, int flen)
+{
+ u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
+ int pc, ret = 0;
+
+ BUILD_BUG_ON(BPF_MEMWORDS > 16);
+
+ masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
+ if (!masks)
+ return -ENOMEM;
+
+ memset(masks, 0xff, flen * sizeof(*masks));
+
+ for (pc = 0; pc < flen; pc++) {
+ memvalid &= masks[pc];
+
+ switch (filter[pc].code) {
+ case BPF_ST:
+ case BPF_STX:
+ memvalid |= (1 << filter[pc].k);
+ break;
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ if (!(memvalid & (1 << filter[pc].k))) {
+ ret = -EINVAL;
+ goto error;
+ }
+ break;
+ case BPF_JMP | BPF_JA:
+ /* A jump must set masks on target */
+ masks[pc + 1 + filter[pc].k] &= memvalid;
+ memvalid = ~0;
+ break;
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ /* A jump must set masks on targets */
+ masks[pc + 1 + filter[pc].jt] &= memvalid;
+ masks[pc + 1 + filter[pc].jf] &= memvalid;
+ memvalid = ~0;
+ break;
+ }
+ }
+error:
+ kfree(masks);
+ return ret;
+}
+
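+/* Whitelist of opcodes that are valid in classic BPF. */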
+static bool chk_code_allowed(u16 code_to_probe)
+{
+ static const bool codes[] = {
+ /* 32 bit ALU operations */
+ [BPF_ALU | BPF_ADD | BPF_K] = true,
+ [BPF_ALU | BPF_ADD | BPF_X] = true,
+ [BPF_ALU | BPF_SUB | BPF_K] = true,
+ [BPF_ALU | BPF_SUB | BPF_X] = true,
+ [BPF_ALU | BPF_MUL | BPF_K] = true,
+ [BPF_ALU | BPF_MUL | BPF_X] = true,
+ [BPF_ALU | BPF_DIV | BPF_K] = true,
+ [BPF_ALU | BPF_DIV | BPF_X] = true,
+ [BPF_ALU | BPF_MOD | BPF_K] = true,
+ [BPF_ALU | BPF_MOD | BPF_X] = true,
+ [BPF_ALU | BPF_AND | BPF_K] = true,
+ [BPF_ALU | BPF_AND | BPF_X] = true,
+ [BPF_ALU | BPF_OR | BPF_K] = true,
+ [BPF_ALU | BPF_OR | BPF_X] = true,
+ [BPF_ALU | BPF_XOR | BPF_K] = true,
+ [BPF_ALU | BPF_XOR | BPF_X] = true,
+ [BPF_ALU | BPF_LSH | BPF_K] = true,
+ [BPF_ALU | BPF_LSH | BPF_X] = true,
+ [BPF_ALU | BPF_RSH | BPF_K] = true,
+ [BPF_ALU | BPF_RSH | BPF_X] = true,
+ [BPF_ALU | BPF_NEG] = true,
+ /* Load instructions */
+ [BPF_LD | BPF_W | BPF_ABS] = true,
+ [BPF_LD | BPF_H | BPF_ABS] = true,
+ [BPF_LD | BPF_B | BPF_ABS] = true,
+ [BPF_LD | BPF_W | BPF_LEN] = true,
+ [BPF_LD | BPF_W | BPF_IND] = true,
+ [BPF_LD | BPF_H | BPF_IND] = true,
+ [BPF_LD | BPF_B | BPF_IND] = true,
+ [BPF_LD | BPF_IMM] = true,
+ [BPF_LD | BPF_MEM] = true,
+ [BPF_LDX | BPF_W | BPF_LEN] = true,
+ [BPF_LDX | BPF_B | BPF_MSH] = true,
+ [BPF_LDX | BPF_IMM] = true,
+ [BPF_LDX | BPF_MEM] = true,
+ /* Store instructions */
+ [BPF_ST] = true,
+ [BPF_STX] = true,
+ /* Misc instructions */
+ [BPF_MISC | BPF_TAX] = true,
+ [BPF_MISC | BPF_TXA] = true,
+ /* Return instructions */
+ [BPF_RET | BPF_K] = true,
+ [BPF_RET | BPF_A] = true,
+ /* Jump instructions */
+ [BPF_JMP | BPF_JA] = true,
+ [BPF_JMP | BPF_JEQ | BPF_K] = true,
+ [BPF_JMP | BPF_JEQ | BPF_X] = true,
+ [BPF_JMP | BPF_JGE | BPF_K] = true,
+ [BPF_JMP | BPF_JGE | BPF_X] = true,
+ [BPF_JMP | BPF_JGT | BPF_K] = true,
+ [BPF_JMP | BPF_JGT | BPF_X] = true,
+ [BPF_JMP | BPF_JSET | BPF_K] = true,
+ [BPF_JMP | BPF_JSET | BPF_X] = true,
+ };
+
+ if (code_to_probe >= ARRAY_SIZE(codes))
+ return false;
+
+ return codes[code_to_probe];
+}
+
+static bool bpf_check_basics_ok(const struct sock_filter *filter,
+ unsigned int flen)
+{
+ if (filter == NULL)
+ return false;
+ if (flen == 0 || flen > BPF_MAXINSNS)
+ return false;
+
+ return true;
+}
+
+/**
+ * bpf_check_classic - verify socket filter code
+ * @filter: filter to verify
+ * @flen: length of filter
+ *
+ * Check the user's filter code. If we let some ugly
+ * filter code slip through kaboom! The filter must contain
+ * no references or jumps that are out of range, no illegal
+ * instructions, and must end with a RET instruction.
+ *
+ * All jumps are forward as they are not signed.
+ *
+ * Returns 0 if the rule set is legal or -EINVAL if not.
+ */
+static int bpf_check_classic(const struct sock_filter *filter,
+ unsigned int flen)
+{
+ bool anc_found;
+ int pc;
+
+ /* Check the filter code now */
+ for (pc = 0; pc < flen; pc++) {
+ const struct sock_filter *ftest = &filter[pc];
+
+ /* May we actually operate on this code? */
+ if (!chk_code_allowed(ftest->code))
+ return -EINVAL;
+
+ /* Some instructions need special checks */
+ switch (ftest->code) {
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ /* Check for division by zero */
+ if (ftest->k == 0)
+ return -EINVAL;
+ break;
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ if (ftest->k >= 32)
+ return -EINVAL;
+ break;
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ case BPF_ST:
+ case BPF_STX:
+ /* Check for invalid memory addresses */
+ if (ftest->k >= BPF_MEMWORDS)
+ return -EINVAL;
+ break;
+ case BPF_JMP | BPF_JA:
+ /* Note, the large ftest->k might cause loops.
+ * Compare this with conditional jumps below,
+ * where offsets are limited. --ANK (981016)
+ */
+ if (ftest->k >= (unsigned int)(flen - pc - 1))
+ return -EINVAL;
+ break;
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ /* Both conditionals must be safe */
+ if (pc + ftest->jt + 1 >= flen ||
+ pc + ftest->jf + 1 >= flen)
+ return -EINVAL;
+ break;
+ case BPF_LD | BPF_W | BPF_ABS:
+ case BPF_LD | BPF_H | BPF_ABS:
+ case BPF_LD | BPF_B | BPF_ABS:
+ anc_found = false;
+ if (bpf_anc_helper(ftest) & BPF_ANC)
+ anc_found = true;
+ /* Ancillary operation unknown or unsupported */
+ if (anc_found == false && ftest->k >= SKF_AD_OFF)
+ return -EINVAL;
+ }
+ }
+
+ /* Last instruction must be a RET code */
+ switch (filter[flen - 1].code) {
+ case BPF_RET | BPF_K:
+ case BPF_RET | BPF_A:
+ return check_load_and_stores(filter, flen);
+ }
+
+ return -EINVAL;
+}
+
+static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
+ const struct sock_fprog *fprog)
+{
+ unsigned int fsize = bpf_classic_proglen(fprog);
+ struct sock_fprog_kern *fkprog;
+
+ fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
+ if (!fp->orig_prog)
+ return -ENOMEM;
+
+ fkprog = fp->orig_prog;
+ fkprog->len = fprog->len;
+
+ fkprog->filter = kmemdup(fp->insns, fsize,
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!fkprog->filter) {
+ kfree(fp->orig_prog);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void bpf_release_orig_filter(struct bpf_prog *fp)
+{
+ struct sock_fprog_kern *fprog = fp->orig_prog;
+
+ if (fprog) {
+ kfree(fprog->filter);
+ kfree(fprog);
+ }
+}
+
+static void __bpf_prog_release(struct bpf_prog *prog)
+{
+ if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
+ bpf_prog_put(prog);
+ } else {
+ bpf_release_orig_filter(prog);
+ bpf_prog_free(prog);
+ }
+}
+
+static void __sk_filter_release(struct sk_filter *fp)
+{
+ __bpf_prog_release(fp->prog);
+ kfree(fp);
+}
+
+/**
+ * sk_filter_release_rcu - Release a socket filter by rcu_head
+ * @rcu: rcu_head that contains the sk_filter to free
+ */
+static void sk_filter_release_rcu(struct rcu_head *rcu)
+{
+ struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
+
+ __sk_filter_release(fp);
+}
+
+/**
+ * sk_filter_release - release a socket filter
+ * @fp: filter to remove
+ *
+ * Remove a filter from a socket and release its resources.
+ */
+static void sk_filter_release(struct sk_filter *fp)
+{
+ if (refcount_dec_and_test(&fp->refcnt))
+ call_rcu(&fp->rcu, sk_filter_release_rcu);
+}
+
+void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
+{
+ u32 filter_size = bpf_prog_size(fp->prog->len);
+
+ atomic_sub(filter_size, &sk->sk_omem_alloc);
+ sk_filter_release(fp);
+}
+
+/* try to charge the socket memory if there is space available
+ * return true on success
+ */
+static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
+{
+ u32 filter_size = bpf_prog_size(fp->prog->len);
+
+ /* same check as in sock_kmalloc() */
+ if (filter_size <= sysctl_optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
+ atomic_add(filter_size, &sk->sk_omem_alloc);
+ return true;
+ }
+ return false;
+}
+
+bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
+{
+ if (!refcount_inc_not_zero(&fp->refcnt))
+ return false;
+
+ if (!__sk_filter_charge(sk, fp)) {
+ sk_filter_release(fp);
+ return false;
+ }
+ return true;
+}
+
+static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
+{
+ struct sock_filter *old_prog;
+ struct bpf_prog *old_fp;
+ int err, new_len, old_len = fp->len;
+ bool seen_ld_abs = false;
+
+	/* We are free to overwrite insns et al right here as they
+	 * won't be used internally anymore once the migration to the
+	 * internal BPF instruction representation is done.
+	 */
+ BUILD_BUG_ON(sizeof(struct sock_filter) !=
+ sizeof(struct bpf_insn));
+
+ /* Conversion cannot happen on overlapping memory areas,
+ * so we need to keep the user BPF around until the 2nd
+ * pass. At this time, the user BPF is stored in fp->insns.
+ */
+ old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!old_prog) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ /* 1st pass: calculate the new program length. */
+ err = bpf_convert_filter(old_prog, old_len, NULL, &new_len,
+ &seen_ld_abs);
+ if (err)
+ goto out_err_free;
+
+ /* Expand fp for appending the new filter representation. */
+ old_fp = fp;
+ fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
+ if (!fp) {
+ /* The old_fp is still around in case we couldn't
+ * allocate new memory, so uncharge on that one.
+ */
+ fp = old_fp;
+ err = -ENOMEM;
+ goto out_err_free;
+ }
+
+ fp->len = new_len;
+
+ /* 2nd pass: remap sock_filter insns into bpf_insn insns. */
+ err = bpf_convert_filter(old_prog, old_len, fp, &new_len,
+ &seen_ld_abs);
+ if (err)
+		/* The 2nd bpf_convert_filter() can fail only if it fails
+		 * to allocate memory; the remapping itself must succeed.
+		 * Note that at this point old_fp has already been released
+		 * by krealloc().
+		 */
+ goto out_err_free;
+
+ fp = bpf_prog_select_runtime(fp, &err);
+ if (err)
+ goto out_err_free;
+
+ kfree(old_prog);
+ return fp;
+
+out_err_free:
+ kfree(old_prog);
+out_err:
+ __bpf_prog_release(fp);
+ return ERR_PTR(err);
+}
+
+static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
+ bpf_aux_classic_check_t trans)
+{
+ int err;
+
+ fp->bpf_func = NULL;
+ fp->jited = 0;
+
+ err = bpf_check_classic(fp->insns, fp->len);
+ if (err) {
+ __bpf_prog_release(fp);
+ return ERR_PTR(err);
+ }
+
+ /* There might be additional checks and transformations
+ * needed on classic filters, f.e. in case of seccomp.
+ */
+ if (trans) {
+ err = trans(fp->insns, fp->len);
+ if (err) {
+ __bpf_prog_release(fp);
+ return ERR_PTR(err);
+ }
+ }
+
+ /* Probe if we can JIT compile the filter and if so, do
+ * the compilation of the filter.
+ */
+ bpf_jit_compile(fp);
+
+ /* JIT compiler couldn't process this filter, so do the
+ * internal BPF translation for the optimized interpreter.
+ */
+ if (!fp->jited)
+ fp = bpf_migrate_filter(fp);
+
+ return fp;
+}
+
+/**
+ * bpf_prog_create - create an unattached filter
+ * @pfp: the unattached filter that is created
+ * @fprog: the filter program
+ *
+ * Create a filter independent of any socket. We first run some
+ * sanity checks on it to make sure it does not explode on us later.
+ * If an error occurs or there is insufficient memory for the filter
+ * a negative errno code is returned. On success the return is zero.
+ */
+int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
+{
+ unsigned int fsize = bpf_classic_proglen(fprog);
+ struct bpf_prog *fp;
+
+ /* Make sure new filter is there and in the right amounts. */
+ if (!bpf_check_basics_ok(fprog->filter, fprog->len))
+ return -EINVAL;
+
+ fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
+ if (!fp)
+ return -ENOMEM;
+
+ memcpy(fp->insns, fprog->filter, fsize);
+
+ fp->len = fprog->len;
+ /* Since unattached filters are not copied back to user
+ * space through sk_get_filter(), we do not need to hold
+ * a copy here, and can spare us the work.
+ */
+ fp->orig_prog = NULL;
+
+ /* bpf_prepare_filter() already takes care of freeing
+ * memory in case something goes wrong.
+ */
+ fp = bpf_prepare_filter(fp, NULL);
+ if (IS_ERR(fp))
+ return PTR_ERR(fp);
+
+ *pfp = fp;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_create);
+
+/**
+ * bpf_prog_create_from_user - create an unattached filter from user buffer
+ * @pfp: the unattached filter that is created
+ * @fprog: the filter program
+ * @trans: post-classic verifier transformation handler
+ * @save_orig: save classic BPF program
+ *
+ * This function effectively does the same as bpf_prog_create(), only
+ * that it builds up its insns buffer from a user space provided buffer.
+ * It also allows for passing a bpf_aux_classic_check_t handler.
+ */
+int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
+ bpf_aux_classic_check_t trans, bool save_orig)
+{
+ unsigned int fsize = bpf_classic_proglen(fprog);
+ struct bpf_prog *fp;
+ int err;
+
+ /* Make sure new filter is there and in the right amounts. */
+ if (!bpf_check_basics_ok(fprog->filter, fprog->len))
+ return -EINVAL;
+
+ fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
+ if (!fp)
+ return -ENOMEM;
+
+ if (copy_from_user(fp->insns, fprog->filter, fsize)) {
+ __bpf_prog_free(fp);
+ return -EFAULT;
+ }
+
+ fp->len = fprog->len;
+ fp->orig_prog = NULL;
+
+ if (save_orig) {
+ err = bpf_prog_store_orig_filter(fp, fprog);
+ if (err) {
+ __bpf_prog_free(fp);
+ return -ENOMEM;
+ }
+ }
+
+ /* bpf_prepare_filter() already takes care of freeing
+ * memory in case something goes wrong.
+ */
+ fp = bpf_prepare_filter(fp, trans);
+ if (IS_ERR(fp))
+ return PTR_ERR(fp);
+
+ *pfp = fp;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);
+
+void bpf_prog_destroy(struct bpf_prog *fp)
+{
+ __bpf_prog_release(fp);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_destroy);
+
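+/* Install @prog on @sk, charging socket option memory and releasing any
+ * previously attached filter.
+ */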
+static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
+{
+ struct sk_filter *fp, *old_fp;
+
+ fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+ if (!fp)
+ return -ENOMEM;
+
+ fp->prog = prog;
+
+ if (!__sk_filter_charge(sk, fp)) {
+ kfree(fp);
+ return -ENOMEM;
+ }
+ refcount_set(&fp->refcnt, 1);
+
+ old_fp = rcu_dereference_protected(sk->sk_filter,
+ lockdep_sock_is_held(sk));
+ rcu_assign_pointer(sk->sk_filter, fp);
+
+ if (old_fp)
+ sk_filter_uncharge(sk, old_fp);
+
+ return 0;
+}
+
+static
+struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
+{
+ unsigned int fsize = bpf_classic_proglen(fprog);
+ struct bpf_prog *prog;
+ int err;
+
+ if (sock_flag(sk, SOCK_FILTER_LOCKED))
+ return ERR_PTR(-EPERM);
+
+ /* Make sure new filter is there and in the right amounts. */
+ if (!bpf_check_basics_ok(fprog->filter, fprog->len))
+ return ERR_PTR(-EINVAL);
+
+ prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
+ if (!prog)
+ return ERR_PTR(-ENOMEM);
+
+ if (copy_from_user(prog->insns, fprog->filter, fsize)) {
+ __bpf_prog_free(prog);
+ return ERR_PTR(-EFAULT);
+ }
+
+ prog->len = fprog->len;
+
+ err = bpf_prog_store_orig_filter(prog, fprog);
+ if (err) {
+ __bpf_prog_free(prog);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* bpf_prepare_filter() already takes care of freeing
+ * memory in case something goes wrong.
+ */
+ return bpf_prepare_filter(prog, NULL);
+}
+
+/**
+ * sk_attach_filter - attach a socket filter
+ * @fprog: the filter program
+ * @sk: the socket to use
+ *
+ * Attach the user's filter code. We first run some sanity checks on
+ * it to make sure it does not explode on us later. If an error
+ * occurs or there is insufficient memory for the filter a negative
+ * errno code is returned. On success the return is zero.
+ */
+int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
+{
+ struct bpf_prog *prog = __get_filter(fprog, sk);
+ int err;
+
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ err = __sk_attach_prog(prog, sk);
+ if (err < 0) {
+ __bpf_prog_release(prog);
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sk_attach_filter);
+
+int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
+{
+ struct bpf_prog *prog = __get_filter(fprog, sk);
+ int err;
+
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (bpf_prog_size(prog->len) > sysctl_optmem_max)
+ err = -ENOMEM;
+ else
+ err = reuseport_attach_prog(sk, prog);
+
+ if (err)
+ __bpf_prog_release(prog);
+
+ return err;
+}
+
+static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
+{
+ if (sock_flag(sk, SOCK_FILTER_LOCKED))
+ return ERR_PTR(-EPERM);
+
+ return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
+}
+
+int sk_attach_bpf(u32 ufd, struct sock *sk)
+{
+ struct bpf_prog *prog = __get_bpf(ufd, sk);
+ int err;
+
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ err = __sk_attach_prog(prog, sk);
+ if (err < 0) {
+ bpf_prog_put(prog);
+ return err;
+ }
+
+ return 0;
+}
+
+int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
+{
+ struct bpf_prog *prog;
+ int err;
+
+ if (sock_flag(sk, SOCK_FILTER_LOCKED))
+ return -EPERM;
+
+ prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
+ if (IS_ERR(prog) && PTR_ERR(prog) == -EINVAL)
+ prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) {
+		/* Like other non-BPF_PROG_TYPE_SOCKET_FILTER bpf progs
+		 * (e.g. sockmap), this relies on the size limitation
+		 * imposed by bpf_prog_load(). Hence, sysctl_optmem_max
+		 * is not checked here.
+		 */
+ if ((sk->sk_type != SOCK_STREAM &&
+ sk->sk_type != SOCK_DGRAM) ||
+ (sk->sk_protocol != IPPROTO_UDP &&
+ sk->sk_protocol != IPPROTO_TCP) ||
+ (sk->sk_family != AF_INET &&
+ sk->sk_family != AF_INET6)) {
+ err = -ENOTSUPP;
+ goto err_prog_put;
+ }
+ } else {
+ /* BPF_PROG_TYPE_SOCKET_FILTER */
+ if (bpf_prog_size(prog->len) > sysctl_optmem_max) {
+ err = -ENOMEM;
+ goto err_prog_put;
+ }
+ }
+
+ err = reuseport_attach_prog(sk, prog);
+err_prog_put:
+ if (err)
+ bpf_prog_put(prog);
+
+ return err;
+}
+
+void sk_reuseport_prog_free(struct bpf_prog *prog)
+{
+ if (!prog)
+ return;
+
+ if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
+ bpf_prog_put(prog);
+ else
+ bpf_prog_destroy(prog);
+}
+
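+/* Per-CPU scratch buffer used by checksum helpers such as bpf_csum_diff(). */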
+struct bpf_scratchpad {
+ union {
+ __be32 diff[MAX_BPF_STACK / sizeof(__be32)];
+ u8 buff[MAX_BPF_STACK];
+ };
+};
+
+static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
+
+static inline int __bpf_try_make_writable(struct sk_buff *skb,
+ unsigned int write_len)
+{
+ return skb_ensure_writable(skb, write_len);
+}
+
+static inline int bpf_try_make_writable(struct sk_buff *skb,
+ unsigned int write_len)
+{
+ int err = __bpf_try_make_writable(skb, write_len);
+
+ bpf_compute_data_pointers(skb);
+ return err;
+}
+
+static int bpf_try_make_head_writable(struct sk_buff *skb)
+{
+ return bpf_try_make_writable(skb, skb_headlen(skb));
+}
+
+static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
+{
+ if (skb_at_tc_ingress(skb))
+ skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
+}
+
+static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
+{
+ if (skb_at_tc_ingress(skb))
+ skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
+}
+
+BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
+ const void *, from, u32, len, u64, flags)
+{
+ void *ptr;
+
+ if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
+ return -EINVAL;
+ if (unlikely(offset > INT_MAX))
+ return -EFAULT;
+ if (unlikely(bpf_try_make_writable(skb, offset + len)))
+ return -EFAULT;
+
+ ptr = skb->data + offset;
+ if (flags & BPF_F_RECOMPUTE_CSUM)
+ __skb_postpull_rcsum(skb, ptr, len, offset);
+
+ memcpy(ptr, from, len);
+
+ if (flags & BPF_F_RECOMPUTE_CSUM)
+ __skb_postpush_rcsum(skb, ptr, len, offset);
+ if (flags & BPF_F_INVALIDATE_HASH)
+ skb_clear_hash(skb);
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
+ .func = bpf_skb_store_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
+ void *, to, u32, len)
+{
+ void *ptr;
+
+ if (unlikely(offset > INT_MAX))
+ goto err_clear;
+
+ ptr = skb_header_pointer(skb, offset, len, to);
+ if (unlikely(!ptr))
+ goto err_clear;
+ if (ptr != to)
+ memcpy(to, ptr, len);
+
+ return 0;
+err_clear:
+ memset(to, 0, len);
+ return -EFAULT;
+}
+
+static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
+ .func = bpf_skb_load_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
+ u32, offset, void *, to, u32, len, u32, start_header)
+{
+ u8 *end = skb_tail_pointer(skb);
+ u8 *start, *ptr;
+
+ if (unlikely(offset > 0xffff))
+ goto err_clear;
+
+ switch (start_header) {
+ case BPF_HDR_START_MAC:
+ if (unlikely(!skb_mac_header_was_set(skb)))
+ goto err_clear;
+ start = skb_mac_header(skb);
+ break;
+ case BPF_HDR_START_NET:
+ start = skb_network_header(skb);
+ break;
+ default:
+ goto err_clear;
+ }
+
+ ptr = start + offset;
+
+ if (likely(ptr + len <= end)) {
+ memcpy(to, ptr, len);
+ return 0;
+ }
+
+err_clear:
+ memset(to, 0, len);
+ return -EFAULT;
+}
+
+static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = {
+ .func = bpf_skb_load_bytes_relative,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
+{
+ /* The idea is the following: should the needed direct read/write
+ * test fail at runtime, we can pull in more data and redo the
+ * test, since implicitly we invalidate the previous checks here.
+ *
+ * Or, since we know how much we need to make readable/writable,
+ * this can be done once at the beginning of the program for the
+ * direct access case. By this we overcome the limitation of only
+ * the current headroom being accessible.
+ */
+ return bpf_try_make_writable(skb, len ? : skb_headlen(skb));
+}
+
+static const struct bpf_func_proto bpf_skb_pull_data_proto = {
+ .func = bpf_skb_pull_data,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+static inline int sk_skb_try_make_writable(struct sk_buff *skb,
+ unsigned int write_len)
+{
+ int err = __bpf_try_make_writable(skb, write_len);
+
+ bpf_compute_data_end_sk_skb(skb);
+ return err;
+}
+
+BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
+{
+ /* Idea is the following: should the needed direct read/write
+ * test fail during runtime, we can pull in more data and redo
+ * again, since implicitly, we invalidate previous checks here.
+ *
+ * Or, since we know how much we need to make read/writeable,
+ * this can be done once at the program beginning for direct
+ * access case. By this we overcome limitations of only current
+ * headroom being accessible.
+ */
+ return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb));
+}
+
+static const struct bpf_func_proto sk_skb_pull_data_proto = {
+ .func = sk_skb_pull_data,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
+ u64, from, u64, to, u64, flags)
+{
+ __sum16 *ptr;
+
+ if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
+ return -EINVAL;
+ if (unlikely(offset > 0xffff || offset & 1))
+ return -EFAULT;
+ if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
+ return -EFAULT;
+
+ ptr = (__sum16 *)(skb->data + offset);
+ switch (flags & BPF_F_HDR_FIELD_MASK) {
+ case 0:
+ if (unlikely(from != 0))
+ return -EINVAL;
+
+ csum_replace_by_diff(ptr, to);
+ break;
+ case 2:
+ csum_replace2(ptr, from, to);
+ break;
+ case 4:
+ csum_replace4(ptr, from, to);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
+ .func = bpf_l3_csum_replace,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
+ u64, from, u64, to, u64, flags)
+{
+ bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
+ bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
+ bool do_mforce = flags & BPF_F_MARK_ENFORCE;
+ __sum16 *ptr;
+
+ if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
+ BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
+ return -EINVAL;
+ if (unlikely(offset > 0xffff || offset & 1))
+ return -EFAULT;
+ if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
+ return -EFAULT;
+
+ ptr = (__sum16 *)(skb->data + offset);
+ if (is_mmzero && !do_mforce && !*ptr)
+ return 0;
+
+ switch (flags & BPF_F_HDR_FIELD_MASK) {
+ case 0:
+ if (unlikely(from != 0))
+ return -EINVAL;
+
+ inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
+ break;
+ case 2:
+ inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
+ break;
+ case 4:
+ inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (is_mmzero && !*ptr)
+ *ptr = CSUM_MANGLED_0;
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
+ .func = bpf_l4_csum_replace,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
+ __be32 *, to, u32, to_size, __wsum, seed)
+{
+ struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
+ u32 diff_size = from_size + to_size;
+ int i, j = 0;
+
+ /* This is quite flexible, some examples:
+ *
+ * from_size == 0, to_size > 0, seed := csum --> pushing data
+ * from_size > 0, to_size == 0, seed := csum --> pulling data
+ * from_size > 0, to_size > 0, seed := 0 --> diffing data
+ *
+ * Even for diffing, from_size and to_size don't need to be equal.
+ */
+ if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
+ diff_size > sizeof(sp->diff)))
+ return -EINVAL;
+
+ for (i = 0; i < from_size / sizeof(__be32); i++, j++)
+ sp->diff[j] = ~from[i];
+ for (i = 0; i < to_size / sizeof(__be32); i++, j++)
+ sp->diff[j] = to[i];
+
+ return csum_partial(sp->diff, diff_size, seed);
+}
+
+static const struct bpf_func_proto bpf_csum_diff_proto = {
+ .func = bpf_csum_diff,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg4_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg5_type = ARG_ANYTHING,
+};
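
As a rough illustration of the "diffing data" case described in the comment above, a tc program can combine bpf_csum_diff() with the csum-replace helpers to rewrite the IPv4 destination address. This is only a sketch: it assumes current libbpf-style headers, an Ethernet frame with a plain 20-byte IPv4 header followed by TCP, and a made-up 10.0.0.2 target address; a real program would also handle IP options and errors.

    #include <stddef.h>
    #include <linux/bpf.h>
    #include <linux/if_ether.h>
    #include <linux/ip.h>
    #include <linux/tcp.h>
    #include <linux/pkt_cls.h>
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_endian.h>

    #define IP_DST_OFF   (ETH_HLEN + offsetof(struct iphdr, daddr))
    #define IP_CSUM_OFF  (ETH_HLEN + offsetof(struct iphdr, check))
    #define TCP_CSUM_OFF (ETH_HLEN + sizeof(struct iphdr) + \
                          offsetof(struct tcphdr, check))

    SEC("tc")
    int rewrite_daddr(struct __sk_buff *skb)
    {
            __be32 new_ip = bpf_htonl(0x0a000002); /* 10.0.0.2, placeholder */
            __be32 old_ip;
            __s64 diff;

            if (bpf_skb_load_bytes(skb, IP_DST_OFF, &old_ip, sizeof(old_ip)))
                    return TC_ACT_OK;

            /* from_size > 0, to_size > 0, seed == 0: diff old vs. new */
            diff = bpf_csum_diff(&old_ip, sizeof(old_ip),
                                 &new_ip, sizeof(new_ip), 0);

            bpf_skb_store_bytes(skb, IP_DST_OFF, &new_ip, sizeof(new_ip), 0);
            /* Field size 0 in flags means "to" carries a csum diff */
            bpf_l3_csum_replace(skb, IP_CSUM_OFF, 0, diff, 0);
            /* daddr is also part of the TCP pseudo header */
            bpf_l4_csum_replace(skb, TCP_CSUM_OFF, 0, diff, BPF_F_PSEUDO_HDR);
            return TC_ACT_OK;
    }

    char LICENSE[] SEC("license") = "GPL";
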
+
+BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
+{
+ /* The interface is to be used in combination with bpf_csum_diff()
+ * for direct packet writes. csum rotation for alignment as well
+ * as emulating csum_sub() can be done from the eBPF program.
+ */
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ return (skb->csum = csum_add(skb->csum, csum));
+
+ return -ENOTSUPP;
+}
+
+static const struct bpf_func_proto bpf_csum_update_proto = {
+ .func = bpf_csum_update,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ return dev_forward_skb(dev, skb);
+}
+
+static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
+ struct sk_buff *skb)
+{
+ int ret = ____dev_forward_skb(dev, skb);
+
+ if (likely(!ret)) {
+ skb->dev = dev;
+ ret = netif_rx(skb);
+ }
+
+ return ret;
+}
+
+static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ int ret;
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
+ kfree_skb(skb);
+ return -ENETDOWN;
+ }
+
+ skb->dev = dev;
+ skb->tstamp = 0;
+
+ dev_xmit_recursion_inc();
+ ret = dev_queue_xmit(skb);
+ dev_xmit_recursion_dec();
+
+ return ret;
+}
+
+static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
+ u32 flags)
+{
+ unsigned int mlen = skb_network_offset(skb);
+
+ if (mlen) {
+ __skb_pull(skb, mlen);
+
+ /* At ingress, the mac header has already been pulled once.
+ * At egress, skb_postpull_rcsum has to be done in case
+ * the skb originated from ingress (i.e. a forwarded skb)
+ * to ensure that rcsum starts at the net header.
+ */
+ if (!skb_at_tc_ingress(skb))
+ skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
+ }
+ skb_pop_mac_header(skb);
+ skb_reset_mac_len(skb);
+ return flags & BPF_F_INGRESS ?
+ __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb);
+}
+
+static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
+ u32 flags)
+{
+ /* Verify that a link layer header is carried */
+ if (unlikely(skb->mac_header >= skb->network_header)) {
+ kfree_skb(skb);
+ return -ERANGE;
+ }
+
+ bpf_push_mac_rcsum(skb);
+ return flags & BPF_F_INGRESS ?
+ __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
+}
+
+static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
+ u32 flags)
+{
+ if (dev_is_mac_header_xmit(dev))
+ return __bpf_redirect_common(skb, dev, flags);
+ else
+ return __bpf_redirect_no_mac(skb, dev, flags);
+}
+
+BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
+{
+ struct net_device *dev;
+ struct sk_buff *clone;
+ int ret;
+
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return -EINVAL;
+
+ dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
+ if (unlikely(!dev))
+ return -EINVAL;
+
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (unlikely(!clone))
+ return -ENOMEM;
+
+ /* For direct write, we need to keep the invariant that the skbs
+ * we're dealing with are uncloned. Should uncloning fail here,
+ * we need to free the clone we just generated.
+ */
+ ret = bpf_try_make_head_writable(skb);
+ if (unlikely(ret)) {
+ kfree_skb(clone);
+ return -ENOMEM;
+ }
+
+ return __bpf_redirect(clone, dev, flags);
+}
+
+static const struct bpf_func_proto bpf_clone_redirect_proto = {
+ .func = bpf_clone_redirect,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
+EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
+
+BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
+{
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return TC_ACT_SHOT;
+
+ ri->ifindex = ifindex;
+ ri->flags = flags;
+
+ return TC_ACT_REDIRECT;
+}
+
+int skb_do_redirect(struct sk_buff *skb)
+{
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct net_device *dev;
+
+ dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
+ ri->ifindex = 0;
+ if (unlikely(!dev)) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return __bpf_redirect(skb, dev, ri->flags);
+}
+
+static const struct bpf_func_proto bpf_redirect_proto = {
+ .func = bpf_redirect,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_ANYTHING,
+};
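
The split between bpf_redirect() and skb_do_redirect() above means the helper only records the target in the per-CPU bpf_redirect_info and returns TC_ACT_REDIRECT, which the caller then acts on by invoking skb_do_redirect(). A classifier using it can therefore be as small as the sketch below; TARGET_IFINDEX is a placeholder that real programs usually resolve at load time.

    #include <linux/bpf.h>
    #include <linux/pkt_cls.h>
    #include <bpf/bpf_helpers.h>

    #define TARGET_IFINDEX 2 /* placeholder, filled in at load time */

    SEC("tc")
    int redirect_all(struct __sk_buff *skb)
    {
            /* flags 0: egress of the target; BPF_F_INGRESS loops it back in */
            return bpf_redirect(TARGET_IFINDEX, 0);
    }

    char LICENSE[] SEC("license") = "GPL";
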
+
+BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+ /* If user passes invalid input drop the packet. */
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+
+ tcb->bpf.flags = flags;
+ tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
+ if (!tcb->bpf.sk_redir)
+ return SK_DROP;
+
+ return SK_PASS;
+}
+
+static const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
+ .func = bpf_sk_redirect_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
+ struct bpf_map *, map, u32, key, u64, flags)
+{
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+ /* If user passes invalid input drop the packet. */
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+
+ tcb->bpf.flags = flags;
+ tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
+ if (!tcb->bpf.sk_redir)
+ return SK_DROP;
+
+ return SK_PASS;
+}
+
+struct sock *do_sk_redirect_map(struct sk_buff *skb)
+{
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+ return tcb->bpf.sk_redir;
+}
+
+static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
+ .func = bpf_sk_redirect_map,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ /* If user passes invalid input drop the packet. */
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+
+ msg->flags = flags;
+ msg->sk_redir = __sock_hash_lookup_elem(map, key);
+ if (!msg->sk_redir)
+ return SK_DROP;
+
+ return SK_PASS;
+}
+
+static const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
+ .func = bpf_msg_redirect_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
+ struct bpf_map *, map, u32, key, u64, flags)
+{
+ /* If user passes invalid input drop the packet. */
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+
+ msg->flags = flags;
+ msg->sk_redir = __sock_map_lookup_elem(map, key);
+ if (!msg->sk_redir)
+ return SK_DROP;
+
+ return SK_PASS;
+}
+
+struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
+{
+ return msg->sk_redir;
+}
+
+static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
+ .func = bpf_msg_redirect_map,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes)
+{
+ msg->apply_bytes = bytes;
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
+ .func = bpf_msg_apply_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes)
+{
+ msg->cork_bytes = bytes;
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
+ .func = bpf_msg_cork_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+#define sk_msg_iter_var(var) \
+ do { \
+ var++; \
+ if (var == MAX_SKB_FRAGS) \
+ var = 0; \
+ } while (0)
+
+BPF_CALL_4(bpf_msg_pull_data,
+ struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
+{
+ unsigned int len = 0, offset = 0, copy = 0, poffset = 0;
+ int bytes = end - start, bytes_sg_total;
+ struct scatterlist *sg = msg->sg_data;
+ int first_sg, last_sg, i, shift;
+ unsigned char *p, *to, *from;
+ struct page *page;
+
+ if (unlikely(flags || end <= start))
+ return -EINVAL;
+
+ /* First find the starting scatterlist element */
+ i = msg->sg_start;
+ do {
+ len = sg[i].length;
+ if (start < offset + len)
+ break;
+ offset += len;
+ sk_msg_iter_var(i);
+ } while (i != msg->sg_end);
+
+ if (unlikely(start >= offset + len))
+ return -EINVAL;
+
+ first_sg = i;
+ /* The start may point into the sg element so we need to also
+ * account for the headroom.
+ */
+ bytes_sg_total = start - offset + bytes;
+ if (!msg->sg_copy[i] && bytes_sg_total <= len)
+ goto out;
+
+ /* At this point we need to linearize multiple scatterlist
+ * elements or a single shared page. Either way we need to
+ * copy into a linear buffer exclusively owned by BPF. Then
+ * place the buffer in the scatterlist and fixup the original
+ * entries by removing the entries now in the linear buffer
+ * and shifting the remaining entries. For now we do not try
+ * to copy partial entries, to avoid the complexity of running
+ * out of sg_entry slots. The downside is that reading a single
+ * byte will copy the entire sg entry.
+ */
+ do {
+ copy += sg[i].length;
+ sk_msg_iter_var(i);
+ if (bytes_sg_total <= copy)
+ break;
+ } while (i != msg->sg_end);
+ last_sg = i;
+
+ if (unlikely(bytes_sg_total > copy))
+ return -EINVAL;
+
+ page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
+ get_order(copy));
+ if (unlikely(!page))
+ return -ENOMEM;
+ p = page_address(page);
+
+ i = first_sg;
+ do {
+ from = sg_virt(&sg[i]);
+ len = sg[i].length;
+ to = p + poffset;
+
+ memcpy(to, from, len);
+ poffset += len;
+ sg[i].length = 0;
+ put_page(sg_page(&sg[i]));
+
+ sk_msg_iter_var(i);
+ } while (i != last_sg);
+
+ sg[first_sg].length = copy;
+ sg_set_page(&sg[first_sg], page, copy, 0);
+
+ /* To repair sg ring we need to shift entries. If we only
+ * had a single entry though we can just replace it and
+ * be done. Otherwise walk the ring and shift the entries.
+ */
+ WARN_ON_ONCE(last_sg == first_sg);
+ shift = last_sg > first_sg ?
+ last_sg - first_sg - 1 :
+ MAX_SKB_FRAGS - first_sg + last_sg - 1;
+ if (!shift)
+ goto out;
+
+ i = first_sg;
+ sk_msg_iter_var(i);
+ do {
+ int move_from;
+
+ if (i + shift >= MAX_SKB_FRAGS)
+ move_from = i + shift - MAX_SKB_FRAGS;
+ else
+ move_from = i + shift;
+
+ if (move_from == msg->sg_end)
+ break;
+
+ sg[i] = sg[move_from];
+ sg[move_from].length = 0;
+ sg[move_from].page_link = 0;
+ sg[move_from].offset = 0;
+
+ sk_msg_iter_var(i);
+ } while (1);
+ msg->sg_end -= shift;
+ if (msg->sg_end < 0)
+ msg->sg_end += MAX_SKB_FRAGS;
+out:
+ msg->data = sg_virt(&sg[first_sg]) + start - offset;
+ msg->data_end = msg->data + bytes;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_pull_data_proto = {
+ .func = bpf_msg_pull_data,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
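
To connect this with how a program actually uses it: an SK_MSG program calls bpf_msg_pull_data() before doing direct reads, precisely so that the range it needs ends up in the linear buffer described in the comment above. A minimal sketch, assuming libbpf-style headers; the 8-byte header length and the 0x01 opcode are made up for illustration.

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    SEC("sk_msg")
    int msg_filter(struct sk_msg_md *msg)
    {
            __u8 *data, *data_end;

            /* Make the first 8 bytes directly accessible (may linearize) */
            if (bpf_msg_pull_data(msg, 0, 8, 0))
                    return SK_DROP;

            data = (void *)(long)msg->data;
            data_end = (void *)(long)msg->data_end;
            if (data + 8 > data_end)
                    return SK_DROP;

            return data[0] == 0x01 ? SK_PASS : SK_DROP;
    }

    char LICENSE[] SEC("license") = "GPL";
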
+
+BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
+{
+ return task_get_classid(skb);
+}
+
+static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
+ .func = bpf_get_cgroup_classid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb)
+{
+ return dst_tclassid(skb);
+}
+
+static const struct bpf_func_proto bpf_get_route_realm_proto = {
+ .func = bpf_get_route_realm,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb)
+{
+ /* If skb_clear_hash() was called due to mangling, we can
+ * trigger SW recalculation here. Later access to hash
+ * can then use the inline skb->hash via context directly
+ * instead of calling this helper again.
+ */
+ return skb_get_hash(skb);
+}
+
+static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
+ .func = bpf_get_hash_recalc,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
+{
+ /* After all direct packet write, this can be used once for
+ * triggering a lazy recalc on next skb_get_hash() invocation.
+ */
+ skb_clear_hash(skb);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
+ .func = bpf_set_hash_invalid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash)
+{
+ /* Set user specified hash as L4(+), so that it gets returned
+ * on skb_get_hash() call unless BPF prog later on triggers a
+ * skb_clear_hash().
+ */
+ __skb_set_sw_hash(skb, hash, true);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_set_hash_proto = {
+ .func = bpf_set_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
+ u16, vlan_tci)
+{
+ int ret;
+
+ if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
+ vlan_proto != htons(ETH_P_8021AD)))
+ vlan_proto = htons(ETH_P_8021Q);
+
+ bpf_push_mac_rcsum(skb);
+ ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
+ bpf_pull_mac_rcsum(skb);
+
+ bpf_compute_data_pointers(skb);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_vlan_push_proto = {
+ .func = bpf_skb_vlan_push,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
+{
+ int ret;
+
+ bpf_push_mac_rcsum(skb);
+ ret = skb_vlan_pop(skb);
+ bpf_pull_mac_rcsum(skb);
+
+ bpf_compute_data_pointers(skb);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
+ .func = bpf_skb_vlan_pop,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
+{
+ /* Caller already did skb_cow() with len as headroom,
+ * so no need to do it here.
+ */
+ skb_push(skb, len);
+ memmove(skb->data, skb->data + len, off);
+ memset(skb->data + off, 0, len);
+
+ /* No skb_postpush_rcsum(skb, skb->data + off, len)
+ * needed here as it does not change the skb->csum
+ * result for checksum complete when summing over
+ * zeroed blocks.
+ */
+ return 0;
+}
+
+static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
+{
+ /* skb_ensure_writable() is not needed here, as we're
+ * already working on an uncloned skb.
+ */
+ if (unlikely(!pskb_may_pull(skb, off + len)))
+ return -ENOMEM;
+
+ skb_postpull_rcsum(skb, skb->data + off, len);
+ memmove(skb->data + len, skb->data, off);
+ __skb_pull(skb, len);
+
+ return 0;
+}
+
+static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len)
+{
+ bool trans_same = skb->transport_header == skb->network_header;
+ int ret;
+
+ /* There's no need for __skb_push()/__skb_pull() pair to
+ * get to the start of the mac header as we're guaranteed
+ * to always start from here under eBPF.
+ */
+ ret = bpf_skb_generic_push(skb, off, len);
+ if (likely(!ret)) {
+ skb->mac_header -= len;
+ skb->network_header -= len;
+ if (trans_same)
+ skb->transport_header = skb->network_header;
+ }
+
+ return ret;
+}
+
+static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
+{
+ bool trans_same = skb->transport_header == skb->network_header;
+ int ret;
+
+ /* Same here, __skb_push()/__skb_pull() pair not needed. */
+ ret = bpf_skb_generic_pop(skb, off, len);
+ if (likely(!ret)) {
+ skb->mac_header += len;
+ skb->network_header += len;
+ if (trans_same)
+ skb->transport_header = skb->network_header;
+ }
+
+ return ret;
+}
+
+static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
+{
+ const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
+ u32 off = skb_mac_header_len(skb);
+ int ret;
+
+ if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
+ return -ENOTSUPP;
+
+ ret = skb_cow(skb, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ ret = bpf_skb_net_hdr_push(skb, off, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (skb_is_gso(skb)) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ /* SKB_GSO_TCPV4 needs to be changed into
+ * SKB_GSO_TCPV6.
+ */
+ if (shinfo->gso_type & SKB_GSO_TCPV4) {
+ shinfo->gso_type &= ~SKB_GSO_TCPV4;
+ shinfo->gso_type |= SKB_GSO_TCPV6;
+ }
+
+ /* Header must be checked, and gso_segs recomputed. */
+ shinfo->gso_type |= SKB_GSO_DODGY;
+ shinfo->gso_segs = 0;
+ }
+
+ skb->protocol = htons(ETH_P_IPV6);
+ skb_clear_hash(skb);
+
+ return 0;
+}
+
+static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
+{
+ const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
+ u32 off = skb_mac_header_len(skb);
+ int ret;
+
+ if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
+ return -ENOTSUPP;
+
+ ret = skb_unclone(skb, GFP_ATOMIC);
+ if (unlikely(ret < 0))
+ return ret;
+
+ ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (skb_is_gso(skb)) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ /* SKB_GSO_TCPV6 needs to be changed into
+ * SKB_GSO_TCPV4.
+ */
+ if (shinfo->gso_type & SKB_GSO_TCPV6) {
+ shinfo->gso_type &= ~SKB_GSO_TCPV6;
+ shinfo->gso_type |= SKB_GSO_TCPV4;
+ }
+
+ /* Header must be checked, and gso_segs recomputed. */
+ shinfo->gso_type |= SKB_GSO_DODGY;
+ shinfo->gso_segs = 0;
+ }
+
+ skb->protocol = htons(ETH_P_IP);
+ skb_clear_hash(skb);
+
+ return 0;
+}
+
+static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
+{
+ __be16 from_proto = skb_protocol(skb, true);
+
+ if (from_proto == htons(ETH_P_IP) &&
+ to_proto == htons(ETH_P_IPV6))
+ return bpf_skb_proto_4_to_6(skb);
+
+ if (from_proto == htons(ETH_P_IPV6) &&
+ to_proto == htons(ETH_P_IP))
+ return bpf_skb_proto_6_to_4(skb);
+
+ return -ENOTSUPP;
+}
+
+BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
+ u64, flags)
+{
+ int ret;
+
+ if (unlikely(flags))
+ return -EINVAL;
+
+ /* The general idea is that this helper does the basic groundwork
+ * needed for changing the protocol, and the eBPF program fills in
+ * the rest through bpf_skb_store_bytes(), bpf_lX_csum_replace()
+ * and other helpers, rather than passing a raw buffer here.
+ *
+ * The rationale is to keep this minimal and avoid the need to
+ * deal with raw packet data. E.g. even if we passed buffers here,
+ * the program would still need to call the bpf_lX_csum_replace()
+ * helpers anyway. Plus, this way we also keep a separation of
+ * concerns, since e.g. bpf_skb_store_bytes() should only take
+ * care of stores.
+ *
+ * Currently, additional options and extension header space are
+ * not supported, but the flags argument is reserved so we can
+ * adapt that later. For offloads, we mark the packet as dodgy, so
+ * that headers need to be verified first.
+ */
+ ret = bpf_skb_proto_xlat(skb, proto);
+ bpf_compute_data_pointers(skb);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_change_proto_proto = {
+ .func = bpf_skb_change_proto,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type)
+{
+ /* We only allow a restricted subset to be changed for now. */
+ if (unlikely(!skb_pkt_type_ok(skb->pkt_type) ||
+ !skb_pkt_type_ok(pkt_type)))
+ return -EINVAL;
+
+ skb->pkt_type = pkt_type;
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_change_type_proto = {
+ .func = bpf_skb_change_type,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
+{
+ switch (skb_protocol(skb, true)) {
+ case htons(ETH_P_IP):
+ return sizeof(struct iphdr);
+ case htons(ETH_P_IPV6):
+ return sizeof(struct ipv6hdr);
+ default:
+ return ~0U;
+ }
+}
+
+static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff)
+{
+ u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
+ int ret;
+
+ if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
+ return -ENOTSUPP;
+
+ ret = skb_cow(skb, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ ret = bpf_skb_net_hdr_push(skb, off, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (skb_is_gso(skb)) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ /* Due to header grow, MSS needs to be downgraded. */
+ skb_decrease_gso_size(shinfo, len_diff);
+ /* Header must be checked, and gso_segs recomputed. */
+ shinfo->gso_type |= SKB_GSO_DODGY;
+ shinfo->gso_segs = 0;
+ }
+
+ return 0;
+}
+
+static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
+{
+ u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
+ int ret;
+
+ if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
+ return -ENOTSUPP;
+
+ ret = skb_unclone(skb, GFP_ATOMIC);
+ if (unlikely(ret < 0))
+ return ret;
+
+ ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (skb_is_gso(skb)) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ /* Due to header shrink, MSS can be upgraded. */
+ skb_increase_gso_size(shinfo, len_diff);
+ /* Header must be checked, and gso_segs recomputed. */
+ shinfo->gso_type |= SKB_GSO_DODGY;
+ shinfo->gso_segs = 0;
+ }
+
+ return 0;
+}
+
+#define BPF_SKB_MAX_LEN SKB_MAX_ALLOC
+
+static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff)
+{
+ bool trans_same = skb->transport_header == skb->network_header;
+ u32 len_cur, len_diff_abs = abs(len_diff);
+ u32 len_min = bpf_skb_net_base_len(skb);
+ u32 len_max = BPF_SKB_MAX_LEN;
+ __be16 proto = skb_protocol(skb, true);
+ bool shrink = len_diff < 0;
+ int ret;
+
+ if (unlikely(len_diff_abs > 0xfffU))
+ return -EFAULT;
+ if (unlikely(proto != htons(ETH_P_IP) &&
+ proto != htons(ETH_P_IPV6)))
+ return -ENOTSUPP;
+
+ len_cur = skb->len - skb_network_offset(skb);
+ if (skb_transport_header_was_set(skb) && !trans_same)
+ len_cur = skb_network_header_len(skb);
+ if ((shrink && (len_diff_abs >= len_cur ||
+ len_cur - len_diff_abs < len_min)) ||
+ (!shrink && (skb->len + len_diff_abs > len_max &&
+ !skb_is_gso(skb))))
+ return -ENOTSUPP;
+
+ ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) :
+ bpf_skb_net_grow(skb, len_diff_abs);
+
+ bpf_compute_data_pointers(skb);
+ return ret;
+}
+
+BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
+ u32, mode, u64, flags)
+{
+ if (unlikely(flags))
+ return -EINVAL;
+ if (likely(mode == BPF_ADJ_ROOM_NET))
+ return bpf_skb_adjust_net(skb, len_diff);
+
+ return -ENOTSUPP;
+}
+
+static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
+ .func = bpf_skb_adjust_room,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static u32 __bpf_skb_min_len(const struct sk_buff *skb)
+{
+ u32 min_len = skb_network_offset(skb);
+
+ if (skb_transport_header_was_set(skb))
+ min_len = skb_transport_offset(skb);
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ min_len = skb_checksum_start_offset(skb) +
+ skb->csum_offset + sizeof(__sum16);
+ return min_len;
+}
+
+static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
+{
+ unsigned int old_len = skb->len;
+ int ret;
+
+ ret = __skb_grow_rcsum(skb, new_len);
+ if (!ret)
+ memset(skb->data + old_len, 0, new_len - old_len);
+ return ret;
+}
+
+static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
+{
+ return __skb_trim_rcsum(skb, new_len);
+}
+
+static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len,
+ u64 flags)
+{
+ u32 max_len = BPF_SKB_MAX_LEN;
+ u32 min_len = __bpf_skb_min_len(skb);
+ int ret;
+
+ if (unlikely(flags || new_len > max_len || new_len < min_len))
+ return -EINVAL;
+ if (skb->encapsulation)
+ return -ENOTSUPP;
+
+ /* The basic idea of this helper is that it performs the work
+ * needed to either grow or trim an skb, and the eBPF program
+ * rewrites the rest via helpers like bpf_skb_store_bytes(),
+ * bpf_lX_csum_replace() and others rather than passing a raw
+ * buffer here. This is a slow path helper and intended for
+ * replies with control messages.
+ *
+ * Like in bpf_skb_change_proto(), we want to keep this rather
+ * minimal and free of protocol specifics so that we are able
+ * to separate concerns: bpf_skb_store_bytes() should remain the
+ * only helper responsible for writing buffers.
+ *
+ * It is really expected to be a slow path operation here for
+ * control message replies, so we are implicitly linearizing,
+ * uncloning and dropping offloads from the skb by this.
+ */
+ ret = __bpf_try_make_writable(skb, skb->len);
+ if (!ret) {
+ if (new_len > skb->len)
+ ret = bpf_skb_grow_rcsum(skb, new_len);
+ else if (new_len < skb->len)
+ ret = bpf_skb_trim_rcsum(skb, new_len);
+ if (!ret && skb_is_gso(skb))
+ skb_gso_reset(skb);
+ }
+ return ret;
+}
+
+BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
+ u64, flags)
+{
+ int ret = __bpf_skb_change_tail(skb, new_len, flags);
+
+ bpf_compute_data_pointers(skb);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_change_tail_proto = {
+ .func = bpf_skb_change_tail,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
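
Since the comment above positions this as a slow-path helper, a typical (sketched) use is trimming a packet down before handing it to something expensive, e.g. keeping only the first 128 bytes for sampling. Assumes libbpf-style headers; the 128-byte cutoff is arbitrary, and the helper's return value is deliberately ignored here (it fails if the new length would cut into the headers).

    #include <linux/bpf.h>
    #include <linux/pkt_cls.h>
    #include <bpf/bpf_helpers.h>

    SEC("tc")
    int trim_for_sampling(struct __sk_buff *skb)
    {
            if (skb->len > 128)
                    bpf_skb_change_tail(skb, 128, 0);
            return TC_ACT_OK;
    }

    char LICENSE[] SEC("license") = "GPL";
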
+
+BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
+ u64, flags)
+{
+ int ret = __bpf_skb_change_tail(skb, new_len, flags);
+
+ bpf_compute_data_end_sk_skb(skb);
+ return ret;
+}
+
+static const struct bpf_func_proto sk_skb_change_tail_proto = {
+ .func = sk_skb_change_tail,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
+ u64 flags)
+{
+ u32 max_len = BPF_SKB_MAX_LEN;
+ u32 new_len = skb->len + head_room;
+ int ret;
+
+ if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
+ new_len < skb->len))
+ return -EINVAL;
+
+ ret = skb_cow(skb, head_room);
+ if (likely(!ret)) {
+ /* The idea for this helper is that we currently only
+ * allow expanding the mac header. This means that
+ * skb->protocol, the network header, etc. stay as is.
+ * Compared to bpf_skb_change_tail(), we're more
+ * flexible due to not needing to linearize or
+ * reset GSO. The intention for this helper is to be
+ * used by an L3 skb that needs to push a mac header
+ * for redirection into an L2 device.
+ */
+ __skb_push(skb, head_room);
+ memset(skb->data, 0, head_room);
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+ }
+
+ return ret;
+}
+
+BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
+ u64, flags)
+{
+ int ret = __bpf_skb_change_head(skb, head_room, flags);
+
+ bpf_compute_data_pointers(skb);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_change_head_proto = {
+ .func = bpf_skb_change_head,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
+ u64, flags)
+{
+ int ret = __bpf_skb_change_head(skb, head_room, flags);
+
+ bpf_compute_data_end_sk_skb(skb);
+ return ret;
+}
+
+static const struct bpf_func_proto sk_skb_change_head_proto = {
+ .func = sk_skb_change_head,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
+{
+ return xdp_data_meta_unsupported(xdp) ? 0 :
+ xdp->data - xdp->data_meta;
+}
+
+BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
+{
+ void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
+ unsigned long metalen = xdp_get_metalen(xdp);
+ void *data_start = xdp_frame_end + metalen;
+ void *data = xdp->data + offset;
+
+ if (unlikely(data < data_start ||
+ data > xdp->data_end - ETH_HLEN))
+ return -EINVAL;
+
+ if (metalen)
+ memmove(xdp->data_meta + offset,
+ xdp->data_meta, metalen);
+ xdp->data_meta += offset;
+ xdp->data = data;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
+ .func = bpf_xdp_adjust_head,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
+{
+ void *data_end = xdp->data_end + offset;
+
+ /* only shrinking is allowed for now. */
+ if (unlikely(offset >= 0))
+ return -EINVAL;
+
+ if (unlikely(data_end < xdp->data + ETH_HLEN))
+ return -EINVAL;
+
+ xdp->data_end = data_end;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
+ .func = bpf_xdp_adjust_tail,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
+{
+ void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
+ void *meta = xdp->data_meta + offset;
+ unsigned long metalen = xdp->data - meta;
+
+ if (xdp_data_meta_unsupported(xdp))
+ return -ENOTSUPP;
+ if (unlikely(meta < xdp_frame_end ||
+ meta > xdp->data))
+ return -EINVAL;
+ if (unlikely((metalen & (sizeof(__u32) - 1)) ||
+ (metalen > 32)))
+ return -EACCES;
+
+ xdp->data_meta = meta;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
+ .func = bpf_xdp_adjust_meta,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
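
One way to read bpf_xdp_adjust_meta() is as the XDP-side producer of the data_meta area that a later consumer (for instance a tc program reading data_meta) can pick up. A minimal sketch, assuming libbpf-style headers; the 0xcafe value is only a stand-in for whatever classification result the program wants to hand over.

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    SEC("xdp")
    int xdp_store_meta(struct xdp_md *ctx)
    {
            __u32 *meta;

            /* Grow the metadata area by 4 bytes (must stay 4-byte aligned) */
            if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)))
                    return XDP_PASS;

            meta = (void *)(long)ctx->data_meta;
            if ((void *)(meta + 1) > (void *)(long)ctx->data)
                    return XDP_PASS;

            *meta = 0xcafe; /* placeholder classification result */
            return XDP_PASS;
    }

    char LICENSE[] SEC("license") = "GPL";
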
+
+static int __bpf_tx_xdp(struct net_device *dev,
+ struct bpf_map *map,
+ struct xdp_buff *xdp,
+ u32 index)
+{
+ struct xdp_frame *xdpf;
+ int err, sent;
+
+ if (!dev->netdev_ops->ndo_xdp_xmit) {
+ return -EOPNOTSUPP;
+ }
+
+ err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
+ if (unlikely(err))
+ return err;
+
+ xdpf = convert_to_xdp_frame(xdp);
+ if (unlikely(!xdpf))
+ return -EOVERFLOW;
+
+ sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH);
+ if (sent <= 0)
+ return sent;
+ return 0;
+}
+
+static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
+ struct bpf_map *map,
+ struct xdp_buff *xdp,
+ u32 index)
+{
+ int err;
+
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_DEVMAP: {
+ struct bpf_dtab_netdev *dst = fwd;
+
+ err = dev_map_enqueue(dst, xdp, dev_rx);
+ if (err)
+ return err;
+ __dev_map_insert_ctx(map, index);
+ break;
+ }
+ case BPF_MAP_TYPE_CPUMAP: {
+ struct bpf_cpu_map_entry *rcpu = fwd;
+
+ err = cpu_map_enqueue(rcpu, xdp, dev_rx);
+ if (err)
+ return err;
+ __cpu_map_insert_ctx(map, index);
+ break;
+ }
+ case BPF_MAP_TYPE_XSKMAP: {
+ struct xdp_sock *xs = fwd;
+
+ err = __xsk_map_redirect(map, xdp, xs);
+ return err;
+ }
+ default:
+ return -EBADRQC;
+ }
+ return 0;
+}
+
+void xdp_do_flush_map(void)
+{
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_map *map = ri->map_to_flush;
+
+ ri->map_to_flush = NULL;
+ if (map) {
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_DEVMAP:
+ __dev_map_flush(map);
+ break;
+ case BPF_MAP_TYPE_CPUMAP:
+ __cpu_map_flush(map);
+ break;
+ case BPF_MAP_TYPE_XSKMAP:
+ __xsk_map_flush(map);
+ break;
+ default:
+ break;
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(xdp_do_flush_map);
+
+static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
+{
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_DEVMAP:
+ return __dev_map_lookup_elem(map, index);
+ case BPF_MAP_TYPE_CPUMAP:
+ return __cpu_map_lookup_elem(map, index);
+ case BPF_MAP_TYPE_XSKMAP:
+ return __xsk_map_lookup_elem(map, index);
+ default:
+ return NULL;
+ }
+}
+
+void bpf_clear_redirect_map(struct bpf_map *map)
+{
+ struct bpf_redirect_info *ri;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ ri = per_cpu_ptr(&bpf_redirect_info, cpu);
+ /* Avoid polluting a remote cacheline with writes if
+ * not needed. Once we pass this test, we need the
+ * cmpxchg() to make sure it hasn't been changed in
+ * the meantime by a remote CPU.
+ */
+ if (unlikely(READ_ONCE(ri->map) == map))
+ cmpxchg(&ri->map, map, NULL);
+ }
+}
+
+static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
+ struct bpf_prog *xdp_prog, struct bpf_map *map)
+{
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ u32 index = ri->ifindex;
+ void *fwd = NULL;
+ int err;
+
+ ri->ifindex = 0;
+ WRITE_ONCE(ri->map, NULL);
+
+ fwd = __xdp_map_lookup_elem(map, index);
+ if (!fwd) {
+ err = -EINVAL;
+ goto err;
+ }
+ if (ri->map_to_flush && ri->map_to_flush != map)
+ xdp_do_flush_map();
+
+ err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index);
+ if (unlikely(err))
+ goto err;
+
+ ri->map_to_flush = map;
+ _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
+ return 0;
+err:
+ _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
+ return err;
+}
+
+int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
+ struct bpf_prog *xdp_prog)
+{
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_map *map = READ_ONCE(ri->map);
+ struct net_device *fwd;
+ u32 index = ri->ifindex;
+ int err;
+
+ if (map)
+ return xdp_do_redirect_map(dev, xdp, xdp_prog, map);
+
+ fwd = dev_get_by_index_rcu(dev_net(dev), index);
+ ri->ifindex = 0;
+ if (unlikely(!fwd)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ err = __bpf_tx_xdp(fwd, NULL, xdp, 0);
+ if (unlikely(err))
+ goto err;
+
+ _trace_xdp_redirect(dev, xdp_prog, index);
+ return 0;
+err:
+ _trace_xdp_redirect_err(dev, xdp_prog, index, err);
+ return err;
+}
+EXPORT_SYMBOL_GPL(xdp_do_redirect);
+
+static int xdp_do_generic_redirect_map(struct net_device *dev,
+ struct sk_buff *skb,
+ struct xdp_buff *xdp,
+ struct bpf_prog *xdp_prog,
+ struct bpf_map *map)
+{
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ u32 index = ri->ifindex;
+ void *fwd = NULL;
+ int err = 0;
+
+ ri->ifindex = 0;
+ WRITE_ONCE(ri->map, NULL);
+
+ fwd = __xdp_map_lookup_elem(map, index);
+ if (unlikely(!fwd)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+ struct bpf_dtab_netdev *dst = fwd;
+
+ err = dev_map_generic_redirect(dst, skb, xdp_prog);
+ if (unlikely(err))
+ goto err;
+ } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
+ struct xdp_sock *xs = fwd;
+
+ err = xsk_generic_rcv(xs, xdp);
+ if (err)
+ goto err;
+ consume_skb(skb);
+ } else {
+ /* TODO: Handle BPF_MAP_TYPE_CPUMAP */
+ err = -EBADRQC;
+ goto err;
+ }
+
+ _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
+ return 0;
+err:
+ _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
+ return err;
+}
+
+int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
+ struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
+{
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_map *map = READ_ONCE(ri->map);
+ u32 index = ri->ifindex;
+ struct net_device *fwd;
+ int err = 0;
+
+ if (map)
+ return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
+ map);
+ ri->ifindex = 0;
+ fwd = dev_get_by_index_rcu(dev_net(dev), index);
+ if (unlikely(!fwd)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ err = xdp_ok_fwd_dev(fwd, skb->len);
+ if (unlikely(err))
+ goto err;
+
+ skb->dev = fwd;
+ _trace_xdp_redirect(dev, xdp_prog, index);
+ generic_xdp_tx(skb, xdp_prog);
+ return 0;
+err:
+ _trace_xdp_redirect_err(dev, xdp_prog, index, err);
+ return err;
+}
+EXPORT_SYMBOL_GPL(xdp_do_generic_redirect);
+
+BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
+{
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+ if (unlikely(flags))
+ return XDP_ABORTED;
+
+ ri->ifindex = ifindex;
+ ri->flags = flags;
+ WRITE_ONCE(ri->map, NULL);
+
+ return XDP_REDIRECT;
+}
+
+static const struct bpf_func_proto bpf_xdp_redirect_proto = {
+ .func = bpf_xdp_redirect,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
+ u64, flags)
+{
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+ if (unlikely(flags))
+ return XDP_ABORTED;
+
+ ri->ifindex = ifindex;
+ ri->flags = flags;
+ WRITE_ONCE(ri->map, map);
+
+ return XDP_REDIRECT;
+}
+
+static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
+ .func = bpf_xdp_redirect_map,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
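
For the map-based variant, the helper likewise only stashes the map and index in bpf_redirect_info; the actual transmit happens later in xdp_do_redirect_map() via the per-map-type enqueue paths above. A sketch of the program side with a DEVMAP, assuming a recent libbpf (BTF-style map definition); userspace is expected to populate key 0 with the target ifindex.

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_DEVMAP);
            __uint(max_entries, 1);
            __type(key, __u32);
            __type(value, __u32);
    } tx_port SEC(".maps");

    SEC("xdp")
    int xdp_redirect_devmap(struct xdp_md *ctx)
    {
            return bpf_redirect_map(&tx_port, 0, 0);
    }

    char LICENSE[] SEC("license") = "GPL";
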
+
+static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
+ unsigned long off, unsigned long len)
+{
+ void *ptr = skb_header_pointer(skb, off, len, dst_buff);
+
+ if (unlikely(!ptr))
+ return len;
+ if (ptr != dst_buff)
+ memcpy(dst_buff, ptr, len);
+
+ return 0;
+}
+
+BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
+ u64, flags, void *, meta, u64, meta_size)
+{
+ u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
+
+ if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
+ return -EINVAL;
+ if (unlikely(skb_size > skb->len))
+ return -EFAULT;
+
+ return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
+ bpf_skb_copy);
+}
+
+static const struct bpf_func_proto bpf_skb_event_output_proto = {
+ .func = bpf_skb_event_output,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+static unsigned short bpf_tunnel_key_af(u64 flags)
+{
+ return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
+}
+
+BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to,
+ u32, size, u64, flags)
+{
+ const struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ u8 compat[sizeof(struct bpf_tunnel_key)];
+ void *to_orig = to;
+ int err;
+
+ if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) {
+ err = -EINVAL;
+ goto err_clear;
+ }
+ if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
+ err = -EPROTO;
+ goto err_clear;
+ }
+ if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
+ err = -EINVAL;
+ switch (size) {
+ case offsetof(struct bpf_tunnel_key, tunnel_label):
+ case offsetof(struct bpf_tunnel_key, tunnel_ext):
+ goto set_compat;
+ case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
+ /* Fixup deprecated structure layouts here, so we have
+ * a common path later on.
+ */
+ if (ip_tunnel_info_af(info) != AF_INET)
+ goto err_clear;
+set_compat:
+ to = (struct bpf_tunnel_key *)compat;
+ break;
+ default:
+ goto err_clear;
+ }
+ }
+
+ to->tunnel_id = be64_to_cpu(info->key.tun_id);
+ to->tunnel_tos = info->key.tos;
+ to->tunnel_ttl = info->key.ttl;
+ to->tunnel_ext = 0;
+
+ if (flags & BPF_F_TUNINFO_IPV6) {
+ memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
+ sizeof(to->remote_ipv6));
+ to->tunnel_label = be32_to_cpu(info->key.label);
+ } else {
+ to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
+ memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
+ to->tunnel_label = 0;
+ }
+
+ if (unlikely(size != sizeof(struct bpf_tunnel_key)))
+ memcpy(to_orig, to, size);
+
+ return 0;
+err_clear:
+ memset(to_orig, 0, size);
+ return err;
+}
+
+static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
+ .func = bpf_skb_get_tunnel_key,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
+{
+ const struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ int err;
+
+ if (unlikely(!info ||
+ !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) {
+ err = -ENOENT;
+ goto err_clear;
+ }
+ if (unlikely(size < info->options_len)) {
+ err = -ENOMEM;
+ goto err_clear;
+ }
+
+ ip_tunnel_info_opts_get(to, info);
+ if (size > info->options_len)
+ memset(to + info->options_len, 0, size - info->options_len);
+
+ return info->options_len;
+err_clear:
+ memset(to, 0, size);
+ return err;
+}
+
+static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
+ .func = bpf_skb_get_tunnel_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+static struct metadata_dst __percpu *md_dst;
+
+BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
+ const struct bpf_tunnel_key *, from, u32, size, u64, flags)
+{
+ struct metadata_dst *md = this_cpu_ptr(md_dst);
+ u8 compat[sizeof(struct bpf_tunnel_key)];
+ struct ip_tunnel_info *info;
+
+ if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
+ BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER)))
+ return -EINVAL;
+ if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
+ switch (size) {
+ case offsetof(struct bpf_tunnel_key, tunnel_label):
+ case offsetof(struct bpf_tunnel_key, tunnel_ext):
+ case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
+ /* Fixup deprecated structure layouts here, so we have
+ * a common path later on.
+ */
+ memcpy(compat, from, size);
+ memset(compat + size, 0, sizeof(compat) - size);
+ from = (const struct bpf_tunnel_key *) compat;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+ if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) ||
+ from->tunnel_ext))
+ return -EINVAL;
+
+ skb_dst_drop(skb);
+ dst_hold((struct dst_entry *) md);
+ skb_dst_set(skb, (struct dst_entry *) md);
+
+ info = &md->u.tun_info;
+ memset(info, 0, sizeof(*info));
+ info->mode = IP_TUNNEL_INFO_TX;
+
+ info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
+ if (flags & BPF_F_DONT_FRAGMENT)
+ info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
+ if (flags & BPF_F_ZERO_CSUM_TX)
+ info->key.tun_flags &= ~TUNNEL_CSUM;
+ if (flags & BPF_F_SEQ_NUMBER)
+ info->key.tun_flags |= TUNNEL_SEQ;
+
+ info->key.tun_id = cpu_to_be64(from->tunnel_id);
+ info->key.tos = from->tunnel_tos;
+ info->key.ttl = from->tunnel_ttl;
+
+ if (flags & BPF_F_TUNINFO_IPV6) {
+ info->mode |= IP_TUNNEL_INFO_IPV6;
+ memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
+ sizeof(from->remote_ipv6));
+ info->key.label = cpu_to_be32(from->tunnel_label) &
+ IPV6_FLOWLABEL_MASK;
+ } else {
+ info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
+ }
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
+ .func = bpf_skb_set_tunnel_key,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
+ const u8 *, from, u32, size)
+{
+ struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ const struct metadata_dst *md = this_cpu_ptr(md_dst);
+
+ if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
+ return -EINVAL;
+ if (unlikely(size > IP_TUNNEL_OPTS_MAX))
+ return -ENOMEM;
+
+ ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT);
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
+ .func = bpf_skb_set_tunnel_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+static const struct bpf_func_proto *
+bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
+{
+ if (!md_dst) {
+ struct metadata_dst __percpu *tmp;
+
+ tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
+ METADATA_IP_TUNNEL,
+ GFP_KERNEL);
+ if (!tmp)
+ return NULL;
+ if (cmpxchg(&md_dst, NULL, tmp))
+ metadata_dst_free_percpu(tmp);
+ }
+
+ switch (which) {
+ case BPF_FUNC_skb_set_tunnel_key:
+ return &bpf_skb_set_tunnel_key_proto;
+ case BPF_FUNC_skb_set_tunnel_opt:
+ return &bpf_skb_set_tunnel_opt_proto;
+ default:
+ return NULL;
+ }
+}
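
Putting the set-side helpers together: a tc egress program attached to a collect_md tunnel device (vxlan, geneve, etc. in external mode) fills in a struct bpf_tunnel_key before the packet reaches the tunnel driver. A sketch assuming libbpf-style headers; tunnel id 42 and peer 172.16.1.100 are placeholders, and note that remote_ipv4 is given in host byte order since the helper converts it.

    #include <linux/bpf.h>
    #include <linux/pkt_cls.h>
    #include <bpf/bpf_helpers.h>

    SEC("tc")
    int set_tunnel_key(struct __sk_buff *skb)
    {
            struct bpf_tunnel_key key = {};

            key.tunnel_id = 42;           /* placeholder VNI */
            key.remote_ipv4 = 0xac100164; /* 172.16.1.100, host byte order */
            key.tunnel_ttl = 64;

            if (bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
                                       BPF_F_ZERO_CSUM_TX))
                    return TC_ACT_SHOT;
            return TC_ACT_OK;
    }

    char LICENSE[] SEC("license") = "GPL";
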
+
+BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map,
+ u32, idx)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct cgroup *cgrp;
+ struct sock *sk;
+
+ sk = skb_to_full_sk(skb);
+ if (!sk || !sk_fullsock(sk))
+ return -ENOENT;
+ if (unlikely(idx >= array->map.max_entries))
+ return -E2BIG;
+
+ cgrp = READ_ONCE(array->ptrs[idx]);
+ if (unlikely(!cgrp))
+ return -EAGAIN;
+
+ return sk_under_cgroup_hierarchy(sk, cgrp);
+}
+
+static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
+ .func = bpf_skb_under_cgroup,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+#ifdef CONFIG_SOCK_CGROUP_DATA
+BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
+{
+ struct sock *sk = skb_to_full_sk(skb);
+ struct cgroup *cgrp;
+
+ if (!sk || !sk_fullsock(sk))
+ return 0;
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ return cgrp->kn->id.id;
+}
+
+static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
+ .func = bpf_skb_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
+ ancestor_level)
+{
+ struct sock *sk = skb_to_full_sk(skb);
+ struct cgroup *ancestor;
+ struct cgroup *cgrp;
+
+ if (!sk || !sk_fullsock(sk))
+ return 0;
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ ancestor = cgroup_ancestor(cgrp, ancestor_level);
+ if (!ancestor)
+ return 0;
+
+ return ancestor->kn->id.id;
+}
+
+static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
+ .func = bpf_skb_ancestor_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+#endif
+
+static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
+ unsigned long off, unsigned long len)
+{
+ memcpy(dst_buff, src_buff + off, len);
+ return 0;
+}
+
+BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
+ u64, flags, void *, meta, u64, meta_size)
+{
+ u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
+
+ if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
+ return -EINVAL;
+ if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
+ return -EFAULT;
+
+ return bpf_event_output(map, flags, meta, meta_size, xdp->data,
+ xdp_size, bpf_xdp_copy);
+}
+
+static const struct bpf_func_proto bpf_xdp_event_output_proto = {
+ .func = bpf_xdp_event_output,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
+{
+ return skb->sk ? sock_gen_cookie(skb->sk) : 0;
+}
+
+static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
+ .func = bpf_get_socket_cookie,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
+{
+ return sock_gen_cookie(ctx->sk);
+}
+
+static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
+ .func = bpf_get_socket_cookie_sock_addr,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
+{
+ return sock_gen_cookie(ctx->sk);
+}
+
+static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
+ .func = bpf_get_socket_cookie_sock_ops,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
+{
+ struct sock *sk = sk_to_full_sk(skb->sk);
+ kuid_t kuid;
+
+ if (!sk || !sk_fullsock(sk))
+ return overflowuid;
+ kuid = sock_net_uid(sock_net(sk), sk);
+ return from_kuid_munged(sock_net(sk)->user_ns, kuid);
+}
+
+static const struct bpf_func_proto bpf_get_socket_uid_proto = {
+ .func = bpf_get_socket_uid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ struct sock *sk = bpf_sock->sk;
+ int ret = 0;
+ int val;
+
+ if (!sk_fullsock(sk))
+ return -EINVAL;
+
+ if (level == SOL_SOCKET) {
+ if (optlen != sizeof(int))
+ return -EINVAL;
+ val = *((int *)optval);
+
+ /* Only some socket-level options are supported */
+ switch (optname) {
+ case SO_RCVBUF:
+ val = min_t(u32, val, sysctl_rmem_max);
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
+ break;
+ case SO_SNDBUF:
+ val = min_t(u32, val, sysctl_wmem_max);
+ sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
+ break;
+ case SO_MAX_PACING_RATE:
+ sk->sk_max_pacing_rate = val;
+ sk->sk_pacing_rate = min(sk->sk_pacing_rate,
+ sk->sk_max_pacing_rate);
+ break;
+ case SO_PRIORITY:
+ sk->sk_priority = val;
+ break;
+ case SO_RCVLOWAT:
+ if (val < 0)
+ val = INT_MAX;
+ sk->sk_rcvlowat = val ? : 1;
+ break;
+ case SO_MARK:
+ if (sk->sk_mark != val) {
+ sk->sk_mark = val;
+ sk_dst_reset(sk);
+ }
+ break;
+ default:
+ ret = -EINVAL;
+ }
+#ifdef CONFIG_INET
+ } else if (level == SOL_IP) {
+ if (optlen != sizeof(int) || sk->sk_family != AF_INET)
+ return -EINVAL;
+
+ val = *((int *)optval);
+ /* Only some options are supported */
+ switch (optname) {
+ case IP_TOS:
+ if (val < -1 || val > 0xff) {
+ ret = -EINVAL;
+ } else {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (val == -1)
+ val = 0;
+ inet->tos = val;
+ }
+ break;
+ default:
+ ret = -EINVAL;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (level == SOL_IPV6) {
+ if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
+ return -EINVAL;
+
+ val = *((int *)optval);
+ /* Only some options are supported */
+ switch (optname) {
+ case IPV6_TCLASS:
+ if (val < -1 || val > 0xff) {
+ ret = -EINVAL;
+ } else {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ if (val == -1)
+ val = 0;
+ np->tclass = val;
+ }
+ break;
+ default:
+ ret = -EINVAL;
+ }
+#endif
+ } else if (level == SOL_TCP &&
+ sk->sk_prot->setsockopt == tcp_setsockopt) {
+ if (optname == TCP_CONGESTION) {
+ char name[TCP_CA_NAME_MAX];
+ bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN;
+
+ strncpy(name, optval, min_t(long, optlen,
+ TCP_CA_NAME_MAX-1));
+ name[TCP_CA_NAME_MAX-1] = 0;
+ ret = tcp_set_congestion_control(sk, name, false,
+ reinit, true);
+ } else {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (optlen != sizeof(int))
+ return -EINVAL;
+
+ val = *((int *)optval);
+ /* Only some options are supported */
+ switch (optname) {
+ case TCP_BPF_IW:
+ if (val <= 0 || tp->data_segs_out > tp->syn_data)
+ ret = -EINVAL;
+ else
+ tp->snd_cwnd = val;
+ break;
+ case TCP_BPF_SNDCWND_CLAMP:
+ if (val <= 0) {
+ ret = -EINVAL;
+ } else {
+ tp->snd_cwnd_clamp = val;
+ tp->snd_ssthresh = val;
+ }
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ }
+#endif
+ } else {
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_setsockopt_proto = {
+ .func = bpf_setsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ struct sock *sk = bpf_sock->sk;
+
+ if (!sk_fullsock(sk))
+ goto err_clear;
+
+#ifdef CONFIG_INET
+ if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) {
+ if (optname == TCP_CONGESTION) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (!icsk->icsk_ca_ops || optlen <= 1)
+ goto err_clear;
+ strncpy(optval, icsk->icsk_ca_ops->name, optlen);
+ optval[optlen - 1] = 0;
+ } else {
+ goto err_clear;
+ }
+ } else if (level == SOL_IP) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (optlen != sizeof(int) || sk->sk_family != AF_INET)
+ goto err_clear;
+
+ /* Only some options are supported */
+ switch (optname) {
+ case IP_TOS:
+ *((int *)optval) = (int)inet->tos;
+ break;
+ default:
+ goto err_clear;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (level == SOL_IPV6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
+ goto err_clear;
+
+ /* Only some options are supported */
+ switch (optname) {
+ case IPV6_TCLASS:
+ *((int *)optval) = (int)np->tclass;
+ break;
+ default:
+ goto err_clear;
+ }
+#endif
+ } else {
+ goto err_clear;
+ }
+ return 0;
+#endif
+err_clear:
+ memset(optval, 0, optlen);
+ return -EINVAL;
+}
+
+static const struct bpf_func_proto bpf_getsockopt_proto = {
+ .func = bpf_getsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
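+/* Select which TCP events invoke the sock_ops program's callbacks; returns
+ * the requested flag bits that are not supported (0 on complete success).
+ */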
+BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
+ int, argval)
+{
+ struct sock *sk = bpf_sock->sk;
+ int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
+
+ if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
+ return -EINVAL;
+
+ if (val)
+ tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
+
+ return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
+}
+
+static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = {
+ .func = bpf_sock_ops_cb_flags_set,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
+EXPORT_SYMBOL_GPL(ipv6_bpf_stub);
+
+BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
+ int, addr_len)
+{
+#ifdef CONFIG_INET
+ struct sock *sk = ctx->sk;
+ int err;
+
+ /* Binding to port can be expensive so it's prohibited in the helper.
+ * Only binding to IP is supported.
+ */
+ err = -EINVAL;
+ if (addr->sa_family == AF_INET) {
+ if (addr_len < sizeof(struct sockaddr_in))
+ return err;
+ if (((struct sockaddr_in *)addr)->sin_port != htons(0))
+ return err;
+ return __inet_bind(sk, addr, addr_len, true, false);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (addr->sa_family == AF_INET6) {
+ if (addr_len < SIN6_LEN_RFC2133)
+ return err;
+ if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
+ return err;
+ /* ipv6_bpf_stub cannot be NULL, since it's called from
+ * bpf_cgroup_inet6_connect hook and ipv6 is already loaded
+ */
+ return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false);
+#endif /* CONFIG_IPV6 */
+ }
+#endif /* CONFIG_INET */
+
+ return -EAFNOSUPPORT;
+}
+
+static const struct bpf_func_proto bpf_bind_proto = {
+ .func = bpf_bind,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+#ifdef CONFIG_XFRM
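+/* Copy selected fields of the @index-th xfrm state on skb's sec_path to @to. */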
+BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
+ struct bpf_xfrm_state *, to, u32, size, u64, flags)
+{
+ const struct sec_path *sp = skb_sec_path(skb);
+ const struct xfrm_state *x;
+
+ if (!sp || unlikely(index >= sp->len || flags))
+ goto err_clear;
+
+ x = sp->xvec[index];
+
+ if (unlikely(size != sizeof(struct bpf_xfrm_state)))
+ goto err_clear;
+
+ to->reqid = x->props.reqid;
+ to->spi = x->id.spi;
+ to->family = x->props.family;
+ to->ext = 0;
+
+ if (to->family == AF_INET6) {
+ memcpy(to->remote_ipv6, x->props.saddr.a6,
+ sizeof(to->remote_ipv6));
+ } else {
+ to->remote_ipv4 = x->props.saddr.a4;
+ memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
+ }
+
+ return 0;
+err_clear:
+ memset(to, 0, size);
+ return -EINVAL;
+}
+
+static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
+ .func = bpf_skb_get_xfrm_state,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_ANYTHING,
+};
+#endif
+
+#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
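+/* Fill the L2 forwarding parameters (dmac, smac, egress ifindex) from the
+ * resolved neighbour and egress device.
+ */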
+static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
+ const struct neighbour *neigh,
+ const struct net_device *dev)
+{
+ memcpy(params->dmac, neigh->ha, ETH_ALEN);
+ memcpy(params->smac, dev->dev_addr, ETH_ALEN);
+ params->h_vlan_TCI = 0;
+ params->h_vlan_proto = 0;
+ params->ifindex = dev->ifindex;
+
+ return 0;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_INET)
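+/* IPv4 leg of the bpf_fib_lookup() helper: resolve the route, optionally
+ * check the MTU, fill *params and return a BPF_FIB_LKUP_RET_* code.
+ */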
+static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+ u32 flags, bool check_mtu)
+{
+ struct in_device *in_dev;
+ struct neighbour *neigh;
+ struct net_device *dev;
+ struct fib_result res;
+ struct fib_nh *nh;
+ struct flowi4 fl4;
+ int err;
+ u32 mtu;
+
+ dev = dev_get_by_index_rcu(net, params->ifindex);
+ if (unlikely(!dev))
+ return -ENODEV;
+
+ /* verify forwarding is enabled on this interface */
+ in_dev = __in_dev_get_rcu(dev);
+ if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
+ return BPF_FIB_LKUP_RET_FWD_DISABLED;
+
+ if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+ fl4.flowi4_iif = 1;
+ fl4.flowi4_oif = params->ifindex;
+ } else {
+ fl4.flowi4_iif = params->ifindex;
+ fl4.flowi4_oif = 0;
+ }
+ fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.flowi4_flags = 0;
+
+ fl4.flowi4_proto = params->l4_protocol;
+ fl4.daddr = params->ipv4_dst;
+ fl4.saddr = params->ipv4_src;
+ fl4.fl4_sport = params->sport;
+ fl4.fl4_dport = params->dport;
+
+ if (flags & BPF_FIB_LOOKUP_DIRECT) {
+ u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+ struct fib_table *tb;
+
+ tb = fib_get_table(net, tbid);
+ if (unlikely(!tb))
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
+
+ err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
+ } else {
+ fl4.flowi4_mark = 0;
+ fl4.flowi4_secid = 0;
+ fl4.flowi4_tun_key.tun_id = 0;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
+
+ err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
+ }
+
+ if (err) {
+ /* map fib lookup errors to RTN_ type */
+ if (err == -EINVAL)
+ return BPF_FIB_LKUP_RET_BLACKHOLE;
+ if (err == -EHOSTUNREACH)
+ return BPF_FIB_LKUP_RET_UNREACHABLE;
+ if (err == -EACCES)
+ return BPF_FIB_LKUP_RET_PROHIBIT;
+
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
+ }
+
+ if (res.type != RTN_UNICAST)
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
+
+ if (res.fi->fib_nhs > 1)
+ fib_select_path(net, &res, &fl4, NULL);
+
+ if (check_mtu) {
+ mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
+ if (params->tot_len > mtu)
+ return BPF_FIB_LKUP_RET_FRAG_NEEDED;
+ }
+
+ nh = &res.fi->fib_nh[res.nh_sel];
+
+ /* do not handle lwt encaps right now */
+ if (nh->nh_lwtstate)
+ return BPF_FIB_LKUP_RET_UNSUPP_LWT;
+
+ dev = nh->nh_dev;
+ if (nh->nh_gw)
+ params->ipv4_dst = nh->nh_gw;
+
+ params->rt_metric = res.fi->fib_priority;
+
+ /* xdp and cls_bpf programs are run in RCU-bh so
+ * rcu_read_lock_bh is not needed here
+ */
+ neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
+ if (!neigh)
+ return BPF_FIB_LKUP_RET_NO_NEIGH;
+
+ return bpf_fib_set_fwd_params(params, neigh, dev);
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
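+/* IPv6 leg of the bpf_fib_lookup() helper: resolve the route via the ipv6
+ * stub, optionally check the MTU, fill *params and return a
+ * BPF_FIB_LKUP_RET_* code.
+ */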
+static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+ u32 flags, bool check_mtu)
+{
+ struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
+ struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
+ struct neighbour *neigh;
+ struct net_device *dev;
+ struct inet6_dev *idev;
+ struct fib6_info *f6i;
+ struct flowi6 fl6;
+ int strict = 0;
+ int oif;
+ u32 mtu;
+
+ /* link local addresses are never forwarded */
+ if (rt6_need_strict(dst) || rt6_need_strict(src))
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
+
+ dev = dev_get_by_index_rcu(net, params->ifindex);
+ if (unlikely(!dev))
+ return -ENODEV;
+
+ idev = __in6_dev_get_safely(dev);
+ if (unlikely(!idev || !idev->cnf.forwarding))
+ return BPF_FIB_LKUP_RET_FWD_DISABLED;
+
+ if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+ fl6.flowi6_iif = 1;
+ oif = fl6.flowi6_oif = params->ifindex;
+ } else {
+ oif = fl6.flowi6_iif = params->ifindex;
+ fl6.flowi6_oif = 0;
+ strict = RT6_LOOKUP_F_HAS_SADDR;
+ }
+ fl6.flowlabel = params->flowinfo;
+ fl6.flowi6_scope = 0;
+ fl6.flowi6_flags = 0;
+ fl6.mp_hash = 0;
+
+ fl6.flowi6_proto = params->l4_protocol;
+ fl6.daddr = *dst;
+ fl6.saddr = *src;
+ fl6.fl6_sport = params->sport;
+ fl6.fl6_dport = params->dport;
+
+ if (flags & BPF_FIB_LOOKUP_DIRECT) {
+ u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+ struct fib6_table *tb;
+
+ tb = ipv6_stub->fib6_get_table(net, tbid);
+ if (unlikely(!tb))
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
+
+ f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
+ } else {
+ fl6.flowi6_mark = 0;
+ fl6.flowi6_secid = 0;
+ fl6.flowi6_tun_key.tun_id = 0;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
+
+ f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
+ }
+
+ if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
+
+ if (unlikely(f6i->fib6_flags & RTF_REJECT)) {
+ switch (f6i->fib6_type) {
+ case RTN_BLACKHOLE:
+ return BPF_FIB_LKUP_RET_BLACKHOLE;
+ case RTN_UNREACHABLE:
+ return BPF_FIB_LKUP_RET_UNREACHABLE;
+ case RTN_PROHIBIT:
+ return BPF_FIB_LKUP_RET_PROHIBIT;
+ default:
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
+ }
+ }
+
+ if (f6i->fib6_type != RTN_UNICAST)
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
+
+ if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
+ f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
+ fl6.flowi6_oif, NULL,
+ strict);
+
+ if (check_mtu) {
+ mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
+ if (params->tot_len > mtu)
+ return BPF_FIB_LKUP_RET_FRAG_NEEDED;
+ }
+
+ if (f6i->fib6_nh.nh_lwtstate)
+ return BPF_FIB_LKUP_RET_UNSUPP_LWT;
+
+ if (f6i->fib6_flags & RTF_GATEWAY)
+ *dst = f6i->fib6_nh.nh_gw;
+
+ dev = f6i->fib6_nh.nh_dev;
+ params->rt_metric = f6i->fib6_metric;
+
+ /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
+ * not needed here. Cannot use __ipv6_neigh_lookup_noref here
+ * because we need to get nd_tbl via the stub.
+ */
+ neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
+ ndisc_hashfn, dst, dev);
+ if (!neigh)
+ return BPF_FIB_LKUP_RET_NO_NEIGH;
+
+ return bpf_fib_set_fwd_params(params, neigh, dev);
+}
+#endif
+
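+/* bpf_fib_lookup() for XDP programs; the MTU check against params->tot_len
+ * is always performed.
+ */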
+BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
+ struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+ if (plen < sizeof(*params))
+ return -EINVAL;
+
+ if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
+ return -EINVAL;
+
+ switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+ case AF_INET:
+ return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
+ flags, true);
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
+ flags, true);
+#endif
+ }
+ return -EAFNOSUPPORT;
+}
+
+static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
+ .func = bpf_xdp_fib_lookup,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
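+/* bpf_fib_lookup() for skb (tc) programs; when tot_len is 0 the skb itself
+ * is checked against the egress device instead.
+ */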
+BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
+ struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+ struct net *net = dev_net(skb->dev);
+ int rc = -EAFNOSUPPORT;
+ bool check_mtu = false;
+
+ if (plen < sizeof(*params))
+ return -EINVAL;
+
+ if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
+ return -EINVAL;
+
+ if (params->tot_len)
+ check_mtu = true;
+
+ switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+ case AF_INET:
+ rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu);
+ break;
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu);
+ break;
+#endif
+ }
+
+ if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {
+ struct net_device *dev;
+
+ /* When tot_len isn't provided by the user, check the skb
+ * against the MTU of the net_device resolved by the FIB lookup.
+ */
+ dev = dev_get_by_index_rcu(net, params->ifindex);
+ if (!is_skb_forwardable(dev, skb))
+ rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
+ }
+
+ return rc;
+}
+
+static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
+ .func = bpf_skb_fib_lookup,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
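+/* Validate @hdr as an SRH and either insert it inline into the IPv6 header
+ * chain or encapsulate the packet in an outer IPv6 header carrying it, then
+ * re-resolve the nexthop.
+ */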
+static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
+{
+ int err;
+ struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr;
+
+ if (!seg6_validate_srh(srh, len))
+ return -EINVAL;
+
+ switch (type) {
+ case BPF_LWT_ENCAP_SEG6_INLINE:
+ if (skb_protocol(skb, true) != htons(ETH_P_IPV6))
+ return -EBADMSG;
+
+ err = seg6_do_srh_inline(skb, srh);
+ break;
+ case BPF_LWT_ENCAP_SEG6:
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ bpf_compute_data_pointers(skb);
+ if (err)
+ return err;
+
+ ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ return seg6_lookup_nexthop(skb, NULL, 0);
+}
+#endif /* CONFIG_IPV6_SEG6_BPF */
+
+BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
+ u32, len)
+{
+ switch (type) {
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+ case BPF_LWT_ENCAP_SEG6:
+ case BPF_LWT_ENCAP_SEG6_INLINE:
+ return bpf_push_seg6_encap(skb, type, hdr, len);
+#endif
+ default:
+ return -EINVAL;
+ }
+}
+
+static const struct bpf_func_proto bpf_lwt_push_encap_proto = {
+ .func = bpf_lwt_push_encap,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_CONST_SIZE
+};
+
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
+ const void *, from, u32, len)
+{
+ struct seg6_bpf_srh_state *srh_state =
+ this_cpu_ptr(&seg6_bpf_srh_states);
+ struct ipv6_sr_hdr *srh = srh_state->srh;
+ void *srh_tlvs, *srh_end, *ptr;
+ int srhoff = 0;
+
+ if (srh == NULL)
+ return -EINVAL;
+
+ srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4));
+ srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen);
+
+ ptr = skb->data + offset;
+ if (ptr >= srh_tlvs && ptr + len <= srh_end)
+ srh_state->valid = false;
+ else if (ptr < (void *)&srh->flags ||
+ ptr + len > (void *)&srh->segments)
+ return -EFAULT;
+
+ if (unlikely(bpf_try_make_writable(skb, offset + len)))
+ return -EFAULT;
+ if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+ return -EINVAL;
+ srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
+ memcpy(skb->data + offset, from, len);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
+ .func = bpf_lwt_seg6_store_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_CONST_SIZE
+};
+
+static void bpf_update_srh_state(struct sk_buff *skb)
+{
+ struct seg6_bpf_srh_state *srh_state =
+ this_cpu_ptr(&seg6_bpf_srh_states);
+ int srhoff = 0;
+
+ if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) {
+ srh_state->srh = NULL;
+ } else {
+ srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+ srh_state->hdrlen = srh_state->srh->hdrlen << 3;
+ srh_state->valid = true;
+ }
+}
+
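+/* Apply one of the SRv6 seg6local End* behaviors (End.X, End.T, End.DT6,
+ * End.B6, End.B6.Encaps) selected by @action.
+ */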
+BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
+ u32, action, void *, param, u32, param_len)
+{
+ struct seg6_bpf_srh_state *srh_state =
+ this_cpu_ptr(&seg6_bpf_srh_states);
+ int hdroff = 0;
+ int err;
+
+ switch (action) {
+ case SEG6_LOCAL_ACTION_END_X:
+ if (!seg6_bpf_has_valid_srh(skb))
+ return -EBADMSG;
+ if (param_len != sizeof(struct in6_addr))
+ return -EINVAL;
+ return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0);
+ case SEG6_LOCAL_ACTION_END_T:
+ if (!seg6_bpf_has_valid_srh(skb))
+ return -EBADMSG;
+ if (param_len != sizeof(int))
+ return -EINVAL;
+ return seg6_lookup_nexthop(skb, NULL, *(int *)param);
+ case SEG6_LOCAL_ACTION_END_DT6:
+ if (!seg6_bpf_has_valid_srh(skb))
+ return -EBADMSG;
+ if (param_len != sizeof(int))
+ return -EINVAL;
+
+ if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0)
+ return -EBADMSG;
+ if (!pskb_pull(skb, hdroff))
+ return -EBADMSG;
+
+ skb_postpull_rcsum(skb, skb_network_header(skb), hdroff);
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb->encapsulation = 0;
+
+ bpf_compute_data_pointers(skb);
+ bpf_update_srh_state(skb);
+ return seg6_lookup_nexthop(skb, NULL, *(int *)param);
+ case SEG6_LOCAL_ACTION_END_B6:
+ if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
+ return -EBADMSG;
+ err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE,
+ param, param_len);
+ if (!err)
+ bpf_update_srh_state(skb);
+
+ return err;
+ case SEG6_LOCAL_ACTION_END_B6_ENCAP:
+ if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
+ return -EBADMSG;
+ err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6,
+ param, param_len);
+ if (!err)
+ bpf_update_srh_state(skb);
+
+ return err;
+ default:
+ return -EINVAL;
+ }
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
+ .func = bpf_lwt_seg6_action,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_CONST_SIZE
+};
+
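+/* Grow (len > 0) or shrink (len < 0) the TLV space of the current SRH at
+ * @offset, fixing up the IPv6 payload length and the cached SRH state.
+ */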
+BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
+ s32, len)
+{
+ struct seg6_bpf_srh_state *srh_state =
+ this_cpu_ptr(&seg6_bpf_srh_states);
+ struct ipv6_sr_hdr *srh = srh_state->srh;
+ void *srh_end, *srh_tlvs, *ptr;
+ struct ipv6hdr *hdr;
+ int srhoff = 0;
+ int ret;
+
+ if (unlikely(srh == NULL))
+ return -EINVAL;
+
+ srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) +
+ ((srh->first_segment + 1) << 4));
+ srh_end = (void *)((unsigned char *)srh + sizeof(*srh) +
+ srh_state->hdrlen);
+ ptr = skb->data + offset;
+
+ if (unlikely(ptr < srh_tlvs || ptr > srh_end))
+ return -EFAULT;
+ if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end))
+ return -EFAULT;
+
+ if (len > 0) {
+ ret = skb_cow_head(skb, len);
+ if (unlikely(ret < 0))
+ return ret;
+
+ ret = bpf_skb_net_hdr_push(skb, offset, len);
+ } else {
+ ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len);
+ }
+
+ bpf_compute_data_pointers(skb);
+ if (unlikely(ret < 0))
+ return ret;
+
+ hdr = (struct ipv6hdr *)skb->data;
+ hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+
+ if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+ return -EINVAL;
+ srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+ srh_state->hdrlen += len;
+ srh_state->valid = false;
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
+ .func = bpf_lwt_seg6_adjust_srh,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+#endif /* CONFIG_IPV6_SEG6_BPF */
+
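+/* Used by the verifier: helpers listed here may move or rewrite packet data,
+ * so packet pointers derived before the call must be reloaded afterwards.
+ */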
+bool bpf_helper_changes_pkt_data(void *func)
+{
+ if (func == bpf_skb_vlan_push ||
+ func == bpf_skb_vlan_pop ||
+ func == bpf_skb_store_bytes ||
+ func == bpf_skb_change_proto ||
+ func == bpf_skb_change_head ||
+ func == sk_skb_change_head ||
+ func == bpf_skb_change_tail ||
+ func == sk_skb_change_tail ||
+ func == bpf_skb_adjust_room ||
+ func == bpf_skb_pull_data ||
+ func == sk_skb_pull_data ||
+ func == bpf_clone_redirect ||
+ func == bpf_l3_csum_replace ||
+ func == bpf_l4_csum_replace ||
+ func == bpf_xdp_adjust_head ||
+ func == bpf_xdp_adjust_meta ||
+ func == bpf_msg_pull_data ||
+ func == bpf_xdp_adjust_tail ||
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+ func == bpf_lwt_seg6_store_bytes ||
+ func == bpf_lwt_seg6_adjust_srh ||
+ func == bpf_lwt_seg6_action ||
+#endif
+ func == bpf_lwt_push_encap)
+ return true;
+
+ return false;
+}
+
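+/* Helpers available to every network program type; the per-type
+ * *_func_proto() callbacks below fall back to this set.
+ */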
+static const struct bpf_func_proto *
+bpf_base_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ case BPF_FUNC_get_prandom_u32:
+ return &bpf_get_prandom_u32_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_raw_smp_processor_id_proto;
+ case BPF_FUNC_get_numa_node_id:
+ return &bpf_get_numa_node_id_proto;
+ case BPF_FUNC_tail_call:
+ return &bpf_tail_call_proto;
+ case BPF_FUNC_ktime_get_ns:
+ return &bpf_ktime_get_ns_proto;
+ case BPF_FUNC_trace_printk:
+ if (capable(CAP_SYS_ADMIN))
+ return bpf_get_trace_printk_proto();
+ /* else: fall through */
+ default:
+ return NULL;
+ }
+}
+
+static const struct bpf_func_proto *
+sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ /* inet and inet6 sockets are created in a process
+ * context so there is always a valid uid/gid
+ */
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ /* inet and inet6 sockets are created in a process
+ * context so there is always a valid uid/gid
+ */
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ case BPF_FUNC_bind:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ return &bpf_bind_proto;
+ default:
+ return NULL;
+ }
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_sock_addr_proto;
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_load_bytes_relative:
+ return &bpf_skb_load_bytes_relative_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_proto;
+ case BPF_FUNC_get_socket_uid:
+ return &bpf_get_socket_uid_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
+ default:
+ return sk_filter_func_proto(func_id, prog);
+ }
+}
+
+static const struct bpf_func_proto *
+tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_store_bytes:
+ return &bpf_skb_store_bytes_proto;
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_load_bytes_relative:
+ return &bpf_skb_load_bytes_relative_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &bpf_skb_pull_data_proto;
+ case BPF_FUNC_csum_diff:
+ return &bpf_csum_diff_proto;
+ case BPF_FUNC_csum_update:
+ return &bpf_csum_update_proto;
+ case BPF_FUNC_l3_csum_replace:
+ return &bpf_l3_csum_replace_proto;
+ case BPF_FUNC_l4_csum_replace:
+ return &bpf_l4_csum_replace_proto;
+ case BPF_FUNC_clone_redirect:
+ return &bpf_clone_redirect_proto;
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_proto;
+ case BPF_FUNC_skb_vlan_push:
+ return &bpf_skb_vlan_push_proto;
+ case BPF_FUNC_skb_vlan_pop:
+ return &bpf_skb_vlan_pop_proto;
+ case BPF_FUNC_skb_change_proto:
+ return &bpf_skb_change_proto_proto;
+ case BPF_FUNC_skb_change_type:
+ return &bpf_skb_change_type_proto;
+ case BPF_FUNC_skb_adjust_room:
+ return &bpf_skb_adjust_room_proto;
+ case BPF_FUNC_skb_change_tail:
+ return &bpf_skb_change_tail_proto;
+ case BPF_FUNC_skb_get_tunnel_key:
+ return &bpf_skb_get_tunnel_key_proto;
+ case BPF_FUNC_skb_set_tunnel_key:
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_skb_get_tunnel_opt:
+ return &bpf_skb_get_tunnel_opt_proto;
+ case BPF_FUNC_skb_set_tunnel_opt:
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_redirect:
+ return &bpf_redirect_proto;
+ case BPF_FUNC_get_route_realm:
+ return &bpf_get_route_realm_proto;
+ case BPF_FUNC_get_hash_recalc:
+ return &bpf_get_hash_recalc_proto;
+ case BPF_FUNC_set_hash_invalid:
+ return &bpf_set_hash_invalid_proto;
+ case BPF_FUNC_set_hash:
+ return &bpf_set_hash_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_skb_under_cgroup:
+ return &bpf_skb_under_cgroup_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_proto;
+ case BPF_FUNC_get_socket_uid:
+ return &bpf_get_socket_uid_proto;
+ case BPF_FUNC_fib_lookup:
+ return &bpf_skb_fib_lookup_proto;
+#ifdef CONFIG_XFRM
+ case BPF_FUNC_skb_get_xfrm_state:
+ return &bpf_skb_get_xfrm_state_proto;
+#endif
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case BPF_FUNC_skb_cgroup_id:
+ return &bpf_skb_cgroup_id_proto;
+ case BPF_FUNC_skb_ancestor_cgroup_id:
+ return &bpf_skb_ancestor_cgroup_id_proto;
+#endif
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_perf_event_output:
+ return &bpf_xdp_event_output_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_csum_diff:
+ return &bpf_csum_diff_proto;
+ case BPF_FUNC_xdp_adjust_head:
+ return &bpf_xdp_adjust_head_proto;
+ case BPF_FUNC_xdp_adjust_meta:
+ return &bpf_xdp_adjust_meta_proto;
+ case BPF_FUNC_redirect:
+ return &bpf_xdp_redirect_proto;
+ case BPF_FUNC_redirect_map:
+ return &bpf_xdp_redirect_map_proto;
+ case BPF_FUNC_xdp_adjust_tail:
+ return &bpf_xdp_adjust_tail_proto;
+ case BPF_FUNC_fib_lookup:
+ return &bpf_xdp_fib_lookup_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_setsockopt:
+ return &bpf_setsockopt_proto;
+ case BPF_FUNC_getsockopt:
+ return &bpf_getsockopt_proto;
+ case BPF_FUNC_sock_ops_cb_flags_set:
+ return &bpf_sock_ops_cb_flags_set_proto;
+ case BPF_FUNC_sock_map_update:
+ return &bpf_sock_map_update_proto;
+ case BPF_FUNC_sock_hash_update:
+ return &bpf_sock_hash_update_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_sock_ops_proto;
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_msg_redirect_map:
+ return &bpf_msg_redirect_map_proto;
+ case BPF_FUNC_msg_redirect_hash:
+ return &bpf_msg_redirect_hash_proto;
+ case BPF_FUNC_msg_apply_bytes:
+ return &bpf_msg_apply_bytes_proto;
+ case BPF_FUNC_msg_cork_bytes:
+ return &bpf_msg_cork_bytes_proto;
+ case BPF_FUNC_msg_pull_data:
+ return &bpf_msg_pull_data_proto;
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_store_bytes:
+ return &bpf_skb_store_bytes_proto;
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &sk_skb_pull_data_proto;
+ case BPF_FUNC_skb_change_tail:
+ return &sk_skb_change_tail_proto;
+ case BPF_FUNC_skb_change_head:
+ return &sk_skb_change_head_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_proto;
+ case BPF_FUNC_get_socket_uid:
+ return &bpf_get_socket_uid_proto;
+ case BPF_FUNC_sk_redirect_map:
+ return &bpf_sk_redirect_map_proto;
+ case BPF_FUNC_sk_redirect_hash:
+ return &bpf_sk_redirect_hash_proto;
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &bpf_skb_pull_data_proto;
+ case BPF_FUNC_csum_diff:
+ return &bpf_csum_diff_proto;
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_proto;
+ case BPF_FUNC_get_route_realm:
+ return &bpf_get_route_realm_proto;
+ case BPF_FUNC_get_hash_recalc:
+ return &bpf_get_hash_recalc_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_skb_under_cgroup:
+ return &bpf_skb_under_cgroup_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_lwt_push_encap:
+ return &bpf_lwt_push_encap_proto;
+ default:
+ return lwt_out_func_proto(func_id, prog);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_get_tunnel_key:
+ return &bpf_skb_get_tunnel_key_proto;
+ case BPF_FUNC_skb_set_tunnel_key:
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_skb_get_tunnel_opt:
+ return &bpf_skb_get_tunnel_opt_proto;
+ case BPF_FUNC_skb_set_tunnel_opt:
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_redirect:
+ return &bpf_redirect_proto;
+ case BPF_FUNC_clone_redirect:
+ return &bpf_clone_redirect_proto;
+ case BPF_FUNC_skb_change_tail:
+ return &bpf_skb_change_tail_proto;
+ case BPF_FUNC_skb_change_head:
+ return &bpf_skb_change_head_proto;
+ case BPF_FUNC_skb_store_bytes:
+ return &bpf_skb_store_bytes_proto;
+ case BPF_FUNC_csum_update:
+ return &bpf_csum_update_proto;
+ case BPF_FUNC_l3_csum_replace:
+ return &bpf_l3_csum_replace_proto;
+ case BPF_FUNC_l4_csum_replace:
+ return &bpf_l4_csum_replace_proto;
+ case BPF_FUNC_set_hash_invalid:
+ return &bpf_set_hash_invalid_proto;
+ default:
+ return lwt_out_func_proto(func_id, prog);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+ case BPF_FUNC_lwt_seg6_store_bytes:
+ return &bpf_lwt_seg6_store_bytes_proto;
+ case BPF_FUNC_lwt_seg6_action:
+ return &bpf_lwt_seg6_action_proto;
+ case BPF_FUNC_lwt_seg6_adjust_srh:
+ return &bpf_lwt_seg6_adjust_srh_proto;
+#endif
+ default:
+ return lwt_out_func_proto(func_id, prog);
+ }
+}
+
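+/* Common __sk_buff access checks shared by all skb-based program types. */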
+static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+
+ if (off < 0 || off >= sizeof(struct __sk_buff))
+ return false;
+
+ /* The verifier guarantees that size > 0. */
+ if (off % size != 0)
+ return false;
+
+ switch (off) {
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+ if (off + size > offsetofend(struct __sk_buff, cb[4]))
+ return false;
+ break;
+ case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]):
+ case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]):
+ case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
+ case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
+ case bpf_ctx_range(struct __sk_buff, data):
+ case bpf_ctx_range(struct __sk_buff, data_meta):
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ if (size != size_default)
+ return false;
+ break;
+ default:
+ /* Only narrow read access allowed for now. */
+ if (type == BPF_WRITE) {
+ if (size != size_default)
+ return false;
+ } else {
+ bpf_ctx_record_field_size(info, size_default);
+ if (!bpf_ctx_narrow_access_ok(off, size, size_default))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool sk_filter_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
+ case bpf_ctx_range(struct __sk_buff, data):
+ case bpf_ctx_range(struct __sk_buff, data_meta):
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+ return false;
+ }
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ return bpf_skb_is_valid_access(off, size, type, prog, info);
+}
+
+static bool lwt_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
+ case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+ case bpf_ctx_range(struct __sk_buff, data_meta):
+ return false;
+ }
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, mark):
+ case bpf_ctx_range(struct __sk_buff, priority):
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, data):
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return bpf_skb_is_valid_access(off, size, type, prog, info);
+}
+
+/* Attach type specific accesses */
+static bool __sock_filter_check_attach_type(int off,
+ enum bpf_access_type access_type,
+ enum bpf_attach_type attach_type)
+{
+ switch (off) {
+ case offsetof(struct bpf_sock, bound_dev_if):
+ case offsetof(struct bpf_sock, mark):
+ case offsetof(struct bpf_sock, priority):
+ switch (attach_type) {
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ goto full_access;
+ default:
+ return false;
+ }
+ case bpf_ctx_range(struct bpf_sock, src_ip4):
+ switch (attach_type) {
+ case BPF_CGROUP_INET4_POST_BIND:
+ goto read_only;
+ default:
+ return false;
+ }
+ case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
+ switch (attach_type) {
+ case BPF_CGROUP_INET6_POST_BIND:
+ goto read_only;
+ default:
+ return false;
+ }
+ case bpf_ctx_range(struct bpf_sock, src_port):
+ switch (attach_type) {
+ case BPF_CGROUP_INET4_POST_BIND:
+ case BPF_CGROUP_INET6_POST_BIND:
+ goto read_only;
+ default:
+ return false;
+ }
+ }
+read_only:
+ return access_type == BPF_READ;
+full_access:
+ return true;
+}
+
+static bool __sock_filter_check_size(int off, int size,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+
+ switch (off) {
+ case bpf_ctx_range(struct bpf_sock, src_ip4):
+ case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
+ bpf_ctx_record_field_size(info, size_default);
+ return bpf_ctx_narrow_access_ok(off, size, size_default);
+ }
+
+ return size == size_default;
+}
+
+static bool sock_filter_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= sizeof(struct bpf_sock))
+ return false;
+ if (off % size != 0)
+ return false;
+ if (!__sock_filter_check_attach_type(off, type,
+ prog->expected_attach_type))
+ return false;
+ if (!__sock_filter_check_size(off, size, info))
+ return false;
+ return true;
+}
+
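+/* Prologue for programs that write packet data directly: if the skb may be
+ * cloned, call bpf_skb_pull_data() first so the writes hit private data.
+ */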
+static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog, int drop_verdict)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ if (!direct_write)
+ return 0;
+
+ /* if (!skb->cloned)
+ * goto start;
+ *
+ * (Fast path; otherwise conservatively assume we might be
+ * a clone and do the rest in the helper.)
+ */
+ *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET());
+ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);
+
+ /* ret = bpf_skb_pull_data(skb, 0); */
+ *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+ *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
+ *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_skb_pull_data);
+ /* if (!ret)
+ * goto restore;
+ * return TC_ACT_SHOT;
+ */
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
+ *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict);
+ *insn++ = BPF_EXIT_INSN();
+
+ /* restore: */
+ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
+ /* start: */
+ *insn++ = prog->insnsi[0];
+
+ return insn - insn_buf;
+}
+
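+/* Rewrite classic LD_ABS/LD_IND instructions into calls to the skb load
+ * helpers; a negative helper result zeroes R0 and exits the program.
+ */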
+static int bpf_gen_ld_abs(const struct bpf_insn *orig,
+ struct bpf_insn *insn_buf)
+{
+ bool indirect = BPF_MODE(orig->code) == BPF_IND;
+ struct bpf_insn *insn = insn_buf;
+
+ if (!indirect) {
+ *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm);
+ } else {
+ *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg);
+ if (orig->imm)
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm);
+ }
+ /* We're guaranteed here that CTX is in R6. */
+ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);
+
+ switch (BPF_SIZE(orig->code)) {
+ case BPF_B:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache);
+ break;
+ case BPF_H:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache);
+ break;
+ case BPF_W:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache);
+ break;
+ }
+
+ *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2);
+ *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
+ *insn++ = BPF_EXIT_INSN();
+
+ return insn - insn_buf;
+}
+
+static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog)
+{
+ return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT);
+}
+
+static bool tc_cls_act_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, mark):
+ case bpf_ctx_range(struct __sk_buff, tc_index):
+ case bpf_ctx_range(struct __sk_buff, priority):
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, data):
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case bpf_ctx_range(struct __sk_buff, data_meta):
+ info->reg_type = PTR_TO_PACKET_META;
+ break;
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+ return false;
+ }
+
+ return bpf_skb_is_valid_access(off, size, type, prog, info);
+}
+
+static bool __is_valid_xdp_access(int off, int size)
+{
+ if (off < 0 || off >= sizeof(struct xdp_md))
+ return false;
+ if (off % size != 0)
+ return false;
+ if (size != sizeof(__u32))
+ return false;
+
+ return true;
+}
+
+static bool xdp_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (type == BPF_WRITE) {
+ if (bpf_prog_is_dev_bound(prog->aux)) {
+ switch (off) {
+ case offsetof(struct xdp_md, rx_queue_index):
+ return __is_valid_xdp_access(off, size);
+ }
+ }
+ return false;
+ }
+
+ switch (off) {
+ case offsetof(struct xdp_md, data):
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct xdp_md, data_meta):
+ info->reg_type = PTR_TO_PACKET_META;
+ break;
+ case offsetof(struct xdp_md, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return __is_valid_xdp_access(off, size);
+}
+
+void bpf_warn_invalid_xdp_action(u32 act)
+{
+ const u32 act_max = XDP_REDIRECT;
+
+ pr_warn_once("%s XDP return value %u, expect packet loss!\n",
+ act > act_max ? "Illegal" : "Driver unsupported",
+ act);
+}
+EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
+
+static bool sock_addr_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+
+ if (off < 0 || off >= sizeof(struct bpf_sock_addr))
+ return false;
+ if (off % size != 0)
+ return false;
+
+ /* Disallow access to IPv6 fields from IPv4 context and vice
+ * versa.
+ */
+ switch (off) {
+ case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ break;
+ default:
+ return false;
+ }
+ break;
+ case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ break;
+ default:
+ return false;
+ }
+ break;
+ case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_UDP4_SENDMSG:
+ break;
+ default:
+ return false;
+ }
+ break;
+ case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+ msg_src_ip6[3]):
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_UDP6_SENDMSG:
+ break;
+ default:
+ return false;
+ }
+ break;
+ }
+
+ switch (off) {
+ case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
+ case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
+ case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
+ case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+ msg_src_ip6[3]):
+ /* Only narrow read access allowed for now. */
+ if (type == BPF_READ) {
+ bpf_ctx_record_field_size(info, size_default);
+ if (!bpf_ctx_narrow_access_ok(off, size, size_default))
+ return false;
+ } else {
+ if (size != size_default)
+ return false;
+ }
+ break;
+ case bpf_ctx_range(struct bpf_sock_addr, user_port):
+ if (size != size_default)
+ return false;
+ break;
+ default:
+ if (type == BPF_READ) {
+ if (size != size_default)
+ return false;
+ } else {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool sock_ops_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+
+ if (off < 0 || off >= sizeof(struct bpf_sock_ops))
+ return false;
+
+ /* The verifier guarantees that size > 0. */
+ if (off % size != 0)
+ return false;
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct bpf_sock_ops, reply):
+ case offsetof(struct bpf_sock_ops, sk_txhash):
+ if (size != size_default)
+ return false;
+ break;
+ default:
+ return false;
+ }
+ } else {
+ switch (off) {
+ case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
+ bytes_acked):
+ if (size != sizeof(__u64))
+ return false;
+ break;
+ default:
+ if (size != size_default)
+ return false;
+ break;
+ }
+ }
+
+ return true;
+}
+
+static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog)
+{
+ return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP);
+}
+
+static bool sk_skb_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
+ case bpf_ctx_range(struct __sk_buff, data_meta):
+ return false;
+ }
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, tc_index):
+ case bpf_ctx_range(struct __sk_buff, priority):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, mark):
+ return false;
+ case bpf_ctx_range(struct __sk_buff, data):
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return bpf_skb_is_valid_access(off, size, type, prog, info);
+}
+
+static bool sk_msg_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (type == BPF_WRITE)
+ return false;
+
+ switch (off) {
+ case offsetof(struct sk_msg_md, data):
+ info->reg_type = PTR_TO_PACKET;
+ if (size != sizeof(__u64))
+ return false;
+ break;
+ case offsetof(struct sk_msg_md, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ if (size != sizeof(__u64))
+ return false;
+ break;
+ default:
+ if (size != sizeof(__u32))
+ return false;
+ }
+
+ if (off < 0 || off >= sizeof(struct sk_msg_md))
+ return false;
+ if (off % size != 0)
+ return false;
+
+ return true;
+}
+
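+/* Translate __sk_buff field accesses into loads/stores on the underlying
+ * sk_buff, qdisc_skb_cb and sock_common layout.
+ */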
+static u32 bpf_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+ int off;
+
+ switch (si->off) {
+ case offsetof(struct __sk_buff, len):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, len, 4,
+ target_size));
+ break;
+
+ case offsetof(struct __sk_buff, protocol):
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, protocol, 2,
+ target_size));
+ break;
+
+ case offsetof(struct __sk_buff, vlan_proto):
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, vlan_proto, 2,
+ target_size));
+ break;
+
+ case offsetof(struct __sk_buff, priority):
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, priority, 4,
+ target_size));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, priority, 4,
+ target_size));
+ break;
+
+ case offsetof(struct __sk_buff, ingress_ifindex):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, skb_iif, 4,
+ target_size));
+ break;
+
+ case offsetof(struct __sk_buff, ifindex):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, dev));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct net_device, ifindex, 4,
+ target_size));
+ break;
+
+ case offsetof(struct __sk_buff, hash):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, hash, 4,
+ target_size));
+ break;
+
+ case offsetof(struct __sk_buff, mark):
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, mark, 4,
+ target_size));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, mark, 4,
+ target_size));
+ break;
+
+ case offsetof(struct __sk_buff, pkt_type):
+ *target_size = 1;
+ *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
+ PKT_TYPE_OFFSET());
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX);
+#ifdef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5);
+#endif
+ break;
+
+ case offsetof(struct __sk_buff, queue_mapping):
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, queue_mapping, 2,
+ target_size));
+ break;
+
+ case offsetof(struct __sk_buff, vlan_present):
+ case offsetof(struct __sk_buff, vlan_tci):
+ BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
+
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, vlan_tci, 2,
+ target_size));
+ if (si->off == offsetof(struct __sk_buff, vlan_tci)) {
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg,
+ ~VLAN_TAG_PRESENT);
+ } else {
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 12);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1);
+ }
+ break;
+
+ case offsetof(struct __sk_buff, cb[0]) ...
+ offsetofend(struct __sk_buff, cb[4]) - 1:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
+ BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
+ offsetof(struct qdisc_skb_cb, data)) %
+ sizeof(__u64));
+
+ prog->cb_access = 1;
+ off = si->off;
+ off -= offsetof(struct __sk_buff, cb[0]);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct qdisc_skb_cb, data);
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg,
+ si->src_reg, off);
+ else
+ *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
+ si->src_reg, off);
+ break;
+
+ case offsetof(struct __sk_buff, tc_classid):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2);
+
+ off = si->off;
+ off -= offsetof(struct __sk_buff, tc_classid);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct qdisc_skb_cb, tc_classid);
+ *target_size = 2;
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg,
+ si->src_reg, off);
+ else
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg,
+ si->src_reg, off);
+ break;
+
+ case offsetof(struct __sk_buff, data):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, data));
+ break;
+
+ case offsetof(struct __sk_buff, data_meta):
+ off = si->off;
+ off -= offsetof(struct __sk_buff, data_meta);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct bpf_skb_data_end, data_meta);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+ si->src_reg, off);
+ break;
+
+ case offsetof(struct __sk_buff, data_end):
+ off = si->off;
+ off -= offsetof(struct __sk_buff, data_end);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct bpf_skb_data_end, data_end);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+ si->src_reg, off);
+ break;
+
+ case offsetof(struct __sk_buff, tc_index):
+#ifdef CONFIG_NET_SCHED
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, tc_index, 2,
+ target_size));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, tc_index, 2,
+ target_size));
+#else
+ *target_size = 2;
+ if (type == BPF_WRITE)
+ *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
+ else
+ *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct __sk_buff, napi_id):
+#if defined(CONFIG_NET_RX_BUSY_POLL)
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, napi_id, 4,
+ target_size));
+ *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1);
+ *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
+#else
+ *target_size = 4;
+ *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
+#endif
+ break;
+ case offsetof(struct __sk_buff, family):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct sock_common,
+ skc_family,
+ 2, target_size));
+ break;
+ case offsetof(struct __sk_buff, remote_ip4):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct sock_common,
+ skc_daddr,
+ 4, target_size));
+ break;
+ case offsetof(struct __sk_buff, local_ip4):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_rcv_saddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct sock_common,
+ skc_rcv_saddr,
+ 4, target_size));
+ break;
+ case offsetof(struct __sk_buff, remote_ip6[0]) ...
+ offsetof(struct __sk_buff, remote_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct __sk_buff, remote_ip6[0]);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+ case offsetof(struct __sk_buff, local_ip6[0]) ...
+ offsetof(struct __sk_buff, local_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct __sk_buff, local_ip6[0]);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct __sk_buff, remote_port):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct sock_common,
+ skc_dport,
+ 2, target_size));
+#ifndef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
+#endif
+ break;
+
+ case offsetof(struct __sk_buff, local_port):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct sock_common,
+ skc_num, 2, target_size));
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+ int off;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sock, bound_dev_if):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);
+
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_bound_dev_if));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_bound_dev_if));
+ break;
+
+ case offsetof(struct bpf_sock, mark):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_mark) != 4);
+
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_mark));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_mark));
+ break;
+
+ case offsetof(struct bpf_sock, priority):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4);
+
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_priority));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_priority));
+ break;
+
+ case offsetof(struct bpf_sock, family):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_family));
+ break;
+
+ case offsetof(struct bpf_sock, type):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, __sk_flags_offset));
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
+ break;
+
+ case offsetof(struct bpf_sock, protocol):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, __sk_flags_offset));
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT);
+ break;
+
+ case offsetof(struct bpf_sock, src_ip4):
+ *insn++ = BPF_LDX_MEM(
+ BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock_common, skc_rcv_saddr,
+ FIELD_SIZEOF(struct sock_common,
+ skc_rcv_saddr),
+ target_size));
+ break;
+
+ case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ off = si->off;
+ off -= offsetof(struct bpf_sock, src_ip6[0]);
+ *insn++ = BPF_LDX_MEM(
+ BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+ bpf_target_off(
+ struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0],
+ FIELD_SIZEOF(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]),
+ target_size) + off);
+#else
+ (void)off;
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct bpf_sock, src_port):
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct sock_common, skc_num),
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock_common, skc_num,
+ FIELD_SIZEOF(struct sock_common,
+ skc_num),
+ target_size));
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct __sk_buff, ifindex):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, dev));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct net_device, ifindex, 4,
+ target_size));
+ break;
+ default:
+ return bpf_convert_ctx_access(type, si, insn_buf, prog,
+ target_size);
+ }
+
+ return insn - insn_buf;
+}
+
+static u32 xdp_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct xdp_md, data):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
+ si->dst_reg, si->src_reg,
+ offsetof(struct xdp_buff, data));
+ break;
+ case offsetof(struct xdp_md, data_meta):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta),
+ si->dst_reg, si->src_reg,
+ offsetof(struct xdp_buff, data_meta));
+ break;
+ case offsetof(struct xdp_md, data_end):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
+ si->dst_reg, si->src_reg,
+ offsetof(struct xdp_buff, data_end));
+ break;
+ case offsetof(struct xdp_md, ingress_ifindex):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
+ si->dst_reg, si->src_reg,
+ offsetof(struct xdp_buff, rxq));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct xdp_rxq_info, dev));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct net_device, ifindex));
+ break;
+ case offsetof(struct xdp_md, rx_queue_index):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
+ si->dst_reg, si->src_reg,
+ offsetof(struct xdp_buff, rxq));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct xdp_rxq_info,
+ queue_index));
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+/* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF, where S is the type
+ * of the context Structure, F is the Field in the context structure that
+ * contains a pointer to the Nested Structure of type NS that has the field NF.
+ *
+ * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to the caller to
+ * make sure that SIZE is not greater than the actual size of S.F.NF.
+ *
+ * If an offset OFF is provided, the load happens from that offset relative to
+ * the offset of NF.
+ */
+#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \
+ do { \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \
+ si->src_reg, offsetof(S, F)); \
+ *insn++ = BPF_LDX_MEM( \
+ SIZE, si->dst_reg, si->dst_reg, \
+ bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \
+ target_size) \
+ + OFF); \
+ } while (0)
+
+#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \
+ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \
+ BPF_FIELD_SIZEOF(NS, NF), 0)
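+/* Illustrative sketch (not part of the original code): for the user_family
+ * case below, SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
+ * struct sockaddr, uaddr, sa_family) emits roughly
+ *
+ *   dst_reg = *(void **)(src_reg + offsetof(struct bpf_sock_addr_kern, uaddr));
+ *   dst_reg = *(u16 *)(dst_reg + offsetof(struct sockaddr, sa_family));
+ *
+ * i.e. one load to fetch the nested-structure pointer and a second,
+ * appropriately sized load to fetch the requested field through it.
+ */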
+
+/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantics similar to
+ * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for the store operation.
+ *
+ * It doesn't support a SIZE argument, though, since narrow stores are not
+ * supported for now.
+ *
+ * In addition it uses the Temporary Field TF (a member of struct S) as a 3rd
+ * "register", since the two registers available in convert_ctx_access are not
+ * enough: we can overwrite neither SRC, since it contains the value to store,
+ * nor DST, since it contains the pointer to the context that may be used by
+ * later instructions. But we need a temporary place to save the pointer to the
+ * nested structure whose field we want to store to.
+ */
+#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF) \
+ do { \
+ int tmp_reg = BPF_REG_9; \
+ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \
+ --tmp_reg; \
+ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \
+ --tmp_reg; \
+ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \
+ offsetof(S, TF)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \
+ si->dst_reg, offsetof(S, F)); \
+ *insn++ = BPF_STX_MEM( \
+ BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg, \
+ bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \
+ target_size) \
+ + OFF); \
+ *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \
+ offsetof(S, TF)); \
+ } while (0)
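+/* Illustrative sketch (not part of the original code): with tmp_reg chosen as
+ * a register distinct from src_reg and dst_reg, the store macro above emits
+ * roughly
+ *
+ *   *(u64 *)(dst_reg + offsetof(S, TF)) = tmp_reg;
+ *   tmp_reg = *(void **)(dst_reg + offsetof(S, F));
+ *   *(field-sized *)(tmp_reg + offsetof(NS, NF) + OFF) = src_reg;
+ *   tmp_reg = *(u64 *)(dst_reg + offsetof(S, TF));
+ *
+ * i.e. spill tmp_reg into the TF scratch field, load the nested pointer into
+ * it, do the store through it, then refill tmp_reg, so neither of the two
+ * registers handed to us is clobbered.
+ */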
+
+#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \
+ TF) \
+ do { \
+ if (type == BPF_WRITE) { \
+ SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, \
+ TF); \
+ } else { \
+ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \
+ S, NS, F, NF, SIZE, OFF); \
+ } \
+ } while (0)
+
+#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \
+ S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF)
+
+static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+ int off;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sock_addr, user_family):
+ SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
+ struct sockaddr, uaddr, sa_family);
+ break;
+
+ case offsetof(struct bpf_sock_addr, user_ip4):
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct sockaddr_in, uaddr,
+ sin_addr, BPF_SIZE(si->code), 0, tmp_reg);
+ break;
+
+ case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
+ off = si->off;
+ off -= offsetof(struct bpf_sock_addr, user_ip6[0]);
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
+ sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off,
+ tmp_reg);
+ break;
+
+ case offsetof(struct bpf_sock_addr, user_port):
+ /* To get the port we would need to know sa_family first and then
+ * treat sockaddr as either sockaddr_in or sockaddr_in6. We can
+ * simplify, though, since the port field has the same offset and
+ * size in both structures. Check that invariant at build time and
+ * then use just one of the structures.
+ */
+ BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) !=
+ offsetof(struct sockaddr_in6, sin6_port));
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sockaddr_in, sin_port) !=
+ FIELD_SIZEOF(struct sockaddr_in6, sin6_port));
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern,
+ struct sockaddr_in6, uaddr,
+ sin6_port, tmp_reg);
+ break;
+
+ case offsetof(struct bpf_sock_addr, family):
+ SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
+ struct sock, sk, sk_family);
+ break;
+
+ case offsetof(struct bpf_sock_addr, type):
+ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct sock, sk,
+ __sk_flags_offset, BPF_W, 0);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
+ break;
+
+ case offsetof(struct bpf_sock_addr, protocol):
+ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct sock, sk,
+ __sk_flags_offset, BPF_W, 0);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
+ SK_FL_PROTO_SHIFT);
+ break;
+
+ case offsetof(struct bpf_sock_addr, msg_src_ip4):
+ /* Treat t_ctx as struct in_addr for msg_src_ip4. */
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct in_addr, t_ctx,
+ s_addr, BPF_SIZE(si->code), 0, tmp_reg);
+ break;
+
+ case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+ msg_src_ip6[3]):
+ off = si->off;
+ off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]);
+ /* Treat t_ctx as struct in6_addr for msg_src_ip6. */
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
+ s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+ int off;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sock_ops, op) ...
+ offsetof(struct bpf_sock_ops, replylong[3]):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, op) !=
+ FIELD_SIZEOF(struct bpf_sock_ops_kern, op));
+ BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, reply) !=
+ FIELD_SIZEOF(struct bpf_sock_ops_kern, reply));
+ BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, replylong) !=
+ FIELD_SIZEOF(struct bpf_sock_ops_kern, replylong));
+ off = si->off;
+ off -= offsetof(struct bpf_sock_ops, op);
+ off += offsetof(struct bpf_sock_ops_kern, op);
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ off);
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ off);
+ break;
+
+ case offsetof(struct bpf_sock_ops, family):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_family));
+ break;
+
+ case offsetof(struct bpf_sock_ops, remote_ip4):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_daddr));
+ break;
+
+ case offsetof(struct bpf_sock_ops, local_ip4):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_rcv_saddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_rcv_saddr));
+ break;
+
+ case offsetof(struct bpf_sock_ops, remote_ip6[0]) ...
+ offsetof(struct bpf_sock_ops, remote_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct bpf_sock_ops, remote_ip6[0]);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct bpf_sock_ops, local_ip6[0]) ...
+ offsetof(struct bpf_sock_ops, local_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct bpf_sock_ops, local_ip6[0]);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct bpf_sock_ops, remote_port):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_dport));
+#ifndef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
+#endif
+ break;
+
+ case offsetof(struct bpf_sock_ops, local_port):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_num));
+ break;
+
+ case offsetof(struct bpf_sock_ops, is_fullsock):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern,
+ is_fullsock),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ is_fullsock));
+ break;
+
+ case offsetof(struct bpf_sock_ops, state):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_state));
+ break;
+
+ case offsetof(struct bpf_sock_ops, rtt_min):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) !=
+ sizeof(struct minmax));
+ BUILD_BUG_ON(sizeof(struct minmax) <
+ sizeof(struct minmax_sample));
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct tcp_sock, rtt_min) +
+ FIELD_SIZEOF(struct minmax_sample, t));
+ break;
+
+/* Helper macro for adding read access to tcp_sock or sock fields. */
+#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
+ do { \
+ BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \
+ FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, \
+ is_fullsock), \
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ is_fullsock)); \
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, sk),\
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, sk));\
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \
+ OBJ_FIELD), \
+ si->dst_reg, si->dst_reg, \
+ offsetof(OBJ, OBJ_FIELD)); \
+ } while (0)
+
+/* Helper macro for adding write access to tcp_sock or sock fields.
+ * The macro is called with two registers, dst_reg which contains a pointer
+ * to ctx (context) and src_reg which contains the value that should be
+ * stored. However, we need an additional register since we cannot overwrite
+ * dst_reg because it may be used later in the program.
+ * Instead we "borrow" one of the other registers. We first save its value
+ * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
+ * it at the end of the macro.
+ */
+#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
+ do { \
+ int reg = BPF_REG_9; \
+ BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \
+ FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \
+ if (si->dst_reg == reg || si->src_reg == reg) \
+ reg--; \
+ if (si->dst_reg == reg || si->src_reg == reg) \
+ reg--; \
+ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, \
+ is_fullsock), \
+ reg, si->dst_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ is_fullsock)); \
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, sk),\
+ reg, si->dst_reg, \
+ offsetof(struct bpf_sock_ops_kern, sk));\
+ *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \
+ reg, si->src_reg, \
+ offsetof(OBJ, OBJ_FIELD)); \
+ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ } while (0)
+
+#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \
+ do { \
+ if (TYPE == BPF_WRITE) \
+ SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \
+ else \
+ SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \
+ } while (0)
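+/* Illustrative sketch (not part of the original code): for the snd_cwnd case
+ * below, SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock) emits
+ * roughly (with ctx held in src_reg)
+ *
+ *   dst_reg = ctx->is_fullsock;
+ *   if (dst_reg == 0) goto done;
+ *   dst_reg = ctx->sk;
+ *   dst_reg = ((struct tcp_sock *)dst_reg)->snd_cwnd;
+ * done:
+ *
+ * so a program reading such a field on a non-full socket simply sees 0. The
+ * BPF_WRITE path (SOCK_OPS_SET_FIELD) follows the same guarded shape but
+ * routes the access through a spilled temporary register, as described in
+ * the comment above that macro.
+ */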
+
+ case offsetof(struct bpf_sock_ops, snd_cwnd):
+ SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, srtt_us):
+ SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags):
+ SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags,
+ struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, snd_ssthresh):
+ SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, rcv_nxt):
+ SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, snd_nxt):
+ SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, snd_una):
+ SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, mss_cache):
+ SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, ecn_flags):
+ SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, rate_delivered):
+ SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered,
+ struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, rate_interval_us):
+ SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us,
+ struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, packets_out):
+ SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, retrans_out):
+ SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, total_retrans):
+ SOCK_OPS_GET_FIELD(total_retrans, total_retrans,
+ struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, segs_in):
+ SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, data_segs_in):
+ SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, segs_out):
+ SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, data_segs_out):
+ SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out,
+ struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, lost_out):
+ SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, sacked_out):
+ SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, sk_txhash):
+ SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
+ struct sock, type);
+ break;
+
+ case offsetof(struct bpf_sock_ops, bytes_received):
+ SOCK_OPS_GET_FIELD(bytes_received, bytes_received,
+ struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, bytes_acked):
+ SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock);
+ break;
+
+ }
+ return insn - insn_buf;
+}
+
+static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+ int off;
+
+ switch (si->off) {
+ case offsetof(struct __sk_buff, data_end):
+ off = si->off;
+ off -= offsetof(struct __sk_buff, data_end);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct tcp_skb_cb, bpf.data_end);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+ si->src_reg, off);
+ break;
+ default:
+ return bpf_convert_ctx_access(type, si, insn_buf, prog,
+ target_size);
+ }
+
+ return insn - insn_buf;
+}
+
+static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+#if IS_ENABLED(CONFIG_IPV6)
+ int off;
+#endif
+
+ switch (si->off) {
+ case offsetof(struct sk_msg_md, data):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, data));
+ break;
+ case offsetof(struct sk_msg_md, data_end):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, data_end));
+ break;
+ case offsetof(struct sk_msg_md, family):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_family));
+ break;
+
+ case offsetof(struct sk_msg_md, remote_ip4):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_daddr));
+ break;
+
+ case offsetof(struct sk_msg_md, local_ip4):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_rcv_saddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_rcv_saddr));
+ break;
+
+ case offsetof(struct sk_msg_md, remote_ip6[0]) ...
+ offsetof(struct sk_msg_md, remote_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct sk_msg_md, remote_ip6[0]);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct sk_msg_md, local_ip6[0]) ...
+ offsetof(struct sk_msg_md, local_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct sk_msg_md, local_ip6[0]);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct sk_msg_md, remote_port):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_dport));
+#ifndef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
+#endif
+ break;
+
+ case offsetof(struct sk_msg_md, local_port):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_num));
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+const struct bpf_verifier_ops sk_filter_verifier_ops = {
+ .get_func_proto = sk_filter_func_proto,
+ .is_valid_access = sk_filter_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+ .gen_ld_abs = bpf_gen_ld_abs,
+};
+
+const struct bpf_prog_ops sk_filter_prog_ops = {
+ .test_run = bpf_prog_test_run_skb,
+};
+
+const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
+ .get_func_proto = tc_cls_act_func_proto,
+ .is_valid_access = tc_cls_act_is_valid_access,
+ .convert_ctx_access = tc_cls_act_convert_ctx_access,
+ .gen_prologue = tc_cls_act_prologue,
+ .gen_ld_abs = bpf_gen_ld_abs,
+};
+
+const struct bpf_prog_ops tc_cls_act_prog_ops = {
+ .test_run = bpf_prog_test_run_skb,
+};
+
+const struct bpf_verifier_ops xdp_verifier_ops = {
+ .get_func_proto = xdp_func_proto,
+ .is_valid_access = xdp_is_valid_access,
+ .convert_ctx_access = xdp_convert_ctx_access,
+};
+
+const struct bpf_prog_ops xdp_prog_ops = {
+ .test_run = bpf_prog_test_run_xdp,
+};
+
+const struct bpf_verifier_ops cg_skb_verifier_ops = {
+ .get_func_proto = cg_skb_func_proto,
+ .is_valid_access = sk_filter_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops cg_skb_prog_ops = {
+ .test_run = bpf_prog_test_run_skb,
+};
+
+const struct bpf_verifier_ops lwt_in_verifier_ops = {
+ .get_func_proto = lwt_in_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops lwt_in_prog_ops = {
+ .test_run = bpf_prog_test_run_skb,
+};
+
+const struct bpf_verifier_ops lwt_out_verifier_ops = {
+ .get_func_proto = lwt_out_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops lwt_out_prog_ops = {
+ .test_run = bpf_prog_test_run_skb,
+};
+
+const struct bpf_verifier_ops lwt_xmit_verifier_ops = {
+ .get_func_proto = lwt_xmit_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+ .gen_prologue = tc_cls_act_prologue,
+};
+
+const struct bpf_prog_ops lwt_xmit_prog_ops = {
+ .test_run = bpf_prog_test_run_skb,
+};
+
+const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
+ .get_func_proto = lwt_seg6local_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops lwt_seg6local_prog_ops = {
+ .test_run = bpf_prog_test_run_skb,
+};
+
+const struct bpf_verifier_ops cg_sock_verifier_ops = {
+ .get_func_proto = sock_filter_func_proto,
+ .is_valid_access = sock_filter_is_valid_access,
+ .convert_ctx_access = sock_filter_convert_ctx_access,
+};
+
+const struct bpf_prog_ops cg_sock_prog_ops = {
+};
+
+const struct bpf_verifier_ops cg_sock_addr_verifier_ops = {
+ .get_func_proto = sock_addr_func_proto,
+ .is_valid_access = sock_addr_is_valid_access,
+ .convert_ctx_access = sock_addr_convert_ctx_access,
+};
+
+const struct bpf_prog_ops cg_sock_addr_prog_ops = {
+};
+
+const struct bpf_verifier_ops sock_ops_verifier_ops = {
+ .get_func_proto = sock_ops_func_proto,
+ .is_valid_access = sock_ops_is_valid_access,
+ .convert_ctx_access = sock_ops_convert_ctx_access,
+};
+
+const struct bpf_prog_ops sock_ops_prog_ops = {
+};
+
+const struct bpf_verifier_ops sk_skb_verifier_ops = {
+ .get_func_proto = sk_skb_func_proto,
+ .is_valid_access = sk_skb_is_valid_access,
+ .convert_ctx_access = sk_skb_convert_ctx_access,
+ .gen_prologue = sk_skb_prologue,
+};
+
+const struct bpf_prog_ops sk_skb_prog_ops = {
+};
+
+const struct bpf_verifier_ops sk_msg_verifier_ops = {
+ .get_func_proto = sk_msg_func_proto,
+ .is_valid_access = sk_msg_is_valid_access,
+ .convert_ctx_access = sk_msg_convert_ctx_access,
+};
+
+const struct bpf_prog_ops sk_msg_prog_ops = {
+};
+
+int sk_detach_filter(struct sock *sk)
+{
+ int ret = -ENOENT;
+ struct sk_filter *filter;
+
+ if (sock_flag(sk, SOCK_FILTER_LOCKED))
+ return -EPERM;
+
+ filter = rcu_dereference_protected(sk->sk_filter,
+ lockdep_sock_is_held(sk));
+ if (filter) {
+ RCU_INIT_POINTER(sk->sk_filter, NULL);
+ sk_filter_uncharge(sk, filter);
+ ret = 0;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_detach_filter);
+
+int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
+ unsigned int len)
+{
+ struct sock_fprog_kern *fprog;
+ struct sk_filter *filter;
+ int ret = 0;
+
+ lock_sock(sk);
+ filter = rcu_dereference_protected(sk->sk_filter,
+ lockdep_sock_is_held(sk));
+ if (!filter)
+ goto out;
+
+ /* We're copying the filter that was originally attached, so no
+ * conversion/decoding is needed anymore. eBPF programs that have
+ * no original program cannot be dumped through this interface.
+ */
+ ret = -EACCES;
+ fprog = filter->prog->orig_prog;
+ if (!fprog)
+ goto out;
+
+ ret = fprog->len;
+ if (!len)
+ /* User space only enquires number of filter blocks. */
+ goto out;
+
+ ret = -EINVAL;
+ if (len < fprog->len)
+ goto out;
+
+ ret = -EFAULT;
+ if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
+ goto out;
+
+ /* Instead of bytes, the API expects the number of filter
+ * blocks to be returned.
+ */
+ ret = fprog->len;
+out:
+ release_sock(sk);
+ return ret;
+}
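+/* Usage sketch (illustrative, not part of the original code), following the
+ * contract visible above: a caller first passes len == 0 to learn the number
+ * of classic filter blocks, then calls again with a buffer large enough for
+ * that many struct sock_filter entries:
+ *
+ *   int n = sk_get_filter(sk, NULL, 0);
+ *   ...allocate n * sizeof(struct sock_filter) bytes at ubuf...
+ *   n = sk_get_filter(sk, ubuf, n);
+ *
+ * A buffer shorter than the filter (len < fprog->len) is rejected with
+ * -EINVAL, and sockets carrying an eBPF-only program return -EACCES.
+ */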
+
+#ifdef CONFIG_INET
+struct sk_reuseport_kern {
+ struct sk_buff *skb;
+ struct sock *sk;
+ struct sock *selected_sk;
+ void *data_end;
+ u32 hash;
+ u32 reuseport_id;
+ bool bind_inany;
+};
+
+static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
+ struct sock_reuseport *reuse,
+ struct sock *sk, struct sk_buff *skb,
+ u32 hash)
+{
+ reuse_kern->skb = skb;
+ reuse_kern->sk = sk;
+ reuse_kern->selected_sk = NULL;
+ reuse_kern->data_end = skb->data + skb_headlen(skb);
+ reuse_kern->hash = hash;
+ reuse_kern->reuseport_id = reuse->reuseport_id;
+ reuse_kern->bind_inany = reuse->bind_inany;
+}
+
+struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
+ struct bpf_prog *prog, struct sk_buff *skb,
+ u32 hash)
+{
+ struct sk_reuseport_kern reuse_kern;
+ enum sk_action action;
+
+ bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash);
+ action = BPF_PROG_RUN(prog, &reuse_kern);
+
+ if (action == SK_PASS)
+ return reuse_kern.selected_sk;
+ else
+ return ERR_PTR(-ECONNREFUSED);
+}
+
+BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
+ struct bpf_map *, map, void *, key, u32, flags)
+{
+ struct sock_reuseport *reuse;
+ struct sock *selected_sk;
+
+ selected_sk = map->ops->map_lookup_elem(map, key);
+ if (!selected_sk)
+ return -ENOENT;
+
+ reuse = rcu_dereference(selected_sk->sk_reuseport_cb);
+ if (!reuse)
+ /* selected_sk is unhashed (e.g. by close()) after the
+ * above map_lookup_elem(). Treat selected_sk as if it has
+ * already been removed from the map.
+ */
+ return -ENOENT;
+
+ if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) {
+ struct sock *sk;
+
+ if (unlikely(!reuse_kern->reuseport_id))
+ /* There is a small race between adding the
+ * sk to the map and setting the
+ * reuse_kern->reuseport_id.
+ * Treat it as if the sk has not been added
+ * to the bpf map yet.
+ */
+ return -ENOENT;
+
+ sk = reuse_kern->sk;
+ if (sk->sk_protocol != selected_sk->sk_protocol)
+ return -EPROTOTYPE;
+ else if (sk->sk_family != selected_sk->sk_family)
+ return -EAFNOSUPPORT;
+
+ /* Catch all. Likely bound to a different sockaddr. */
+ return -EBADFD;
+ }
+
+ reuse_kern->selected_sk = selected_sk;
+
+ return 0;
+}
+
+static const struct bpf_func_proto sk_select_reuseport_proto = {
+ .func = sk_select_reuseport,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(sk_reuseport_load_bytes,
+ const struct sk_reuseport_kern *, reuse_kern, u32, offset,
+ void *, to, u32, len)
+{
+ return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len);
+}
+
+static const struct bpf_func_proto sk_reuseport_load_bytes_proto = {
+ .func = sk_reuseport_load_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(sk_reuseport_load_bytes_relative,
+ const struct sk_reuseport_kern *, reuse_kern, u32, offset,
+ void *, to, u32, len, u32, start_header)
+{
+ return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to,
+ len, start_header);
+}
+
+static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = {
+ .func = sk_reuseport_load_bytes_relative,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+sk_reuseport_func_proto(enum bpf_func_id func_id,
+ const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_sk_select_reuseport:
+ return &sk_select_reuseport_proto;
+ case BPF_FUNC_skb_load_bytes:
+ return &sk_reuseport_load_bytes_proto;
+ case BPF_FUNC_skb_load_bytes_relative:
+ return &sk_reuseport_load_bytes_relative_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static bool
+sk_reuseport_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const u32 size_default = sizeof(__u32);
+
+ if (off < 0 || off >= sizeof(struct sk_reuseport_md) ||
+ off % size || type != BPF_READ)
+ return false;
+
+ switch (off) {
+ case offsetof(struct sk_reuseport_md, data):
+ info->reg_type = PTR_TO_PACKET;
+ return size == sizeof(__u64);
+
+ case offsetof(struct sk_reuseport_md, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ return size == sizeof(__u64);
+
+ case offsetof(struct sk_reuseport_md, hash):
+ return size == size_default;
+
+ /* Fields that allow narrowing */
+ case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
+ if (size < FIELD_SIZEOF(struct sk_buff, protocol))
+ return false;
+ /* fall through */
+ case bpf_ctx_range(struct sk_reuseport_md, ip_protocol):
+ case bpf_ctx_range(struct sk_reuseport_md, bind_inany):
+ case bpf_ctx_range(struct sk_reuseport_md, len):
+ bpf_ctx_record_field_size(info, size_default);
+ return bpf_ctx_narrow_access_ok(off, size, size_default);
+
+ default:
+ return false;
+ }
+}
+
+#define SK_REUSEPORT_LOAD_FIELD(F) ({ \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \
+ si->dst_reg, si->src_reg, \
+ bpf_target_off(struct sk_reuseport_kern, F, \
+ FIELD_SIZEOF(struct sk_reuseport_kern, F), \
+ target_size)); \
+ })
+
+#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \
+ SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \
+ struct sk_buff, \
+ skb, \
+ SKB_FIELD)
+
+#define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \
+ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern, \
+ struct sock, \
+ sk, \
+ SK_FIELD, BPF_SIZE, EXTRA_OFF)
+
+static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct sk_reuseport_md, data):
+ SK_REUSEPORT_LOAD_SKB_FIELD(data);
+ break;
+
+ case offsetof(struct sk_reuseport_md, len):
+ SK_REUSEPORT_LOAD_SKB_FIELD(len);
+ break;
+
+ case offsetof(struct sk_reuseport_md, eth_protocol):
+ SK_REUSEPORT_LOAD_SKB_FIELD(protocol);
+ break;
+
+ case offsetof(struct sk_reuseport_md, ip_protocol):
+ BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE);
+ SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset,
+ BPF_W, 0);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
+ SK_FL_PROTO_SHIFT);
+ /* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian
+ * aware. No further narrowing or masking is needed.
+ */
+ *target_size = 1;
+ break;
+
+ case offsetof(struct sk_reuseport_md, data_end):
+ SK_REUSEPORT_LOAD_FIELD(data_end);
+ break;
+
+ case offsetof(struct sk_reuseport_md, hash):
+ SK_REUSEPORT_LOAD_FIELD(hash);
+ break;
+
+ case offsetof(struct sk_reuseport_md, bind_inany):
+ SK_REUSEPORT_LOAD_FIELD(bind_inany);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
+ .get_func_proto = sk_reuseport_func_proto,
+ .is_valid_access = sk_reuseport_is_valid_access,
+ .convert_ctx_access = sk_reuseport_convert_ctx_access,
+};
+
+const struct bpf_prog_ops sk_reuseport_prog_ops = {
+};
+#endif /* CONFIG_INET */
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
new file mode 100644
index 000000000..da860a680
--- /dev/null
+++ b/net/core/flow_dissector.c
@@ -0,0 +1,1470 @@
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/export.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/if_vlan.h>
+#include <net/dsa.h>
+#include <net/dst_metadata.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/gre.h>
+#include <net/pptp.h>
+#include <net/tipc.h>
+#include <linux/igmp.h>
+#include <linux/icmp.h>
+#include <linux/sctp.h>
+#include <linux/dccp.h>
+#include <linux/if_tunnel.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_defs.h>
+#include <linux/stddef.h>
+#include <linux/if_ether.h>
+#include <linux/mpls.h>
+#include <linux/tcp.h>
+#include <net/flow_dissector.h>
+#include <scsi/fc/fc_fcoe.h>
+#include <uapi/linux/batadv_packet.h>
+
+static void dissector_set_key(struct flow_dissector *flow_dissector,
+ enum flow_dissector_key_id key_id)
+{
+ flow_dissector->used_keys |= (1 << key_id);
+}
+
+void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
+ const struct flow_dissector_key *key,
+ unsigned int key_count)
+{
+ unsigned int i;
+
+ memset(flow_dissector, 0, sizeof(*flow_dissector));
+
+ for (i = 0; i < key_count; i++, key++) {
+ /* The user should make sure that every key target offset is
+ * within the boundaries of an unsigned short.
+ */
+ BUG_ON(key->offset > USHRT_MAX);
+ BUG_ON(dissector_uses_key(flow_dissector,
+ key->key_id));
+
+ dissector_set_key(flow_dissector, key->key_id);
+ flow_dissector->offset[key->key_id] = key->offset;
+ }
+
+ /* Ensure that the dissector always includes the control and basic keys.
+ * That way we avoid having to handle their absence in the fast path.
+ */
+ BUG_ON(!dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_CONTROL));
+ BUG_ON(!dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_BASIC));
+}
+EXPORT_SYMBOL(skb_flow_dissector_init);
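+/* Usage sketch (illustrative, not part of the original code): a caller
+ * describes where each dissected key lands inside its own target container
+ * and must always include the CONTROL and BASIC keys, e.g. with a
+ * hypothetical container struct my_flow:
+ *
+ *   static const struct flow_dissector_key my_keys[] = {
+ *       { FLOW_DISSECTOR_KEY_CONTROL, offsetof(struct my_flow, control) },
+ *       { FLOW_DISSECTOR_KEY_BASIC,   offsetof(struct my_flow, basic)   },
+ *       { FLOW_DISSECTOR_KEY_PORTS,   offsetof(struct my_flow, ports)   },
+ *   };
+ *   skb_flow_dissector_init(&dissector, my_keys, ARRAY_SIZE(my_keys));
+ */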
+
+/**
+ * skb_flow_get_be16 - extract be16 entity
+ * @skb: sk_buff to extract from
+ * @poff: offset to extract at
+ * @data: raw buffer pointer to the packet
+ * @hlen: packet header length
+ *
+ * The function will try to retrieve a be16 entity at
+ * offset poff.
+ */
+static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff,
+ void *data, int hlen)
+{
+ __be16 *u, _u;
+
+ u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u);
+ if (u)
+ return *u;
+
+ return 0;
+}
+
+/**
+ * __skb_flow_get_ports - extract the upper layer ports and return them
+ * @skb: sk_buff to extract the ports from
+ * @thoff: transport header offset
+ * @ip_proto: protocol for which to get port offset
+ * @data: raw buffer pointer to the packet, if NULL use skb->data
+ * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
+ *
+ * The function will try to retrieve the ports at offset thoff + poff where poff
+ * is the protocol port offset returned from proto_ports_offset
+ */
+__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
+ void *data, int hlen)
+{
+ int poff = proto_ports_offset(ip_proto);
+
+ if (!data) {
+ data = skb->data;
+ hlen = skb_headlen(skb);
+ }
+
+ if (poff >= 0) {
+ __be32 *ports, _ports;
+
+ ports = __skb_header_pointer(skb, thoff + poff,
+ sizeof(_ports), data, hlen, &_ports);
+ if (ports)
+ return *ports;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(__skb_flow_get_ports);
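+/* Illustrative note (not part of the original code): for TCP/UDP the port
+ * offset is 0, so the returned value packs both 16-bit ports exactly as they
+ * appear on the wire; a caller can split them as
+ *
+ *   __be32 ports = __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0);
+ *   u32 v = ntohl(ports);
+ *   u16 sport = v >> 16, dport = v & 0xffff;
+ *
+ * which matches the src/dst layout of struct flow_dissector_key_ports.
+ */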
+
+static void
+skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
+ struct flow_dissector *flow_dissector,
+ void *target_container)
+{
+ struct flow_dissector_key_control *ctrl;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL))
+ return;
+
+ ctrl = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_CONTROL,
+ target_container);
+ ctrl->addr_type = type;
+}
+
+void
+skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container)
+{
+ struct ip_tunnel_info *info;
+ struct ip_tunnel_key *key;
+
+ /* A quick check to see if there might be something to do. */
+ if (!dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_KEYID) &&
+ !dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) &&
+ !dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) &&
+ !dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_CONTROL) &&
+ !dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_PORTS) &&
+ !dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IP) &&
+ !dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_OPTS))
+ return;
+
+ info = skb_tunnel_info(skb);
+ if (!info)
+ return;
+
+ key = &info->key;
+
+ switch (ip_tunnel_info_af(info)) {
+ case AF_INET:
+ skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+ flow_dissector,
+ target_container);
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) {
+ struct flow_dissector_key_ipv4_addrs *ipv4;
+
+ ipv4 = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS,
+ target_container);
+ ipv4->src = key->u.ipv4.src;
+ ipv4->dst = key->u.ipv4.dst;
+ }
+ break;
+ case AF_INET6:
+ skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ flow_dissector,
+ target_container);
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) {
+ struct flow_dissector_key_ipv6_addrs *ipv6;
+
+ ipv6 = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS,
+ target_container);
+ ipv6->src = key->u.ipv6.src;
+ ipv6->dst = key->u.ipv6.dst;
+ }
+ break;
+ }
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) {
+ struct flow_dissector_key_keyid *keyid;
+
+ keyid = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_KEYID,
+ target_container);
+ keyid->keyid = tunnel_id_to_key32(key->tun_id);
+ }
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) {
+ struct flow_dissector_key_ports *tp;
+
+ tp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_PORTS,
+ target_container);
+ tp->src = key->tp_src;
+ tp->dst = key->tp_dst;
+ }
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) {
+ struct flow_dissector_key_ip *ip;
+
+ ip = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IP,
+ target_container);
+ ip->tos = key->tos;
+ ip->ttl = key->ttl;
+ }
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) {
+ struct flow_dissector_key_enc_opts *enc_opt;
+
+ enc_opt = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_OPTS,
+ target_container);
+
+ if (info->options_len) {
+ enc_opt->len = info->options_len;
+ ip_tunnel_info_opts_get(enc_opt->data, info);
+ enc_opt->dst_opt_type = info->key.tun_flags &
+ TUNNEL_OPTIONS_PRESENT;
+ }
+ }
+}
+EXPORT_SYMBOL(skb_flow_dissect_tunnel_info);
+
+static enum flow_dissect_ret
+__skb_flow_dissect_mpls(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, void *data, int nhoff, int hlen)
+{
+ struct flow_dissector_key_keyid *key_keyid;
+ struct mpls_label *hdr, _hdr[2];
+ u32 entry, label;
+
+ if (!dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_MPLS_ENTROPY) &&
+ !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS))
+ return FLOW_DISSECT_RET_OUT_GOOD;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data,
+ hlen, &_hdr);
+ if (!hdr)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ entry = ntohl(hdr[0].entry);
+ label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT;
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) {
+ struct flow_dissector_key_mpls *key_mpls;
+
+ key_mpls = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_MPLS,
+ target_container);
+ key_mpls->mpls_label = label;
+ key_mpls->mpls_ttl = (entry & MPLS_LS_TTL_MASK)
+ >> MPLS_LS_TTL_SHIFT;
+ key_mpls->mpls_tc = (entry & MPLS_LS_TC_MASK)
+ >> MPLS_LS_TC_SHIFT;
+ key_mpls->mpls_bos = (entry & MPLS_LS_S_MASK)
+ >> MPLS_LS_S_SHIFT;
+ }
+
+ if (label == MPLS_LABEL_ENTROPY) {
+ key_keyid = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_MPLS_ENTROPY,
+ target_container);
+ key_keyid->keyid = hdr[1].entry & htonl(MPLS_LS_LABEL_MASK);
+ }
+ return FLOW_DISSECT_RET_OUT_GOOD;
+}
+
+static enum flow_dissect_ret
+__skb_flow_dissect_arp(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, void *data, int nhoff, int hlen)
+{
+ struct flow_dissector_key_arp *key_arp;
+ struct {
+ unsigned char ar_sha[ETH_ALEN];
+ unsigned char ar_sip[4];
+ unsigned char ar_tha[ETH_ALEN];
+ unsigned char ar_tip[4];
+ } *arp_eth, _arp_eth;
+ const struct arphdr *arp;
+ struct arphdr _arp;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ARP))
+ return FLOW_DISSECT_RET_OUT_GOOD;
+
+ arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data,
+ hlen, &_arp);
+ if (!arp)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
+ arp->ar_pro != htons(ETH_P_IP) ||
+ arp->ar_hln != ETH_ALEN ||
+ arp->ar_pln != 4 ||
+ (arp->ar_op != htons(ARPOP_REPLY) &&
+ arp->ar_op != htons(ARPOP_REQUEST)))
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp),
+ sizeof(_arp_eth), data,
+ hlen, &_arp_eth);
+ if (!arp_eth)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ key_arp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ARP,
+ target_container);
+
+ memcpy(&key_arp->sip, arp_eth->ar_sip, sizeof(key_arp->sip));
+ memcpy(&key_arp->tip, arp_eth->ar_tip, sizeof(key_arp->tip));
+
+ /* Only store the lower byte of the opcode;
+ * this covers ARPOP_REPLY and ARPOP_REQUEST.
+ */
+ key_arp->op = ntohs(arp->ar_op) & 0xff;
+
+ ether_addr_copy(key_arp->sha, arp_eth->ar_sha);
+ ether_addr_copy(key_arp->tha, arp_eth->ar_tha);
+
+ return FLOW_DISSECT_RET_OUT_GOOD;
+}
+
+static enum flow_dissect_ret
+__skb_flow_dissect_gre(const struct sk_buff *skb,
+ struct flow_dissector_key_control *key_control,
+ struct flow_dissector *flow_dissector,
+ void *target_container, void *data,
+ __be16 *p_proto, int *p_nhoff, int *p_hlen,
+ unsigned int flags)
+{
+ struct flow_dissector_key_keyid *key_keyid;
+ struct gre_base_hdr *hdr, _hdr;
+ int offset = 0;
+ u16 gre_ver;
+
+ hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr),
+ data, *p_hlen, &_hdr);
+ if (!hdr)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ /* Only look inside GRE without routing */
+ if (hdr->flags & GRE_ROUTING)
+ return FLOW_DISSECT_RET_OUT_GOOD;
+
+ /* Only look inside GRE for version 0 and 1 */
+ gre_ver = ntohs(hdr->flags & GRE_VERSION);
+ if (gre_ver > 1)
+ return FLOW_DISSECT_RET_OUT_GOOD;
+
+ *p_proto = hdr->protocol;
+ if (gre_ver) {
+ /* Version 1 must be PPTP; also check the flags */
+ if (!(*p_proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY)))
+ return FLOW_DISSECT_RET_OUT_GOOD;
+ }
+
+ offset += sizeof(struct gre_base_hdr);
+
+ if (hdr->flags & GRE_CSUM)
+ offset += sizeof(((struct gre_full_hdr *) 0)->csum) +
+ sizeof(((struct gre_full_hdr *) 0)->reserved1);
+
+ if (hdr->flags & GRE_KEY) {
+ const __be32 *keyid;
+ __be32 _keyid;
+
+ keyid = __skb_header_pointer(skb, *p_nhoff + offset,
+ sizeof(_keyid),
+ data, *p_hlen, &_keyid);
+ if (!keyid)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_GRE_KEYID)) {
+ key_keyid = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_GRE_KEYID,
+ target_container);
+ if (gre_ver == 0)
+ key_keyid->keyid = *keyid;
+ else
+ key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK;
+ }
+ offset += sizeof(((struct gre_full_hdr *) 0)->key);
+ }
+
+ if (hdr->flags & GRE_SEQ)
+ offset += sizeof(((struct pptp_gre_header *) 0)->seq);
+
+ if (gre_ver == 0) {
+ if (*p_proto == htons(ETH_P_TEB)) {
+ const struct ethhdr *eth;
+ struct ethhdr _eth;
+
+ eth = __skb_header_pointer(skb, *p_nhoff + offset,
+ sizeof(_eth),
+ data, *p_hlen, &_eth);
+ if (!eth)
+ return FLOW_DISSECT_RET_OUT_BAD;
+ *p_proto = eth->h_proto;
+ offset += sizeof(*eth);
+
+ /* Cap headers that we access via pointers at the
+ * end of the Ethernet header as our maximum alignment
+ * at that point is only 2 bytes.
+ */
+ if (NET_IP_ALIGN)
+ *p_hlen = *p_nhoff + offset;
+ }
+ } else { /* version 1, must be PPTP */
+ u8 _ppp_hdr[PPP_HDRLEN];
+ u8 *ppp_hdr;
+
+ if (hdr->flags & GRE_ACK)
+ offset += sizeof(((struct pptp_gre_header *) 0)->ack);
+
+ ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset,
+ sizeof(_ppp_hdr),
+ data, *p_hlen, _ppp_hdr);
+ if (!ppp_hdr)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ switch (PPP_PROTOCOL(ppp_hdr)) {
+ case PPP_IP:
+ *p_proto = htons(ETH_P_IP);
+ break;
+ case PPP_IPV6:
+ *p_proto = htons(ETH_P_IPV6);
+ break;
+ default:
+ /* Could probably catch some more like MPLS */
+ break;
+ }
+
+ offset += PPP_HDRLEN;
+ }
+
+ *p_nhoff += offset;
+ key_control->flags |= FLOW_DIS_ENCAPSULATION;
+ if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+ return FLOW_DISSECT_RET_OUT_GOOD;
+
+ return FLOW_DISSECT_RET_PROTO_AGAIN;
+}
+
+/**
+ * __skb_flow_dissect_batadv() - dissect batman-adv header
+ * @skb: sk_buff with the batman-adv header
+ * @key_control: flow dissectors control key
+ * @data: raw buffer pointer to the packet, if NULL use skb->data
+ * @p_proto: pointer used to update the protocol to process next
+ * @p_nhoff: pointer used to update inner network header offset
+ * @hlen: packet header length
+ * @flags: any combination of FLOW_DISSECTOR_F_*
+ *
+ * An attempt is made to dissect ETH_P_BATMAN packets. Only
+ * &struct batadv_unicast packets are actually processed because they contain
+ * an inner ethernet header and are usually followed by the actual network
+ * header. This allows the flow dissector to continue processing the packet.
+ *
+ * Return: FLOW_DISSECT_RET_PROTO_AGAIN when &struct batadv_unicast was found,
+ * FLOW_DISSECT_RET_OUT_GOOD when dissector should stop after encapsulation,
+ * otherwise FLOW_DISSECT_RET_OUT_BAD
+ */
+static enum flow_dissect_ret
+__skb_flow_dissect_batadv(const struct sk_buff *skb,
+ struct flow_dissector_key_control *key_control,
+ void *data, __be16 *p_proto, int *p_nhoff, int hlen,
+ unsigned int flags)
+{
+ struct {
+ struct batadv_unicast_packet batadv_unicast;
+ struct ethhdr eth;
+ } *hdr, _hdr;
+
+ hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, hlen,
+ &_hdr);
+ if (!hdr)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ if (hdr->batadv_unicast.version != BATADV_COMPAT_VERSION)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ if (hdr->batadv_unicast.packet_type != BATADV_UNICAST)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ *p_proto = hdr->eth.h_proto;
+ *p_nhoff += sizeof(*hdr);
+
+ key_control->flags |= FLOW_DIS_ENCAPSULATION;
+ if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+ return FLOW_DISSECT_RET_OUT_GOOD;
+
+ return FLOW_DISSECT_RET_PROTO_AGAIN;
+}
+
+static void
+__skb_flow_dissect_tcp(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, void *data, int thoff, int hlen)
+{
+ struct flow_dissector_key_tcp *key_tcp;
+ struct tcphdr *th, _th;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TCP))
+ return;
+
+ th = __skb_header_pointer(skb, thoff, sizeof(_th), data, hlen, &_th);
+ if (!th)
+ return;
+
+ if (unlikely(__tcp_hdrlen(th) < sizeof(_th)))
+ return;
+
+ key_tcp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_TCP,
+ target_container);
+ key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF));
+}
+
+static void
+__skb_flow_dissect_ipv4(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, void *data, const struct iphdr *iph)
+{
+ struct flow_dissector_key_ip *key_ip;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP))
+ return;
+
+ key_ip = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IP,
+ target_container);
+ key_ip->tos = iph->tos;
+ key_ip->ttl = iph->ttl;
+}
+
+static void
+__skb_flow_dissect_ipv6(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, void *data, const struct ipv6hdr *iph)
+{
+ struct flow_dissector_key_ip *key_ip;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP))
+ return;
+
+ key_ip = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IP,
+ target_container);
+ key_ip->tos = ipv6_get_dsfield(iph);
+ key_ip->ttl = iph->hop_limit;
+}
+
+/* Maximum number of protocol headers that can be parsed in
+ * __skb_flow_dissect
+ */
+#define MAX_FLOW_DISSECT_HDRS 15
+
+static bool skb_flow_dissect_allowed(int *num_hdrs)
+{
+ ++*num_hdrs;
+
+ return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS);
+}
+
+/**
+ * __skb_flow_dissect - extract the flow_keys struct and return it
+ * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
+ * @flow_dissector: list of keys to dissect
+ * @target_container: target structure to put dissected values into
+ * @data: raw buffer pointer to the packet, if NULL use skb->data
+ * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol
+ * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
+ * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
+ * @flags: FLOW_DISSECTOR_F_* flags controlling how deep dissection goes
+ *
+ * The function will try to retrieve individual keys into the target specified
+ * by @flow_dissector from either the skbuff or a raw buffer specified by the
+ * remaining parameters.
+ *
+ * Caller must take care of zeroing target container memory.
+ */
+bool __skb_flow_dissect(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container,
+ void *data, __be16 proto, int nhoff, int hlen,
+ unsigned int flags)
+{
+ struct flow_dissector_key_control *key_control;
+ struct flow_dissector_key_basic *key_basic;
+ struct flow_dissector_key_addrs *key_addrs;
+ struct flow_dissector_key_ports *key_ports;
+ struct flow_dissector_key_icmp *key_icmp;
+ struct flow_dissector_key_tags *key_tags;
+ struct flow_dissector_key_vlan *key_vlan;
+ enum flow_dissect_ret fdret;
+ enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
+ int num_hdrs = 0;
+ u8 ip_proto = 0;
+ bool ret;
+
+ if (!data) {
+ data = skb->data;
+ proto = skb_vlan_tag_present(skb) ?
+ skb->vlan_proto : skb->protocol;
+ nhoff = skb_network_offset(skb);
+ hlen = skb_headlen(skb);
+#if IS_ENABLED(CONFIG_NET_DSA)
+ if (unlikely(skb->dev && netdev_uses_dsa(skb->dev) &&
+ proto == htons(ETH_P_XDSA))) {
+ const struct dsa_device_ops *ops;
+ int offset = 0;
+
+ ops = skb->dev->dsa_ptr->tag_ops;
+ if (ops->flow_dissect &&
+ !ops->flow_dissect(skb, &proto, &offset)) {
+ hlen -= offset;
+ nhoff += offset;
+ }
+ }
+#endif
+ }
+
+ /* It is ensured by skb_flow_dissector_init() that control key will
+ * be always present.
+ */
+ key_control = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_CONTROL,
+ target_container);
+
+ /* It is ensured by skb_flow_dissector_init() that basic key will
+ * be always present.
+ */
+ key_basic = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_BASIC,
+ target_container);
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
+ struct ethhdr *eth = eth_hdr(skb);
+ struct flow_dissector_key_eth_addrs *key_eth_addrs;
+
+ key_eth_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ETH_ADDRS,
+ target_container);
+ memcpy(key_eth_addrs, &eth->h_dest, sizeof(*key_eth_addrs));
+ }
+
+proto_again:
+ fdret = FLOW_DISSECT_RET_CONTINUE;
+
+ switch (proto) {
+ case htons(ETH_P_IP): {
+ const struct iphdr *iph;
+ struct iphdr _iph;
+
+ iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
+ if (!iph || iph->ihl < 5) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ nhoff += iph->ihl * 4;
+
+ ip_proto = iph->protocol;
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+ target_container);
+
+ memcpy(&key_addrs->v4addrs.src, &iph->saddr,
+ sizeof(key_addrs->v4addrs.src));
+ memcpy(&key_addrs->v4addrs.dst, &iph->daddr,
+ sizeof(key_addrs->v4addrs.dst));
+ key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ }
+
+ if (ip_is_fragment(iph)) {
+ key_control->flags |= FLOW_DIS_IS_FRAGMENT;
+
+ if (iph->frag_off & htons(IP_OFFSET)) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ } else {
+ key_control->flags |= FLOW_DIS_FIRST_FRAG;
+ if (!(flags &
+ FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+ }
+ }
+
+ __skb_flow_dissect_ipv4(skb, flow_dissector,
+ target_container, data, iph);
+
+ if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+
+ break;
+ }
+ case htons(ETH_P_IPV6): {
+ const struct ipv6hdr *iph;
+ struct ipv6hdr _iph;
+
+ iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
+ if (!iph) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ ip_proto = iph->nexthdr;
+ nhoff += sizeof(struct ipv6hdr);
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ target_container);
+
+ memcpy(&key_addrs->v6addrs.src, &iph->saddr,
+ sizeof(key_addrs->v6addrs.src));
+ memcpy(&key_addrs->v6addrs.dst, &iph->daddr,
+ sizeof(key_addrs->v6addrs.dst));
+ key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ }
+
+ if ((dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_FLOW_LABEL) ||
+ (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) &&
+ ip6_flowlabel(iph)) {
+ __be32 flow_label = ip6_flowlabel(iph);
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
+ key_tags = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_FLOW_LABEL,
+ target_container);
+ key_tags->flow_label = ntohl(flow_label);
+ }
+ if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+ }
+
+ __skb_flow_dissect_ipv6(skb, flow_dissector,
+ target_container, data, iph);
+
+ if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+
+ break;
+ }
+ case htons(ETH_P_8021AD):
+ case htons(ETH_P_8021Q): {
+ const struct vlan_hdr *vlan = NULL;
+ struct vlan_hdr _vlan;
+ __be16 saved_vlan_tpid = proto;
+
+ if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX &&
+ skb && skb_vlan_tag_present(skb)) {
+ proto = skb->protocol;
+ } else {
+ vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan),
+ data, hlen, &_vlan);
+ if (!vlan) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ proto = vlan->h_vlan_encapsulated_proto;
+ nhoff += sizeof(*vlan);
+ }
+
+ if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) {
+ dissector_vlan = FLOW_DISSECTOR_KEY_VLAN;
+ } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) {
+ dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN;
+ } else {
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ break;
+ }
+
+ if (dissector_uses_key(flow_dissector, dissector_vlan)) {
+ key_vlan = skb_flow_dissector_target(flow_dissector,
+ dissector_vlan,
+ target_container);
+
+ if (!vlan) {
+ key_vlan->vlan_id = skb_vlan_tag_get_id(skb);
+ key_vlan->vlan_priority =
+ (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT);
+ } else {
+ key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) &
+ VLAN_VID_MASK;
+ key_vlan->vlan_priority =
+ (ntohs(vlan->h_vlan_TCI) &
+ VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+ }
+ key_vlan->vlan_tpid = saved_vlan_tpid;
+ key_vlan->vlan_eth_type = proto;
+ }
+
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ break;
+ }
+ case htons(ETH_P_PPP_SES): {
+ struct {
+ struct pppoe_hdr hdr;
+ __be16 proto;
+ } *hdr, _hdr;
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
+ if (!hdr) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ proto = hdr->proto;
+ nhoff += PPPOE_SES_HLEN;
+ switch (proto) {
+ case htons(PPP_IP):
+ proto = htons(ETH_P_IP);
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ break;
+ case htons(PPP_IPV6):
+ proto = htons(ETH_P_IPV6);
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ break;
+ default:
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+ break;
+ }
+ case htons(ETH_P_TIPC): {
+ struct tipc_basic_hdr *hdr, _hdr;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr),
+ data, hlen, &_hdr);
+ if (!hdr) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_TIPC)) {
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_TIPC,
+ target_container);
+ key_addrs->tipckey.key = tipc_hdr_rps_key(hdr);
+ key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC;
+ }
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+
+ case htons(ETH_P_MPLS_UC):
+ case htons(ETH_P_MPLS_MC):
+ fdret = __skb_flow_dissect_mpls(skb, flow_dissector,
+ target_container, data,
+ nhoff, hlen);
+ break;
+ case htons(ETH_P_FCOE):
+ if ((hlen - nhoff) < FCOE_HEADER_LEN) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ nhoff += FCOE_HEADER_LEN;
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+
+ case htons(ETH_P_ARP):
+ case htons(ETH_P_RARP):
+ fdret = __skb_flow_dissect_arp(skb, flow_dissector,
+ target_container, data,
+ nhoff, hlen);
+ break;
+
+ case htons(ETH_P_BATMAN):
+ fdret = __skb_flow_dissect_batadv(skb, key_control, data,
+ &proto, &nhoff, hlen, flags);
+ break;
+
+ default:
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ /* Process result of proto processing */
+ switch (fdret) {
+ case FLOW_DISSECT_RET_OUT_GOOD:
+ goto out_good;
+ case FLOW_DISSECT_RET_PROTO_AGAIN:
+ if (skb_flow_dissect_allowed(&num_hdrs))
+ goto proto_again;
+ goto out_good;
+ case FLOW_DISSECT_RET_CONTINUE:
+ case FLOW_DISSECT_RET_IPPROTO_AGAIN:
+ break;
+ case FLOW_DISSECT_RET_OUT_BAD:
+ default:
+ goto out_bad;
+ }
+
+ip_proto_again:
+ fdret = FLOW_DISSECT_RET_CONTINUE;
+
+ switch (ip_proto) {
+ case IPPROTO_GRE:
+ fdret = __skb_flow_dissect_gre(skb, key_control, flow_dissector,
+ target_container, data,
+ &proto, &nhoff, &hlen, flags);
+ break;
+
+ case NEXTHDR_HOP:
+ case NEXTHDR_ROUTING:
+ case NEXTHDR_DEST: {
+ u8 _opthdr[2], *opthdr;
+
+ if (proto != htons(ETH_P_IPV6))
+ break;
+
+ opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr),
+ data, hlen, &_opthdr);
+ if (!opthdr) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ ip_proto = opthdr[0];
+ nhoff += (opthdr[1] + 1) << 3;
+
+ fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN;
+ break;
+ }
+ case NEXTHDR_FRAGMENT: {
+ struct frag_hdr _fh, *fh;
+
+ if (proto != htons(ETH_P_IPV6))
+ break;
+
+ fh = __skb_header_pointer(skb, nhoff, sizeof(_fh),
+ data, hlen, &_fh);
+
+ if (!fh) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ key_control->flags |= FLOW_DIS_IS_FRAGMENT;
+
+ nhoff += sizeof(_fh);
+ ip_proto = fh->nexthdr;
+
+ if (!(fh->frag_off & htons(IP6_OFFSET))) {
+ key_control->flags |= FLOW_DIS_FIRST_FRAG;
+ if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) {
+ fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN;
+ break;
+ }
+ }
+
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+ case IPPROTO_IPIP:
+ proto = htons(ETH_P_IP);
+
+ key_control->flags |= FLOW_DIS_ENCAPSULATION;
+ if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ break;
+
+ case IPPROTO_IPV6:
+ proto = htons(ETH_P_IPV6);
+
+ key_control->flags |= FLOW_DIS_ENCAPSULATION;
+ if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ break;
+
+ case IPPROTO_MPLS:
+ proto = htons(ETH_P_MPLS_UC);
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ break;
+
+ case IPPROTO_TCP:
+ __skb_flow_dissect_tcp(skb, flow_dissector, target_container,
+ data, nhoff, hlen);
+ break;
+
+ default:
+ break;
+ }
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS) &&
+ !(key_control->flags & FLOW_DIS_IS_FRAGMENT)) {
+ key_ports = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS,
+ target_container);
+ key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
+ data, hlen);
+ }
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ICMP)) {
+ key_icmp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ICMP,
+ target_container);
+ key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen);
+ }
+
+ /* Process result of IP proto processing */
+ switch (fdret) {
+ case FLOW_DISSECT_RET_PROTO_AGAIN:
+ if (skb_flow_dissect_allowed(&num_hdrs))
+ goto proto_again;
+ break;
+ case FLOW_DISSECT_RET_IPPROTO_AGAIN:
+ if (skb_flow_dissect_allowed(&num_hdrs))
+ goto ip_proto_again;
+ break;
+ case FLOW_DISSECT_RET_OUT_GOOD:
+ case FLOW_DISSECT_RET_CONTINUE:
+ break;
+ case FLOW_DISSECT_RET_OUT_BAD:
+ default:
+ goto out_bad;
+ }
+
+out_good:
+ ret = true;
+
+out:
+ key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen);
+ key_basic->n_proto = proto;
+ key_basic->ip_proto = ip_proto;
+
+ return ret;
+
+out_bad:
+ ret = false;
+ goto out;
+}
+EXPORT_SYMBOL(__skb_flow_dissect);
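+
+/* Usage sketch (illustrative, mirroring the pattern of
+ * __skb_get_hash_symmetric() below): a caller zeroes the target container
+ * and asks for the default key set:
+ *
+ *	struct flow_keys keys;
+ *
+ *	memset(&keys, 0, sizeof(keys));
+ *	if (__skb_flow_dissect(skb, &flow_keys_dissector, &keys,
+ *			       NULL, 0, 0, 0, 0))
+ *		pr_debug("ip_proto=%u thoff=%u\n",
+ *			 keys.basic.ip_proto, keys.control.thoff);
+ *
+ * Most in-tree users go through the skb_flow_dissect*() wrappers instead.
+ */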
+
+static siphash_key_t hashrnd __read_mostly;
+static __always_inline void __flow_hash_secret_init(void)
+{
+ net_get_random_once(&hashrnd, sizeof(hashrnd));
+}
+
+static const void *flow_keys_hash_start(const struct flow_keys *flow)
+{
+ BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % SIPHASH_ALIGNMENT);
+ return &flow->FLOW_KEYS_HASH_START_FIELD;
+}
+
+static inline size_t flow_keys_hash_length(const struct flow_keys *flow)
+{
+ size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs);
+
+ BUILD_BUG_ON(offsetof(typeof(*flow), addrs) !=
+ sizeof(*flow) - sizeof(flow->addrs));
+
+ switch (flow->control.addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ diff -= sizeof(flow->addrs.v4addrs);
+ break;
+ case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+ diff -= sizeof(flow->addrs.v6addrs);
+ break;
+ case FLOW_DISSECTOR_KEY_TIPC:
+ diff -= sizeof(flow->addrs.tipckey);
+ break;
+ }
+ return sizeof(*flow) - diff;
+}
+
+__be32 flow_get_u32_src(const struct flow_keys *flow)
+{
+ switch (flow->control.addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ return flow->addrs.v4addrs.src;
+ case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+ return (__force __be32)ipv6_addr_hash(
+ &flow->addrs.v6addrs.src);
+ case FLOW_DISSECTOR_KEY_TIPC:
+ return flow->addrs.tipckey.key;
+ default:
+ return 0;
+ }
+}
+EXPORT_SYMBOL(flow_get_u32_src);
+
+__be32 flow_get_u32_dst(const struct flow_keys *flow)
+{
+ switch (flow->control.addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ return flow->addrs.v4addrs.dst;
+ case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+ return (__force __be32)ipv6_addr_hash(
+ &flow->addrs.v6addrs.dst);
+ default:
+ return 0;
+ }
+}
+EXPORT_SYMBOL(flow_get_u32_dst);
+
+static inline void __flow_hash_consistentify(struct flow_keys *keys)
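+/* Order each (addr, port) pair so that a flow and its reverse direction end
+ * up with identical keys; this is what makes the resulting hash the same for
+ * A->B and B->A traffic.
+ */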
+{
+ int addr_diff, i;
+
+ switch (keys->control.addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ addr_diff = (__force u32)keys->addrs.v4addrs.dst -
+ (__force u32)keys->addrs.v4addrs.src;
+ if ((addr_diff < 0) ||
+ (addr_diff == 0 &&
+ ((__force u16)keys->ports.dst <
+ (__force u16)keys->ports.src))) {
+ swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst);
+ swap(keys->ports.src, keys->ports.dst);
+ }
+ break;
+ case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+ addr_diff = memcmp(&keys->addrs.v6addrs.dst,
+ &keys->addrs.v6addrs.src,
+ sizeof(keys->addrs.v6addrs.dst));
+ if ((addr_diff < 0) ||
+ (addr_diff == 0 &&
+ ((__force u16)keys->ports.dst <
+ (__force u16)keys->ports.src))) {
+ for (i = 0; i < 4; i++)
+ swap(keys->addrs.v6addrs.src.s6_addr32[i],
+ keys->addrs.v6addrs.dst.s6_addr32[i]);
+ swap(keys->ports.src, keys->ports.dst);
+ }
+ break;
+ }
+}
+
+static inline u32 __flow_hash_from_keys(struct flow_keys *keys,
+ const siphash_key_t *keyval)
+{
+ u32 hash;
+
+ __flow_hash_consistentify(keys);
+
+ hash = siphash(flow_keys_hash_start(keys),
+ flow_keys_hash_length(keys), keyval);
+ if (!hash)
+ hash = 1;
+
+ return hash;
+}
+
+u32 flow_hash_from_keys(struct flow_keys *keys)
+{
+ __flow_hash_secret_init();
+ return __flow_hash_from_keys(keys, &hashrnd);
+}
+EXPORT_SYMBOL(flow_hash_from_keys);
+
+static inline u32 ___skb_get_hash(const struct sk_buff *skb,
+ struct flow_keys *keys,
+ const siphash_key_t *keyval)
+{
+ skb_flow_dissect_flow_keys(skb, keys,
+ FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+
+ return __flow_hash_from_keys(keys, keyval);
+}
+
+struct _flow_keys_digest_data {
+ __be16 n_proto;
+ u8 ip_proto;
+ u8 padding;
+ __be32 ports;
+ __be32 src;
+ __be32 dst;
+};
+
+void make_flow_keys_digest(struct flow_keys_digest *digest,
+ const struct flow_keys *flow)
+{
+ struct _flow_keys_digest_data *data =
+ (struct _flow_keys_digest_data *)digest;
+
+ BUILD_BUG_ON(sizeof(*data) > sizeof(*digest));
+
+ memset(digest, 0, sizeof(*digest));
+
+ data->n_proto = flow->basic.n_proto;
+ data->ip_proto = flow->basic.ip_proto;
+ data->ports = flow->ports.ports;
+ data->src = flow->addrs.v4addrs.src;
+ data->dst = flow->addrs.v4addrs.dst;
+}
+EXPORT_SYMBOL(make_flow_keys_digest);
+
+static struct flow_dissector flow_keys_dissector_symmetric __read_mostly;
+
+u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
+{
+ struct flow_keys keys;
+
+ __flow_hash_secret_init();
+
+ memset(&keys, 0, sizeof(keys));
+ __skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys,
+ NULL, 0, 0, 0,
+ FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+
+ return __flow_hash_from_keys(&keys, &hashrnd);
+}
+EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric);
+
+/**
+ * __skb_get_hash: calculate a flow hash
+ * @skb: sk_buff to calculate flow hash from
+ *
+ * This function calculates a flow hash based on src/dst addresses
+ * and src/dst port numbers. Sets hash in skb to non-zero hash value
+ * on success, zero indicates no valid hash. Also, sets l4_hash in skb
+ * if hash is a canonical 4-tuple hash over transport ports.
+ */
+void __skb_get_hash(struct sk_buff *skb)
+{
+ struct flow_keys keys;
+ u32 hash;
+
+ __flow_hash_secret_init();
+
+ hash = ___skb_get_hash(skb, &keys, &hashrnd);
+
+ __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
+}
+EXPORT_SYMBOL(__skb_get_hash);
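+
+/* Callers normally use the skb_get_hash() wrapper, which only falls back to
+ * this slow path when skb->hash is not already valid:
+ *
+ *	u32 hash = skb_get_hash(skb);
+ */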
+
+__u32 skb_get_hash_perturb(const struct sk_buff *skb,
+ const siphash_key_t *perturb)
+{
+ struct flow_keys keys;
+
+ return ___skb_get_hash(skb, &keys, perturb);
+}
+EXPORT_SYMBOL(skb_get_hash_perturb);
+
+u32 __skb_get_poff(const struct sk_buff *skb, void *data,
+ const struct flow_keys_basic *keys, int hlen)
+{
+ u32 poff = keys->control.thoff;
+
+ /* skip L4 headers for fragments after the first */
+ if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) &&
+ !(keys->control.flags & FLOW_DIS_FIRST_FRAG))
+ return poff;
+
+ switch (keys->basic.ip_proto) {
+ case IPPROTO_TCP: {
+ /* access doff as u8 to avoid unaligned access */
+ const u8 *doff;
+ u8 _doff;
+
+ doff = __skb_header_pointer(skb, poff + 12, sizeof(_doff),
+ data, hlen, &_doff);
+ if (!doff)
+ return poff;
+
+ poff += max_t(u32, sizeof(struct tcphdr), (*doff & 0xF0) >> 2);
+ break;
+ }
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ poff += sizeof(struct udphdr);
+ break;
+ /* For the rest, we do not really care about header
+ * extensions at this point for now.
+ */
+ case IPPROTO_ICMP:
+ poff += sizeof(struct icmphdr);
+ break;
+ case IPPROTO_ICMPV6:
+ poff += sizeof(struct icmp6hdr);
+ break;
+ case IPPROTO_IGMP:
+ poff += sizeof(struct igmphdr);
+ break;
+ case IPPROTO_DCCP:
+ poff += sizeof(struct dccp_hdr);
+ break;
+ case IPPROTO_SCTP:
+ poff += sizeof(struct sctphdr);
+ break;
+ }
+
+ return poff;
+}
+
+/**
+ * skb_get_poff - get the offset to the payload
+ * @skb: sk_buff to get the payload offset from
+ *
+ * The function will get the offset to the payload as far as it could
+ * be dissected. The main user is currently BPF, so that we can dynamically
+ * truncate packets without needing to push actual payload to the user
+ * space and can analyze headers only, instead.
+ */
+u32 skb_get_poff(const struct sk_buff *skb)
+{
+ struct flow_keys_basic keys;
+
+ if (!skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0))
+ return 0;
+
+ return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
+}
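+
+/* Illustrative sketch: a consumer that only cares about headers could trim a
+ * private copy of the skb down to the dissected payload offset ("clone" is
+ * assumed to be such a copy; error handling omitted):
+ *
+ *	u32 poff = skb_get_poff(skb);
+ *
+ *	if (poff)
+ *		pskb_trim(clone, poff);
+ *
+ * This is essentially what the classic BPF SKF_AD_PAY_OFFSET extension
+ * exposes to socket filters.
+ */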
+
+__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
+{
+ memset(keys, 0, sizeof(*keys));
+
+ memcpy(&keys->addrs.v6addrs.src, &fl6->saddr,
+ sizeof(keys->addrs.v6addrs.src));
+ memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr,
+ sizeof(keys->addrs.v6addrs.dst));
+ keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ keys->ports.src = fl6->fl6_sport;
+ keys->ports.dst = fl6->fl6_dport;
+ keys->keyid.keyid = fl6->fl6_gre_key;
+ keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
+ keys->basic.ip_proto = fl6->flowi6_proto;
+
+ return flow_hash_from_keys(keys);
+}
+EXPORT_SYMBOL(__get_hash_from_flowi6);
+
+static const struct flow_dissector_key flow_keys_dissector_keys[] = {
+ {
+ .key_id = FLOW_DISSECTOR_KEY_CONTROL,
+ .offset = offsetof(struct flow_keys, control),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_BASIC,
+ .offset = offsetof(struct flow_keys, basic),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+ .offset = offsetof(struct flow_keys, addrs.v4addrs),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ .offset = offsetof(struct flow_keys, addrs.v6addrs),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_TIPC,
+ .offset = offsetof(struct flow_keys, addrs.tipckey),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_PORTS,
+ .offset = offsetof(struct flow_keys, ports),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_VLAN,
+ .offset = offsetof(struct flow_keys, vlan),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL,
+ .offset = offsetof(struct flow_keys, tags),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID,
+ .offset = offsetof(struct flow_keys, keyid),
+ },
+};
+
+static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = {
+ {
+ .key_id = FLOW_DISSECTOR_KEY_CONTROL,
+ .offset = offsetof(struct flow_keys, control),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_BASIC,
+ .offset = offsetof(struct flow_keys, basic),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+ .offset = offsetof(struct flow_keys, addrs.v4addrs),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ .offset = offsetof(struct flow_keys, addrs.v6addrs),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_PORTS,
+ .offset = offsetof(struct flow_keys, ports),
+ },
+};
+
+static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = {
+ {
+ .key_id = FLOW_DISSECTOR_KEY_CONTROL,
+ .offset = offsetof(struct flow_keys, control),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_BASIC,
+ .offset = offsetof(struct flow_keys, basic),
+ },
+};
+
+struct flow_dissector flow_keys_dissector __read_mostly;
+EXPORT_SYMBOL(flow_keys_dissector);
+
+struct flow_dissector flow_keys_basic_dissector __read_mostly;
+EXPORT_SYMBOL(flow_keys_basic_dissector);
+
+static int __init init_default_flow_dissectors(void)
+{
+ skb_flow_dissector_init(&flow_keys_dissector,
+ flow_keys_dissector_keys,
+ ARRAY_SIZE(flow_keys_dissector_keys));
+ skb_flow_dissector_init(&flow_keys_dissector_symmetric,
+ flow_keys_dissector_symmetric_keys,
+ ARRAY_SIZE(flow_keys_dissector_symmetric_keys));
+ skb_flow_dissector_init(&flow_keys_basic_dissector,
+ flow_keys_basic_dissector_keys,
+ ARRAY_SIZE(flow_keys_basic_dissector_keys));
+ return 0;
+}
+
+core_initcall(init_default_flow_dissectors);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
new file mode 100644
index 000000000..752744db1
--- /dev/null
+++ b/net/core/gen_estimator.c
@@ -0,0 +1,274 @@
+/*
+ * net/core/gen_estimator.c Simple rate estimator.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Eric Dumazet <edumazet@google.com>
+ *
+ * Changes:
+ * Jamal Hadi Salim - moved it to net/core and reshuffled
+ * names to make it usable in the general net subsystem.
+ */
+
+#include <linux/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/seqlock.h>
+#include <net/sock.h>
+#include <net/gen_stats.h>
+
+/* This code is NOT intended to be used for statistics collection,
+ * its purpose is to provide a base for statistical multiplexing
+ * for controlled load service.
+ * If you need only statistics, run a user level daemon which
+ * periodically reads byte counters.
+ */
+
+struct net_rate_estimator {
+ struct gnet_stats_basic_packed *bstats;
+ spinlock_t *stats_lock;
+ seqcount_t *running;
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+ u8 ewma_log;
+ u8 intvl_log; /* period : (250ms << intvl_log) */
+
+ seqcount_t seq;
+ u32 last_packets;
+ u64 last_bytes;
+
+ u64 avpps;
+ u64 avbps;
+
+ unsigned long next_jiffies;
+ struct timer_list timer;
+ struct rcu_head rcu;
+};
+
+static void est_fetch_counters(struct net_rate_estimator *e,
+ struct gnet_stats_basic_packed *b)
+{
+ memset(b, 0, sizeof(*b));
+ if (e->stats_lock)
+ spin_lock(e->stats_lock);
+
+ __gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats);
+
+ if (e->stats_lock)
+ spin_unlock(e->stats_lock);
+}
+
+static void est_timer(struct timer_list *t)
+{
+ struct net_rate_estimator *est = from_timer(est, t, timer);
+ struct gnet_stats_basic_packed b;
+ u64 rate, brate;
+
+ est_fetch_counters(est, &b);
+ brate = (b.bytes - est->last_bytes) << (10 - est->intvl_log);
+ brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log);
+
+ rate = (u64)(b.packets - est->last_packets) << (10 - est->intvl_log);
+ rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log);
+
+ write_seqcount_begin(&est->seq);
+ est->avbps += brate;
+ est->avpps += rate;
+ write_seqcount_end(&est->seq);
+
+ est->last_bytes = b.bytes;
+ est->last_packets = b.packets;
+
+ est->next_jiffies += ((HZ/4) << est->intvl_log);
+
+ if (unlikely(time_after_eq(jiffies, est->next_jiffies))) {
+ /* Ouch... timer was delayed. */
+ est->next_jiffies = jiffies + 1;
+ }
+ mod_timer(&est->timer, est->next_jiffies);
+}
+
+/**
+ * gen_new_estimator - create a new rate estimator
+ * @bstats: basic statistics
+ * @cpu_bstats: bstats per cpu
+ * @rate_est: rate estimator statistics
+ * @lock: lock for statistics and control path
+ * @running: qdisc running seqcount
+ * @opt: rate estimator configuration TLV
+ *
+ * Creates a new rate estimator with &bstats as source and &rate_est
+ * as destination. A new timer with the interval specified in the
+ * configuration TLV is created. Upon each interval, the latest statistics
+ * will be read from &bstats and the estimated rate will be stored in
+ * &rate_est with the statistics lock grabbed during this period.
+ *
+ * Returns 0 on success or a negative error code.
+ *
+ */
+int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats,
+ struct net_rate_estimator __rcu **rate_est,
+ spinlock_t *lock,
+ seqcount_t *running,
+ struct nlattr *opt)
+{
+ struct gnet_estimator *parm = nla_data(opt);
+ struct net_rate_estimator *old, *est;
+ struct gnet_stats_basic_packed b;
+ int intvl_log;
+
+ if (nla_len(opt) < sizeof(*parm))
+ return -EINVAL;
+
+ /* allowed timer periods are:
+ * -2 : 250ms, -1 : 500ms, 0 : 1 sec
+ * 1 : 2 sec, 2 : 4 sec, 3 : 8 sec
+ */
+ if (parm->interval < -2 || parm->interval > 3)
+ return -EINVAL;
+
+ if (parm->ewma_log == 0 || parm->ewma_log >= 31)
+ return -EINVAL;
+
+ est = kzalloc(sizeof(*est), GFP_KERNEL);
+ if (!est)
+ return -ENOBUFS;
+
+ seqcount_init(&est->seq);
+ intvl_log = parm->interval + 2;
+ est->bstats = bstats;
+ est->stats_lock = lock;
+ est->running = running;
+ est->ewma_log = parm->ewma_log;
+ est->intvl_log = intvl_log;
+ est->cpu_bstats = cpu_bstats;
+
+ if (lock)
+ local_bh_disable();
+ est_fetch_counters(est, &b);
+ if (lock)
+ local_bh_enable();
+ est->last_bytes = b.bytes;
+ est->last_packets = b.packets;
+
+ if (lock)
+ spin_lock_bh(lock);
+ old = rcu_dereference_protected(*rate_est, 1);
+ if (old) {
+ del_timer_sync(&old->timer);
+ est->avbps = old->avbps;
+ est->avpps = old->avpps;
+ }
+
+ est->next_jiffies = jiffies + ((HZ/4) << intvl_log);
+ timer_setup(&est->timer, est_timer, 0);
+ mod_timer(&est->timer, est->next_jiffies);
+
+ rcu_assign_pointer(*rate_est, est);
+ if (lock)
+ spin_unlock_bh(lock);
+ if (old)
+ kfree_rcu(old, rcu);
+ return 0;
+}
+EXPORT_SYMBOL(gen_new_estimator);
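+
+/* Caller sketch (illustrative; assumes the usual qdisc field layout):
+ *
+ *	err = gen_new_estimator(&sch->bstats, sch->cpu_bstats, &sch->rate_est,
+ *				NULL, &sch->running, tca[TCA_RATE]);
+ *
+ * paired with gen_kill_estimator(&sch->rate_est) when the qdisc is destroyed.
+ */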
+
+/**
+ * gen_kill_estimator - remove a rate estimator
+ * @rate_est: rate estimator
+ *
+ * Removes the rate estimator.
+ *
+ */
+void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est)
+{
+ struct net_rate_estimator *est;
+
+ est = xchg((__force struct net_rate_estimator **)rate_est, NULL);
+ if (est) {
+ del_timer_sync(&est->timer);
+ kfree_rcu(est, rcu);
+ }
+}
+EXPORT_SYMBOL(gen_kill_estimator);
+
+/**
+ * gen_replace_estimator - replace rate estimator configuration
+ * @bstats: basic statistics
+ * @cpu_bstats: bstats per cpu
+ * @rate_est: rate estimator statistics
+ * @lock: lock for statistics and control path
+ * @running: qdisc running seqcount (might be NULL)
+ * @opt: rate estimator configuration TLV
+ *
+ * Replaces the configuration of a rate estimator by calling
+ * gen_kill_estimator() and gen_new_estimator().
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats,
+ struct net_rate_estimator __rcu **rate_est,
+ spinlock_t *lock,
+ seqcount_t *running, struct nlattr *opt)
+{
+ return gen_new_estimator(bstats, cpu_bstats, rate_est,
+ lock, running, opt);
+}
+EXPORT_SYMBOL(gen_replace_estimator);
+
+/**
+ * gen_estimator_active - test if estimator is currently in use
+ * @rate_est: rate estimator
+ *
+ * Returns true if estimator is active, and false if not.
+ */
+bool gen_estimator_active(struct net_rate_estimator __rcu **rate_est)
+{
+ return !!rcu_access_pointer(*rate_est);
+}
+EXPORT_SYMBOL(gen_estimator_active);
+
+bool gen_estimator_read(struct net_rate_estimator __rcu **rate_est,
+ struct gnet_stats_rate_est64 *sample)
+{
+ struct net_rate_estimator *est;
+ unsigned int seq;
+
+ rcu_read_lock();
+ est = rcu_dereference(*rate_est);
+ if (!est) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ do {
+ seq = read_seqcount_begin(&est->seq);
+ sample->bps = est->avbps >> 8;
+ sample->pps = est->avpps >> 8;
+ } while (read_seqcount_retry(&est->seq, seq));
+
+ rcu_read_unlock();
+ return true;
+}
+EXPORT_SYMBOL(gen_estimator_read);
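+
+/* Reader sketch: dump paths can sample the smoothed rates lock-free, e.g.
+ * (sch->rate_est is only an example of a published estimator pointer):
+ *
+ *	struct gnet_stats_rate_est64 sample;
+ *
+ *	if (gen_estimator_read(&sch->rate_est, &sample))
+ *		pr_debug("%llu bytes/s, %llu pkts/s\n",
+ *			 sample.bps, sample.pps);
+ */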
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
new file mode 100644
index 000000000..e2fd8baec
--- /dev/null
+++ b/net/core/gen_stats.c
@@ -0,0 +1,396 @@
+/*
+ * net/core/gen_stats.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ * Jamal Hadi Salim
+ * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * See Documentation/networking/gen_stats.txt
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/socket.h>
+#include <linux/rtnetlink.h>
+#include <linux/gen_stats.h>
+#include <net/netlink.h>
+#include <net/gen_stats.h>
+
+
+static inline int
+gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr)
+{
+ if (nla_put_64bit(d->skb, type, size, buf, padattr))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ if (d->lock)
+ spin_unlock_bh(d->lock);
+ kfree(d->xstats);
+ d->xstats = NULL;
+ d->xstats_len = 0;
+ return -1;
+}
+
+/**
+ * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode
+ * @skb: socket buffer to put statistics TLVs into
+ * @type: TLV type for top level statistic TLV
+ * @tc_stats_type: TLV type for backward compatibility struct tc_stats TLV
+ * @xstats_type: TLV type for backward compatibility xstats TLV
+ * @lock: statistics lock
+ * @d: dumping handle
+ * @padattr: padding attribute
+ *
+ * Initializes the dumping handle, grabs the statistic lock and appends
+ * an empty TLV header to the socket buffer for use as a container for all
+ * other statistic TLVs.
+ *
+ * The dumping handle is marked to be in backward compatibility mode telling
+ * all gnet_stats_copy_XXX() functions to fill a local copy of struct tc_stats.
+ *
+ * Returns 0 on success or -1 if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
+ int xstats_type, spinlock_t *lock,
+ struct gnet_dump *d, int padattr)
+ __acquires(lock)
+{
+ memset(d, 0, sizeof(*d));
+
+ if (type)
+ d->tail = (struct nlattr *)skb_tail_pointer(skb);
+ d->skb = skb;
+ d->compat_tc_stats = tc_stats_type;
+ d->compat_xstats = xstats_type;
+ d->padattr = padattr;
+ if (lock) {
+ d->lock = lock;
+ spin_lock_bh(lock);
+ }
+ if (d->tail) {
+ int ret = gnet_stats_copy(d, type, NULL, 0, padattr);
+
+ /* The initial attribute added in gnet_stats_copy() may be
+ * preceded by a padding attribute, in which case d->tail will
+ * end up pointing at the padding instead of the real attribute.
+ * Fix this so gnet_stats_finish_copy() adjusts the length of
+ * the right attribute.
+ */
+ if (ret == 0 && d->tail->nla_type == padattr)
+ d->tail = (struct nlattr *)((char *)d->tail +
+ NLA_ALIGN(d->tail->nla_len));
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(gnet_stats_start_copy_compat);
+
+/**
+ * gnet_stats_start_copy - start dumping procedure
+ * @skb: socket buffer to put statistics TLVs into
+ * @type: TLV type for top level statistic TLV
+ * @lock: statistics lock
+ * @d: dumping handle
+ * @padattr: padding attribute
+ *
+ * Initializes the dumping handle, grabs the statistic lock and appends
+ * an empty TLV header to the socket buffer for use as a container for all
+ * other statistic TLVs.
+ *
+ * Returns 0 on success or -1 if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock,
+ struct gnet_dump *d, int padattr)
+{
+ return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d, padattr);
+}
+EXPORT_SYMBOL(gnet_stats_start_copy);
+
+static void
+__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
+ struct gnet_stats_basic_cpu __percpu *cpu)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i);
+ unsigned int start;
+ u64 bytes;
+ u32 packets;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&bcpu->syncp);
+ bytes = bcpu->bstats.bytes;
+ packets = bcpu->bstats.packets;
+ } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start));
+
+ bstats->bytes += bytes;
+ bstats->packets += packets;
+ }
+}
+
+void
+__gnet_stats_copy_basic(const seqcount_t *running,
+ struct gnet_stats_basic_packed *bstats,
+ struct gnet_stats_basic_cpu __percpu *cpu,
+ struct gnet_stats_basic_packed *b)
+{
+ unsigned int seq;
+
+ if (cpu) {
+ __gnet_stats_copy_basic_cpu(bstats, cpu);
+ return;
+ }
+ do {
+ if (running)
+ seq = read_seqcount_begin(running);
+ bstats->bytes = b->bytes;
+ bstats->packets = b->packets;
+ } while (running && read_seqcount_retry(running, seq));
+}
+EXPORT_SYMBOL(__gnet_stats_copy_basic);
+
+/**
+ * gnet_stats_copy_basic - copy basic statistics into statistic TLV
+ * @running: seqcount_t pointer
+ * @d: dumping handle
+ * @cpu: copy statistic per cpu
+ * @b: basic statistics
+ *
+ * Appends the basic statistics to the top level TLV created by
+ * gnet_stats_start_copy().
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_basic(const seqcount_t *running,
+ struct gnet_dump *d,
+ struct gnet_stats_basic_cpu __percpu *cpu,
+ struct gnet_stats_basic_packed *b)
+{
+ struct gnet_stats_basic_packed bstats = {0};
+
+ __gnet_stats_copy_basic(running, &bstats, cpu, b);
+
+ if (d->compat_tc_stats) {
+ d->tc_stats.bytes = bstats.bytes;
+ d->tc_stats.packets = bstats.packets;
+ }
+
+ if (d->tail) {
+ struct gnet_stats_basic sb;
+
+ memset(&sb, 0, sizeof(sb));
+ sb.bytes = bstats.bytes;
+ sb.packets = bstats.packets;
+ return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb),
+ TCA_STATS_PAD);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(gnet_stats_copy_basic);
+
+/**
+ * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
+ * @d: dumping handle
+ * @rate_est: rate estimator
+ *
+ * Appends the rate estimator statistics to the top level TLV created by
+ * gnet_stats_start_copy().
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_rate_est(struct gnet_dump *d,
+ struct net_rate_estimator __rcu **rate_est)
+{
+ struct gnet_stats_rate_est64 sample;
+ struct gnet_stats_rate_est est;
+ int res;
+
+ if (!gen_estimator_read(rate_est, &sample))
+ return 0;
+ est.bps = min_t(u64, UINT_MAX, sample.bps);
+ /* we have some time before reaching 2^32 packets per second */
+ est.pps = sample.pps;
+
+ if (d->compat_tc_stats) {
+ d->tc_stats.bps = est.bps;
+ d->tc_stats.pps = est.pps;
+ }
+
+ if (d->tail) {
+ res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est),
+ TCA_STATS_PAD);
+ if (res < 0 || est.bps == sample.bps)
+ return res;
+ /* emit 64bit stats only if needed */
+ return gnet_stats_copy(d, TCA_STATS_RATE_EST64, &sample,
+ sizeof(sample), TCA_STATS_PAD);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(gnet_stats_copy_rate_est);
+
+static void
+__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats,
+ const struct gnet_stats_queue __percpu *q)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i);
+
+ qstats->backlog += qcpu->backlog;
+ qstats->drops += qcpu->drops;
+ qstats->requeues += qcpu->requeues;
+ qstats->overlimits += qcpu->overlimits;
+ }
+}
+
+void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
+ const struct gnet_stats_queue __percpu *cpu,
+ const struct gnet_stats_queue *q,
+ __u32 qlen)
+{
+ if (cpu) {
+ __gnet_stats_copy_queue_cpu(qstats, cpu);
+ } else {
+ qstats->backlog = q->backlog;
+ qstats->drops = q->drops;
+ qstats->requeues = q->requeues;
+ qstats->overlimits = q->overlimits;
+ }
+
+ qstats->qlen = qlen;
+}
+EXPORT_SYMBOL(__gnet_stats_copy_queue);
+
+/**
+ * gnet_stats_copy_queue - copy queue statistics into statistics TLV
+ * @d: dumping handle
+ * @cpu_q: per cpu queue statistics
+ * @q: queue statistics
+ * @qlen: queue length statistics
+ *
+ * Appends the queue statistics to the top level TLV created by
+ * gnet_stats_start_copy(). Per-cpu queue statistics are used if
+ * they are available.
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_queue(struct gnet_dump *d,
+ struct gnet_stats_queue __percpu *cpu_q,
+ struct gnet_stats_queue *q, __u32 qlen)
+{
+ struct gnet_stats_queue qstats = {0};
+
+ __gnet_stats_copy_queue(&qstats, cpu_q, q, qlen);
+
+ if (d->compat_tc_stats) {
+ d->tc_stats.drops = qstats.drops;
+ d->tc_stats.qlen = qstats.qlen;
+ d->tc_stats.backlog = qstats.backlog;
+ d->tc_stats.overlimits = qstats.overlimits;
+ }
+
+ if (d->tail)
+ return gnet_stats_copy(d, TCA_STATS_QUEUE,
+ &qstats, sizeof(qstats),
+ TCA_STATS_PAD);
+
+ return 0;
+}
+EXPORT_SYMBOL(gnet_stats_copy_queue);
+
+/**
+ * gnet_stats_copy_app - copy application specific statistics into statistics TLV
+ * @d: dumping handle
+ * @st: application specific statistics data
+ * @len: length of data
+ *
+ * Appends the application specific statistics to the top level TLV created by
+ * gnet_stats_start_copy() and remembers the data for XSTATS if the dumping
+ * handle is in backward compatibility mode.
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_app(struct gnet_dump *d, void *st, int len)
+{
+ if (d->compat_xstats) {
+ d->xstats = kmemdup(st, len, GFP_ATOMIC);
+ if (!d->xstats)
+ goto err_out;
+ d->xstats_len = len;
+ }
+
+ if (d->tail)
+ return gnet_stats_copy(d, TCA_STATS_APP, st, len,
+ TCA_STATS_PAD);
+
+ return 0;
+
+err_out:
+ if (d->lock)
+ spin_unlock_bh(d->lock);
+ d->xstats_len = 0;
+ return -1;
+}
+EXPORT_SYMBOL(gnet_stats_copy_app);
+
+/**
+ * gnet_stats_finish_copy - finish dumping procedure
+ * @d: dumping handle
+ *
+ * Corrects the length of the top level TLV to include all TLVs added
+ * by gnet_stats_copy_XXX() calls. Adds the backward compatibility TLVs
+ * if gnet_stats_start_copy_compat() was used and releases the statistics
+ * lock.
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_finish_copy(struct gnet_dump *d)
+{
+ if (d->tail)
+ d->tail->nla_len = skb_tail_pointer(d->skb) - (u8 *)d->tail;
+
+ if (d->compat_tc_stats)
+ if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats,
+ sizeof(d->tc_stats), d->padattr) < 0)
+ return -1;
+
+ if (d->compat_xstats && d->xstats) {
+ if (gnet_stats_copy(d, d->compat_xstats, d->xstats,
+ d->xstats_len, d->padattr) < 0)
+ return -1;
+ }
+
+ if (d->lock)
+ spin_unlock_bh(d->lock);
+ kfree(d->xstats);
+ d->xstats = NULL;
+ d->xstats_len = 0;
+ return 0;
+}
+EXPORT_SYMBOL(gnet_stats_finish_copy);
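+
+/* Putting it together (sketch; bstats, qstats and qlen stand in for the
+ * caller's own counters):
+ *
+ *	struct gnet_dump d;
+ *
+ *	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
+ *					 TCA_XSTATS, NULL, &d, TCA_PAD) < 0)
+ *		goto nla_put_failure;
+ *	if (gnet_stats_copy_basic(NULL, &d, NULL, &bstats) < 0 ||
+ *	    gnet_stats_copy_queue(&d, NULL, &qstats, qlen) < 0 ||
+ *	    gnet_stats_finish_copy(&d) < 0)
+ *		goto nla_put_failure;
+ */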
diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
new file mode 100644
index 000000000..e095fb871
--- /dev/null
+++ b/net/core/gro_cells.c
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <net/gro_cells.h>
+
+struct gro_cell {
+ struct sk_buff_head napi_skbs;
+ struct napi_struct napi;
+};
+
+int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct gro_cell *cell;
+ int res;
+
+ rcu_read_lock();
+ if (unlikely(!(dev->flags & IFF_UP)))
+ goto drop;
+
+ if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) {
+ res = netif_rx(skb);
+ goto unlock;
+ }
+
+ cell = this_cpu_ptr(gcells->cells);
+
+ if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) {
+drop:
+ atomic_long_inc(&dev->rx_dropped);
+ kfree_skb(skb);
+ res = NET_RX_DROP;
+ goto unlock;
+ }
+
+ __skb_queue_tail(&cell->napi_skbs, skb);
+ if (skb_queue_len(&cell->napi_skbs) == 1)
+ napi_schedule(&cell->napi);
+
+ res = NET_RX_SUCCESS;
+
+unlock:
+ rcu_read_unlock();
+ return res;
+}
+EXPORT_SYMBOL(gro_cells_receive);
+
+/* called under BH context */
+static int gro_cell_poll(struct napi_struct *napi, int budget)
+{
+ struct gro_cell *cell = container_of(napi, struct gro_cell, napi);
+ struct sk_buff *skb;
+ int work_done = 0;
+
+ while (work_done < budget) {
+ skb = __skb_dequeue(&cell->napi_skbs);
+ if (!skb)
+ break;
+ napi_gro_receive(napi, skb);
+ work_done++;
+ }
+
+ if (work_done < budget)
+ napi_complete_done(napi, work_done);
+ return work_done;
+}
+
+int gro_cells_init(struct gro_cells *gcells, struct net_device *dev)
+{
+ int i;
+
+ gcells->cells = alloc_percpu(struct gro_cell);
+ if (!gcells->cells)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
+
+ __skb_queue_head_init(&cell->napi_skbs);
+
+ set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state);
+
+ netif_napi_add(dev, &cell->napi, gro_cell_poll,
+ NAPI_POLL_WEIGHT);
+ napi_enable(&cell->napi);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(gro_cells_init);
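+
+/* Usage sketch for a tunnel driver (field names are illustrative): call
+ * gro_cells_init(&priv->gro_cells, dev) once the netdev exists (typically
+ * from ndo_init), hand decapsulated packets to
+ * gro_cells_receive(&priv->gro_cells, skb) on the receive path with skb->dev
+ * already set to the tunnel device, and release everything with
+ * gro_cells_destroy(&priv->gro_cells) from ndo_uninit.
+ */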
+
+void gro_cells_destroy(struct gro_cells *gcells)
+{
+ int i;
+
+ if (!gcells->cells)
+ return;
+ for_each_possible_cpu(i) {
+ struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
+
+ napi_disable(&cell->napi);
+ netif_napi_del(&cell->napi);
+ __skb_queue_purge(&cell->napi_skbs);
+ }
+ free_percpu(gcells->cells);
+ gcells->cells = NULL;
+}
+EXPORT_SYMBOL(gro_cells_destroy);
diff --git a/net/core/hwbm.c b/net/core/hwbm.c
new file mode 100644
index 000000000..2cab489ae
--- /dev/null
+++ b/net/core/hwbm.c
@@ -0,0 +1,90 @@
+/* Support for hardware buffer manager.
+ *
+ * Copyright (C) 2016 Marvell
+ *
+ * Gregory CLEMENT <gregory.clement@free-electrons.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/skbuff.h>
+#include <net/hwbm.h>
+
+void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf)
+{
+ if (likely(bm_pool->frag_size <= PAGE_SIZE))
+ skb_free_frag(buf);
+ else
+ kfree(buf);
+}
+EXPORT_SYMBOL_GPL(hwbm_buf_free);
+
+/* Refill processing for HW buffer management */
+int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp)
+{
+ int frag_size = bm_pool->frag_size;
+ void *buf;
+
+ if (likely(frag_size <= PAGE_SIZE))
+ buf = netdev_alloc_frag(frag_size);
+ else
+ buf = kmalloc(frag_size, gfp);
+
+ if (!buf)
+ return -ENOMEM;
+
+ if (bm_pool->construct)
+ if (bm_pool->construct(bm_pool, buf)) {
+ hwbm_buf_free(bm_pool, buf);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(hwbm_pool_refill);
+
+int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
+{
+ int err, i;
+ unsigned long flags;
+
+ spin_lock_irqsave(&bm_pool->lock, flags);
+ if (bm_pool->buf_num == bm_pool->size) {
+ pr_warn("pool already filled\n");
+ spin_unlock_irqrestore(&bm_pool->lock, flags);
+ return bm_pool->buf_num;
+ }
+
+ if (buf_num + bm_pool->buf_num > bm_pool->size) {
+ pr_warn("cannot allocate %d buffers for pool\n",
+ buf_num);
+ spin_unlock_irqrestore(&bm_pool->lock, flags);
+ return 0;
+ }
+
+ if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) {
+ pr_warn("Adding %d buffers to the %d current buffers will overflow\n",
+ buf_num, bm_pool->buf_num);
+ spin_unlock_irqrestore(&bm_pool->lock, flags);
+ return 0;
+ }
+
+ for (i = 0; i < buf_num; i++) {
+ err = hwbm_pool_refill(bm_pool, gfp);
+ if (err < 0)
+ break;
+ }
+
+ /* Update BM driver with number of buffers added to pool */
+ bm_pool->buf_num += i;
+
+ pr_debug("hwpm pool: %d of %d buffers added\n", i, buf_num);
+ spin_unlock_irqrestore(&bm_pool->lock, flags);
+
+ return i;
+}
+EXPORT_SYMBOL_GPL(hwbm_pool_add);
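+
+/* Caller sketch (illustrative; my_construct is a hypothetical per-buffer
+ * callback): the Ethernet driver prepares the pool before asking for buffers:
+ *
+ *	spin_lock_init(&bm_pool->lock);
+ *	bm_pool->size = num;
+ *	bm_pool->frag_size = frag_size;
+ *	bm_pool->construct = my_construct;
+ *	hwbm_pool_add(bm_pool, num, GFP_KERNEL);
+ */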
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
new file mode 100644
index 000000000..e38e641e9
--- /dev/null
+++ b/net/core/link_watch.c
@@ -0,0 +1,253 @@
+/*
+ * Linux network device link state notification
+ *
+ * Author:
+ * Stefan Rompf <sux@loplof.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+#include <linux/rtnetlink.h>
+#include <linux/jiffies.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+
+
+enum lw_bits {
+ LW_URGENT = 0,
+};
+
+static unsigned long linkwatch_flags;
+static unsigned long linkwatch_nextevent;
+
+static void linkwatch_event(struct work_struct *dummy);
+static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event);
+
+static LIST_HEAD(lweventlist);
+static DEFINE_SPINLOCK(lweventlist_lock);
+
+static unsigned char default_operstate(const struct net_device *dev)
+{
+ if (!netif_carrier_ok(dev))
+ return (dev->ifindex != dev_get_iflink(dev) ?
+ IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN);
+
+ if (netif_dormant(dev))
+ return IF_OPER_DORMANT;
+
+ return IF_OPER_UP;
+}
+
+
+static void rfc2863_policy(struct net_device *dev)
+{
+ unsigned char operstate = default_operstate(dev);
+
+ if (operstate == dev->operstate)
+ return;
+
+ write_lock_bh(&dev_base_lock);
+
+ switch(dev->link_mode) {
+ case IF_LINK_MODE_DORMANT:
+ if (operstate == IF_OPER_UP)
+ operstate = IF_OPER_DORMANT;
+ break;
+
+ case IF_LINK_MODE_DEFAULT:
+ default:
+ break;
+ }
+
+ dev->operstate = operstate;
+
+ write_unlock_bh(&dev_base_lock);
+}
+
+
+void linkwatch_init_dev(struct net_device *dev)
+{
+ /* Handle pre-registration link state changes */
+ if (!netif_carrier_ok(dev) || netif_dormant(dev))
+ rfc2863_policy(dev);
+}
+
+
+static bool linkwatch_urgent_event(struct net_device *dev)
+{
+ if (!netif_running(dev))
+ return false;
+
+ if (dev->ifindex != dev_get_iflink(dev))
+ return true;
+
+ if (netif_is_lag_port(dev) || netif_is_lag_master(dev))
+ return true;
+
+ return netif_carrier_ok(dev) && qdisc_tx_changing(dev);
+}
+
+
+static void linkwatch_add_event(struct net_device *dev)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&lweventlist_lock, flags);
+ if (list_empty(&dev->link_watch_list)) {
+ list_add_tail(&dev->link_watch_list, &lweventlist);
+ dev_hold(dev);
+ }
+ spin_unlock_irqrestore(&lweventlist_lock, flags);
+}
+
+
+static void linkwatch_schedule_work(int urgent)
+{
+ unsigned long delay = linkwatch_nextevent - jiffies;
+
+ if (test_bit(LW_URGENT, &linkwatch_flags))
+ return;
+
+ /* Minimise down-time: drop delay for up event. */
+ if (urgent) {
+ if (test_and_set_bit(LW_URGENT, &linkwatch_flags))
+ return;
+ delay = 0;
+ }
+
+ /* If we wrap around we'll delay it by at most HZ. */
+ if (delay > HZ)
+ delay = 0;
+
+ /*
+ * If urgent, schedule immediate execution; otherwise, don't
+ * override the existing timer.
+ */
+ if (test_bit(LW_URGENT, &linkwatch_flags))
+ mod_delayed_work(system_wq, &linkwatch_work, 0);
+ else
+ schedule_delayed_work(&linkwatch_work, delay);
+}
+
+
+static void linkwatch_do_dev(struct net_device *dev)
+{
+ /*
+ * Make sure the above read is complete since it can be
+ * rewritten as soon as we clear the bit below.
+ */
+ smp_mb__before_atomic();
+
+ /* We are about to handle this device,
+ * so new events can be accepted
+ */
+ clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
+
+ rfc2863_policy(dev);
+ if (dev->flags & IFF_UP) {
+ if (netif_carrier_ok(dev))
+ dev_activate(dev);
+ else
+ dev_deactivate(dev);
+
+ netdev_state_change(dev);
+ }
+ dev_put(dev);
+}
+
+static void __linkwatch_run_queue(int urgent_only)
+{
+ struct net_device *dev;
+ LIST_HEAD(wrk);
+
+ /*
+ * Limit the number of linkwatch events to one
+ * per second so that a runaway driver does not
+ * cause a storm of messages on the netlink
+ * socket. This limit does not apply to up events
+ * while the device qdisc is down.
+ */
+ if (!urgent_only)
+ linkwatch_nextevent = jiffies + HZ;
+ /* Limit wrap-around effect on delay. */
+ else if (time_after(linkwatch_nextevent, jiffies + HZ))
+ linkwatch_nextevent = jiffies;
+
+ clear_bit(LW_URGENT, &linkwatch_flags);
+
+ spin_lock_irq(&lweventlist_lock);
+ list_splice_init(&lweventlist, &wrk);
+
+ while (!list_empty(&wrk)) {
+
+ dev = list_first_entry(&wrk, struct net_device, link_watch_list);
+ list_del_init(&dev->link_watch_list);
+
+ if (urgent_only && !linkwatch_urgent_event(dev)) {
+ list_add_tail(&dev->link_watch_list, &lweventlist);
+ continue;
+ }
+ spin_unlock_irq(&lweventlist_lock);
+ linkwatch_do_dev(dev);
+ spin_lock_irq(&lweventlist_lock);
+ }
+
+ if (!list_empty(&lweventlist))
+ linkwatch_schedule_work(0);
+ spin_unlock_irq(&lweventlist_lock);
+}
+
+void linkwatch_forget_dev(struct net_device *dev)
+{
+ unsigned long flags;
+ int clean = 0;
+
+ spin_lock_irqsave(&lweventlist_lock, flags);
+ if (!list_empty(&dev->link_watch_list)) {
+ list_del_init(&dev->link_watch_list);
+ clean = 1;
+ }
+ spin_unlock_irqrestore(&lweventlist_lock, flags);
+ if (clean)
+ linkwatch_do_dev(dev);
+}
+
+
+/* Must be called with the rtnl semaphore held */
+void linkwatch_run_queue(void)
+{
+ __linkwatch_run_queue(0);
+}
+
+
+static void linkwatch_event(struct work_struct *dummy)
+{
+ rtnl_lock();
+ __linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies));
+ rtnl_unlock();
+}
+
+
+void linkwatch_fire_event(struct net_device *dev)
+{
+ bool urgent = linkwatch_urgent_event(dev);
+
+ if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
+ linkwatch_add_event(dev);
+ } else if (!urgent)
+ return;
+
+ linkwatch_schedule_work(urgent);
+}
+EXPORT_SYMBOL(linkwatch_fire_event);
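+
+/* Drivers rarely call linkwatch_fire_event() directly; it is normally fired
+ * via netif_carrier_on()/netif_carrier_off(), so reporting a link change is
+ * simply:
+ *
+ *	if (link_up)
+ *		netif_carrier_on(dev);
+ *	else
+ *		netif_carrier_off(dev);
+ *
+ * The deferred work scheduled above then updates the operstate and sends the
+ * rtnetlink notification.
+ */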
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
new file mode 100644
index 000000000..4a5f4fbff
--- /dev/null
+++ b/net/core/lwt_bpf.c
@@ -0,0 +1,398 @@
+/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <net/lwtunnel.h>
+
+struct bpf_lwt_prog {
+ struct bpf_prog *prog;
+ char *name;
+};
+
+struct bpf_lwt {
+ struct bpf_lwt_prog in;
+ struct bpf_lwt_prog out;
+ struct bpf_lwt_prog xmit;
+ int family;
+};
+
+#define MAX_PROG_NAME 256
+
+static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return (struct bpf_lwt *)lwt->data;
+}
+
+#define NO_REDIRECT false
+#define CAN_REDIRECT true
+
+static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
+ struct dst_entry *dst, bool can_redirect)
+{
+ int ret;
+
+ /* Preempt disable and BH disable are needed to protect per-cpu
+ * redirect_info between BPF prog and skb_do_redirect().
+ */
+ preempt_disable();
+ local_bh_disable();
+ bpf_compute_data_pointers(skb);
+ ret = bpf_prog_run_save_cb(lwt->prog, skb);
+
+ switch (ret) {
+ case BPF_OK:
+ break;
+
+ case BPF_REDIRECT:
+ if (unlikely(!can_redirect)) {
+ pr_warn_once("Illegal redirect return code in prog %s\n",
+ lwt->name ? : "<unknown>");
+ ret = BPF_OK;
+ } else {
+ skb_reset_mac_header(skb);
+ ret = skb_do_redirect(skb);
+ if (ret == 0)
+ ret = BPF_REDIRECT;
+ }
+ break;
+
+ case BPF_DROP:
+ kfree_skb(skb);
+ ret = -EPERM;
+ break;
+
+ default:
+ pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret);
+ kfree_skb(skb);
+ ret = -EINVAL;
+ break;
+ }
+
+ local_bh_enable();
+ preempt_enable();
+
+ return ret;
+}
+
+static int bpf_input(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+ int ret;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->in.prog) {
+ ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (unlikely(!dst->lwtstate->orig_input)) {
+ pr_warn_once("orig_input not set on dst for prog %s\n",
+ bpf->out.name);
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return dst->lwtstate->orig_input(skb);
+}
+
+static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+ int ret;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->out.prog) {
+ ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (unlikely(!dst->lwtstate->orig_output)) {
+ pr_warn_once("orig_output not set on dst for prog %s\n",
+ bpf->out.name);
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return dst->lwtstate->orig_output(net, sk, skb);
+}
+
+static int xmit_check_hhlen(struct sk_buff *skb)
+{
+ int hh_len = skb_dst(skb)->dev->hard_header_len;
+
+ if (skb_headroom(skb) < hh_len) {
+ int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
+
+ if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int bpf_xmit(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->xmit.prog) {
+ int ret;
+
+ ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
+ switch (ret) {
+ case BPF_OK:
+ /* If the header was expanded, headroom might be too
+ * small for the L2 header that will be added next;
+ * expand as needed.
+ */
+ ret = xmit_check_hhlen(skb);
+ if (unlikely(ret))
+ return ret;
+
+ return LWTUNNEL_XMIT_CONTINUE;
+ case BPF_REDIRECT:
+ return LWTUNNEL_XMIT_DONE;
+ default:
+ return ret;
+ }
+ }
+
+ return LWTUNNEL_XMIT_CONTINUE;
+}
+
+static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
+{
+ if (prog->prog)
+ bpf_prog_put(prog->prog);
+
+ kfree(prog->name);
+}
+
+static void bpf_destroy_state(struct lwtunnel_state *lwt)
+{
+ struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+ bpf_lwt_prog_destroy(&bpf->in);
+ bpf_lwt_prog_destroy(&bpf->out);
+ bpf_lwt_prog_destroy(&bpf->xmit);
+}
+
+static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
+ [LWT_BPF_PROG_FD] = { .type = NLA_U32, },
+ [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
+ .len = MAX_PROG_NAME },
+};
+
+static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
+ enum bpf_prog_type type)
+{
+ struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
+ struct bpf_prog *p;
+ int ret;
+ u32 fd;
+
+ ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy,
+ NULL);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
+ return -EINVAL;
+
+ prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC);
+ if (!prog->name)
+ return -ENOMEM;
+
+ fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
+ p = bpf_prog_get_type(fd, type);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ prog->prog = p;
+
+ return 0;
+}
+
+static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
+ [LWT_BPF_IN] = { .type = NLA_NESTED, },
+ [LWT_BPF_OUT] = { .type = NLA_NESTED, },
+ [LWT_BPF_XMIT] = { .type = NLA_NESTED, },
+ [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 },
+};
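+/* Netlink layout consumed by bpf_build_state() below: the RTA_ENCAP payload
+ * carries a nested attribute per hook (LWT_BPF_IN, LWT_BPF_OUT,
+ * LWT_BPF_XMIT), each holding LWT_BPF_PROG_FD and LWT_BPF_PROG_NAME, plus an
+ * optional LWT_BPF_XMIT_HEADROOM limited to LWT_BPF_MAX_HEADROOM.
+ */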
+
+static int bpf_build_state(struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[LWT_BPF_MAX + 1];
+ struct lwtunnel_state *newts;
+ struct bpf_lwt *bpf;
+ int ret;
+
+ if (family != AF_INET && family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
+ return -EINVAL;
+
+ newts = lwtunnel_state_alloc(sizeof(*bpf));
+ if (!newts)
+ return -ENOMEM;
+
+ newts->type = LWTUNNEL_ENCAP_BPF;
+ bpf = bpf_lwt_lwtunnel(newts);
+
+ if (tb[LWT_BPF_IN]) {
+ newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
+ BPF_PROG_TYPE_LWT_IN);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_OUT]) {
+ newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
+ BPF_PROG_TYPE_LWT_OUT);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_XMIT]) {
+ newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
+ BPF_PROG_TYPE_LWT_XMIT);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_XMIT_HEADROOM]) {
+ u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);
+
+ if (headroom > LWT_BPF_MAX_HEADROOM) {
+ ret = -ERANGE;
+ goto errout;
+ }
+
+ newts->headroom = headroom;
+ }
+
+ bpf->family = family;
+ *ts = newts;
+
+ return 0;
+
+errout:
+ bpf_destroy_state(newts);
+ kfree(newts);
+ return ret;
+}
+
+static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
+ struct bpf_lwt_prog *prog)
+{
+ struct nlattr *nest;
+
+ if (!prog->prog)
+ return 0;
+
+ nest = nla_nest_start(skb, attr);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (prog->name &&
+ nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
+ return -EMSGSIZE;
+
+ return nla_nest_end(skb, nest);
+}
+
+static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
+{
+ struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+ if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
+ bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
+ bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ int nest_len = nla_total_size(sizeof(struct nlattr)) +
+ nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
+ 0;
+
+ return nest_len + /* LWT_BPF_IN */
+ nest_len + /* LWT_BPF_OUT */
+ nest_len + /* LWT_BPF_XMIT */
+ 0;
+}
+
+static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
+{
+	/* FIXME:
+	 * The LWT state is currently rebuilt for delete requests, which
+	 * results in a new bpf_prog instance, so compare program names
+	 * for now.
+	 */
+ if (!a->name && !b->name)
+ return 0;
+
+ if (!a->name || !b->name)
+ return 1;
+
+ return strcmp(a->name, b->name);
+}
+
+static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
+ struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);
+
+ return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
+ bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
+ bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
+}
+
+static const struct lwtunnel_encap_ops bpf_encap_ops = {
+ .build_state = bpf_build_state,
+ .destroy_state = bpf_destroy_state,
+ .input = bpf_input,
+ .output = bpf_output,
+ .xmit = bpf_xmit,
+ .fill_encap = bpf_fill_encap_info,
+ .get_encap_size = bpf_encap_nlsize,
+ .cmp_encap = bpf_encap_cmp,
+ .owner = THIS_MODULE,
+};
+
+static int __init bpf_lwt_init(void)
+{
+ return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
+}
+
+subsys_initcall(bpf_lwt_init)
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
new file mode 100644
index 000000000..0b1717564
--- /dev/null
+++ b/net/core/lwtunnel.c
@@ -0,0 +1,417 @@
+/*
+ * lwtunnel	Infrastructure for lightweight tunnels such as MPLS
+ *
+ * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/lwtunnel.h>
+#include <linux/in.h>
+#include <linux/init.h>
+#include <linux/err.h>
+
+#include <net/lwtunnel.h>
+#include <net/rtnetlink.h>
+#include <net/ip6_fib.h>
+#include <net/nexthop.h>
+
+#ifdef CONFIG_MODULES
+
+static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
+{
+	/* Only lwt encap types implemented without a dedicated interface
+	 * need to return a string here; it is used to build the
+	 * "rtnl-lwt-<type>" module alias for autoloading.
+	 */
+ switch (encap_type) {
+ case LWTUNNEL_ENCAP_MPLS:
+ return "MPLS";
+ case LWTUNNEL_ENCAP_ILA:
+ return "ILA";
+ case LWTUNNEL_ENCAP_SEG6:
+ return "SEG6";
+ case LWTUNNEL_ENCAP_BPF:
+ return "BPF";
+ case LWTUNNEL_ENCAP_SEG6_LOCAL:
+ return "SEG6LOCAL";
+ case LWTUNNEL_ENCAP_IP6:
+ case LWTUNNEL_ENCAP_IP:
+ case LWTUNNEL_ENCAP_NONE:
+ case __LWTUNNEL_ENCAP_MAX:
+ /* should not have got here */
+ WARN_ON(1);
+ break;
+ }
+ return NULL;
+}
+
+#endif /* CONFIG_MODULES */
+
+struct lwtunnel_state *lwtunnel_state_alloc(int encap_len)
+{
+ struct lwtunnel_state *lws;
+
+ lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC);
+
+ return lws;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_state_alloc);
+
+static const struct lwtunnel_encap_ops __rcu *
+ lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly;
+
+int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops,
+ unsigned int num)
+{
+ if (num > LWTUNNEL_ENCAP_MAX)
+ return -ERANGE;
+
+ return !cmpxchg((const struct lwtunnel_encap_ops **)
+ &lwtun_encaps[num],
+ NULL, ops) ? 0 : -1;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_encap_add_ops);
+
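+/* Unregister @ops from slot @encap_type only if it is still the registered
+ * handler.  synchronize_net() then waits for concurrent RCU readers of
+ * lwtun_encaps[] to finish before the caller may free module state.
+ */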
+int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
+ unsigned int encap_type)
+{
+ int ret;
+
+ if (encap_type == LWTUNNEL_ENCAP_NONE ||
+ encap_type > LWTUNNEL_ENCAP_MAX)
+ return -ERANGE;
+
+ ret = (cmpxchg((const struct lwtunnel_encap_ops **)
+ &lwtun_encaps[encap_type],
+ ops, NULL) == ops) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_encap_del_ops);
+
+int lwtunnel_build_state(u16 encap_type,
+ struct nlattr *encap, unsigned int family,
+ const void *cfg, struct lwtunnel_state **lws,
+ struct netlink_ext_ack *extack)
+{
+ const struct lwtunnel_encap_ops *ops;
+ bool found = false;
+ int ret = -EINVAL;
+
+ if (encap_type == LWTUNNEL_ENCAP_NONE ||
+ encap_type > LWTUNNEL_ENCAP_MAX) {
+ NL_SET_ERR_MSG_ATTR(extack, encap,
+ "Unknown LWT encapsulation type");
+ return ret;
+ }
+
+ ret = -EOPNOTSUPP;
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[encap_type]);
+ if (likely(ops && ops->build_state && try_module_get(ops->owner))) {
+ found = true;
+ ret = ops->build_state(encap, family, cfg, lws, extack);
+ if (ret)
+ module_put(ops->owner);
+ }
+ rcu_read_unlock();
+
+	/* Do not rely on -EOPNOTSUPP to detect a missing handler, since
+	 * build_state handlers may return it themselves.
+	 */
+ if (!found) {
+ NL_SET_ERR_MSG_ATTR(extack, encap,
+ "LWT encapsulation type not supported");
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_build_state);
+
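+/* Check that a userspace-supplied encap type has a registered handler.  If
+ * not, and the type maps to a module alias via lwtunnel_encap_str(), drop
+ * the RTNL lock, request the "rtnl-lwt-<type>" module and re-take RTNL
+ * before checking again.
+ */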
+int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack)
+{
+ const struct lwtunnel_encap_ops *ops;
+ int ret = -EINVAL;
+
+ if (encap_type == LWTUNNEL_ENCAP_NONE ||
+ encap_type > LWTUNNEL_ENCAP_MAX) {
+ NL_SET_ERR_MSG(extack, "Unknown lwt encapsulation type");
+ return ret;
+ }
+
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[encap_type]);
+ rcu_read_unlock();
+#ifdef CONFIG_MODULES
+ if (!ops) {
+ const char *encap_type_str = lwtunnel_encap_str(encap_type);
+
+ if (encap_type_str) {
+ __rtnl_unlock();
+ request_module("rtnl-lwt-%s", encap_type_str);
+ rtnl_lock();
+
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[encap_type]);
+ rcu_read_unlock();
+ }
+ }
+#endif
+ ret = ops ? 0 : -EOPNOTSUPP;
+ if (ret < 0)
+ NL_SET_ERR_MSG(extack, "lwt encapsulation type not supported");
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type);
+
+int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining,
+ struct netlink_ext_ack *extack)
+{
+ struct rtnexthop *rtnh = (struct rtnexthop *)attr;
+ struct nlattr *nla_entype;
+ struct nlattr *attrs;
+ u16 encap_type;
+ int attrlen;
+
+ while (rtnh_ok(rtnh, remaining)) {
+ attrlen = rtnh_attrlen(rtnh);
+ if (attrlen > 0) {
+ attrs = rtnh_attrs(rtnh);
+ nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
+
+ if (nla_entype) {
+ encap_type = nla_get_u16(nla_entype);
+
+ if (lwtunnel_valid_encap_type(encap_type,
+ extack) != 0)
+ return -EOPNOTSUPP;
+ }
+ }
+ rtnh = rtnh_next(rtnh, &remaining);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type_attr);
+
+void lwtstate_free(struct lwtunnel_state *lws)
+{
+ const struct lwtunnel_encap_ops *ops = lwtun_encaps[lws->type];
+
+ if (ops->destroy_state) {
+ ops->destroy_state(lws);
+ kfree_rcu(lws, rcu);
+ } else {
+ kfree(lws);
+ }
+ module_put(ops->owner);
+}
+EXPORT_SYMBOL_GPL(lwtstate_free);
+
+int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate)
+{
+ const struct lwtunnel_encap_ops *ops;
+ struct nlattr *nest;
+ int ret;
+
+ if (!lwtstate)
+ return 0;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ nest = nla_nest_start(skb, RTA_ENCAP);
+ if (!nest)
+ return -EMSGSIZE;
+
+ ret = -EOPNOTSUPP;
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->fill_encap))
+ ret = ops->fill_encap(skb, lwtstate);
+ rcu_read_unlock();
+
+ if (ret)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+ ret = nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type);
+ if (ret)
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+
+ return (ret == -EOPNOTSUPP ? 0 : ret);
+}
+EXPORT_SYMBOL_GPL(lwtunnel_fill_encap);
+
+int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate)
+{
+ const struct lwtunnel_encap_ops *ops;
+ int ret = 0;
+
+ if (!lwtstate)
+ return 0;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->get_encap_size))
+ ret = nla_total_size(ops->get_encap_size(lwtstate));
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_get_encap_size);
+
+int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ const struct lwtunnel_encap_ops *ops;
+ int ret = 0;
+
+ if (!a && !b)
+ return 0;
+
+ if (!a || !b)
+ return 1;
+
+ if (a->type != b->type)
+ return 1;
+
+ if (a->type == LWTUNNEL_ENCAP_NONE ||
+ a->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[a->type]);
+ if (likely(ops && ops->cmp_encap))
+ ret = ops->cmp_encap(a, b);
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap);
+
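+/* lwtunnel_output(), lwtunnel_xmit() and lwtunnel_input() below dispatch the
+ * skb to the encap handler registered for its dst's lwtstate under RCU.  The
+ * skb is dropped if it has no dst or if no handler is registered; an encap
+ * type of LWTUNNEL_ENCAP_NONE is left alone and 0 is returned.
+ */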
+int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ const struct lwtunnel_encap_ops *ops;
+ struct lwtunnel_state *lwtstate;
+ int ret = -EINVAL;
+
+ if (!dst)
+ goto drop;
+ lwtstate = dst->lwtstate;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ ret = -EOPNOTSUPP;
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->output))
+ ret = ops->output(net, sk, skb);
+ rcu_read_unlock();
+
+ if (ret == -EOPNOTSUPP)
+ goto drop;
+
+ return ret;
+
+drop:
+ kfree_skb(skb);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_output);
+
+int lwtunnel_xmit(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ const struct lwtunnel_encap_ops *ops;
+ struct lwtunnel_state *lwtstate;
+ int ret = -EINVAL;
+
+ if (!dst)
+ goto drop;
+
+ lwtstate = dst->lwtstate;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ ret = -EOPNOTSUPP;
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->xmit))
+ ret = ops->xmit(skb);
+ rcu_read_unlock();
+
+ if (ret == -EOPNOTSUPP)
+ goto drop;
+
+ return ret;
+
+drop:
+ kfree_skb(skb);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_xmit);
+
+int lwtunnel_input(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ const struct lwtunnel_encap_ops *ops;
+ struct lwtunnel_state *lwtstate;
+ int ret = -EINVAL;
+
+ if (!dst)
+ goto drop;
+ lwtstate = dst->lwtstate;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ ret = -EOPNOTSUPP;
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->input))
+ ret = ops->input(skb);
+ rcu_read_unlock();
+
+ if (ret == -EOPNOTSUPP)
+ goto drop;
+
+ return ret;
+
+drop:
+ kfree_skb(skb);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_input);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
new file mode 100644
index 000000000..6233e9856
--- /dev/null
+++ b/net/core/neighbour.c
@@ -0,0 +1,3300 @@
+/*
+ * Generic address resolution entity
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Vitaly E. Lavrov releasing NULL neighbor in neigh_add.
+ * Harald Welte Add neighbour cache statistics like rtstat
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/kmemleak.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+#include <linux/times.h>
+#include <net/net_namespace.h>
+#include <net/neighbour.h>
+#include <net/arp.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/netevent.h>
+#include <net/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/log2.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+
+#define DEBUG
+#define NEIGH_DEBUG 1
+#define neigh_dbg(level, fmt, ...) \
+do { \
+ if (level <= NEIGH_DEBUG) \
+ pr_debug(fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define PNEIGH_HASHMASK 0xF
+
+static void neigh_timer_handler(struct timer_list *t);
+static void __neigh_notify(struct neighbour *n, int type, int flags,
+ u32 pid);
+static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid);
+static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
+ struct net_device *dev);
+
+#ifdef CONFIG_PROC_FS
+static const struct seq_operations neigh_stat_seq_ops;
+#endif
+
+/*
+   Neighbour hash table buckets are protected by the rwlock tbl->lock.
+
+   - All scans of and updates to the hash buckets MUST be made under
+     this lock.
+   - Nothing clever may be done under this lock: no callbacks into
+     protocol backends, no attempts to send anything to the network.
+     Doing so will deadlock if the backend/driver wants to use the
+     neighbour cache.
+   - If an entry requires non-trivial actions, take a reference on it
+     and release the table lock first.
+
+   Neighbour entries themselves are protected:
+   - by their reference count.
+   - by the rwlock neigh->lock.
+
+   The reference count prevents destruction.
+
+   neigh->lock mainly serializes the link-layer address data and its
+   validity state.  The same lock also protects other entry fields:
+   - the timer
+   - the resolution queue
+
+   Again, nothing clever may be done under neigh->lock; the most
+   complicated operation allowed there is dev->hard_header.  It is
+   assumed that dev->hard_header is simple and does not call back into
+   the neighbour tables.
+ */
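+
+/* A minimal sketch (for illustration only, not a real caller) of the rule
+ * above: take a reference under the table lock, drop the lock, then do any
+ * heavy work outside it:
+ *
+ *	write_lock_bh(&tbl->lock);
+ *	n = <lookup in hash bucket>;
+ *	neigh_hold(n);
+ *	write_unlock_bh(&tbl->lock);
+ *	<send packets, call into the driver, ...>;
+ *	neigh_release(n);
+ */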
+
+static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)
+{
+ kfree_skb(skb);
+ return -ENETDOWN;
+}
+
+static void neigh_cleanup_and_release(struct neighbour *neigh)
+{
+ if (neigh->parms->neigh_cleanup)
+ neigh->parms->neigh_cleanup(neigh);
+
+ __neigh_notify(neigh, RTM_DELNEIGH, 0, 0);
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
+ neigh_release(neigh);
+}
+
+/*
+ * Returns a value uniformly distributed in the interval
+ * (1/2)*base ... (3/2)*base.  This matches the default IPv6 behaviour
+ * and is not overridable, because it is a genuinely reasonable choice.
+ */
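+/* For example, with base = 30 * HZ the returned delay is uniformly
+ * distributed over [15 * HZ, 45 * HZ).
+ */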
+
+unsigned long neigh_rand_reach_time(unsigned long base)
+{
+ return base ? (prandom_u32() % base) + (base >> 1) : 0;
+}
+EXPORT_SYMBOL(neigh_rand_reach_time);
+
+
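+/* Unlink *np (== n) from its hash chain and mark it dead, but only if no one
+ * else holds a reference and the entry matches neither the @state nor the
+ * @flags mask.  Called with tbl->lock held for writing; returns true if the
+ * entry was removed and released.
+ */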
+static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags,
+ struct neighbour __rcu **np, struct neigh_table *tbl)
+{
+ bool retval = false;
+
+ write_lock(&n->lock);
+ if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state) &&
+ !(n->flags & flags)) {
+ struct neighbour *neigh;
+
+ neigh = rcu_dereference_protected(n->next,
+ lockdep_is_held(&tbl->lock));
+ rcu_assign_pointer(*np, neigh);
+ n->dead = 1;
+ retval = true;
+ }
+ write_unlock(&n->lock);
+ if (retval)
+ neigh_cleanup_and_release(n);
+ return retval;
+}
+
+bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl)
+{
+ struct neigh_hash_table *nht;
+ void *pkey = ndel->primary_key;
+ u32 hash_val;
+ struct neighbour *n;
+ struct neighbour __rcu **np;
+
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+ hash_val = tbl->hash(pkey, ndel->dev, nht->hash_rnd);
+ hash_val = hash_val >> (32 - nht->hash_shift);
+
+ np = &nht->hash_buckets[hash_val];
+ while ((n = rcu_dereference_protected(*np,
+ lockdep_is_held(&tbl->lock)))) {
+ if (n == ndel)
+ return neigh_del(n, 0, 0, np, tbl);
+ np = &n->next;
+ }
+ return false;
+}
+
+static int neigh_forced_gc(struct neigh_table *tbl)
+{
+ int shrunk = 0;
+ int i;
+ struct neigh_hash_table *nht;
+
+ NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
+
+ write_lock_bh(&tbl->lock);
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+ for (i = 0; i < (1 << nht->hash_shift); i++) {
+ struct neighbour *n;
+ struct neighbour __rcu **np;
+
+ np = &nht->hash_buckets[i];
+ while ((n = rcu_dereference_protected(*np,
+ lockdep_is_held(&tbl->lock))) != NULL) {
+			/* A neighbour record may be discarded if:
+			 * - nobody holds a reference to it,
+			 * - it is not permanent, and
+			 * - it is not externally learned.
+			 */
+ if (neigh_del(n, NUD_PERMANENT, NTF_EXT_LEARNED, np,
+ tbl)) {
+ shrunk = 1;
+ continue;
+ }
+ np = &n->next;
+ }
+ }
+
+ tbl->last_flush = jiffies;
+
+ write_unlock_bh(&tbl->lock);
+
+ return shrunk;
+}
+
+static void neigh_add_timer(struct neighbour *n, unsigned long when)
+{
+ neigh_hold(n);
+ if (unlikely(mod_timer(&n->timer, when))) {
+ printk("NEIGH: BUG, double timer add, state is %x\n",
+ n->nud_state);
+ dump_stack();
+ }
+}
+
+static int neigh_del_timer(struct neighbour *n)
+{
+ if ((n->nud_state & NUD_IN_TIMER) &&
+ del_timer(&n->timer)) {
+ neigh_release(n);
+ return 1;
+ }
+ return 0;
+}
+
+static void pneigh_queue_purge(struct sk_buff_head *list)
+{
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(list)) != NULL) {
+ dev_put(skb->dev);
+ kfree_skb(skb);
+ }
+}
+
+static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
+{
+ int i;
+ struct neigh_hash_table *nht;
+
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+
+ for (i = 0; i < (1 << nht->hash_shift); i++) {
+ struct neighbour *n;
+ struct neighbour __rcu **np = &nht->hash_buckets[i];
+
+ while ((n = rcu_dereference_protected(*np,
+ lockdep_is_held(&tbl->lock))) != NULL) {
+ if (dev && n->dev != dev) {
+ np = &n->next;
+ continue;
+ }
+ rcu_assign_pointer(*np,
+ rcu_dereference_protected(n->next,
+ lockdep_is_held(&tbl->lock)));
+ write_lock(&n->lock);
+ neigh_del_timer(n);
+ n->dead = 1;
+
+ if (refcount_read(&n->refcnt) != 1) {
+				/* The most unpleasant situation: we must
+				   destroy the neighbour entry, but someone
+				   still uses it.
+
+				   Destruction will be delayed until the
+				   last user drops its reference, but we
+				   must kill the timers etc. and move the
+				   entry into a safe state now.
+				 */
+ __skb_queue_purge(&n->arp_queue);
+ n->arp_queue_len_bytes = 0;
+ n->output = neigh_blackhole;
+ if (n->nud_state & NUD_VALID)
+ n->nud_state = NUD_NOARP;
+ else
+ n->nud_state = NUD_NONE;
+ neigh_dbg(2, "neigh %p is stray\n", n);
+ }
+ write_unlock(&n->lock);
+ neigh_cleanup_and_release(n);
+ }
+ }
+}
+
+void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev)
+{
+ write_lock_bh(&tbl->lock);
+ neigh_flush_dev(tbl, dev);
+ write_unlock_bh(&tbl->lock);
+}
+EXPORT_SYMBOL(neigh_changeaddr);
+
+int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+{
+ write_lock_bh(&tbl->lock);
+ neigh_flush_dev(tbl, dev);
+ pneigh_ifdown_and_unlock(tbl, dev);
+
+ del_timer_sync(&tbl->proxy_timer);
+ pneigh_queue_purge(&tbl->proxy_queue);
+ return 0;
+}
+EXPORT_SYMBOL(neigh_ifdown);
+
+static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
+{
+ struct neighbour *n = NULL;
+ unsigned long now = jiffies;
+ int entries;
+
+ entries = atomic_inc_return(&tbl->entries) - 1;
+ if (entries >= tbl->gc_thresh3 ||
+ (entries >= tbl->gc_thresh2 &&
+ time_after(now, tbl->last_flush + 5 * HZ))) {
+ if (!neigh_forced_gc(tbl) &&
+ entries >= tbl->gc_thresh3) {
+ net_info_ratelimited("%s: neighbor table overflow!\n",
+ tbl->id);
+ NEIGH_CACHE_STAT_INC(tbl, table_fulls);
+ goto out_entries;
+ }
+ }
+
+ n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
+ if (!n)
+ goto out_entries;
+
+ __skb_queue_head_init(&n->arp_queue);
+ rwlock_init(&n->lock);
+ seqlock_init(&n->ha_lock);
+ n->updated = n->used = now;
+ n->nud_state = NUD_NONE;
+ n->output = neigh_blackhole;
+ seqlock_init(&n->hh.hh_lock);
+ n->parms = neigh_parms_clone(&tbl->parms);
+ timer_setup(&n->timer, neigh_timer_handler, 0);
+
+ NEIGH_CACHE_STAT_INC(tbl, allocs);
+ n->tbl = tbl;
+ refcount_set(&n->refcnt, 1);
+ n->dead = 1;
+out:
+ return n;
+
+out_entries:
+ atomic_dec(&tbl->entries);
+ goto out;
+}
+
+static void neigh_get_hash_rnd(u32 *x)
+{
+ *x = get_random_u32() | 1;
+}
+
+static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
+{
+ size_t size = (1 << shift) * sizeof(struct neighbour *);
+ struct neigh_hash_table *ret;
+ struct neighbour __rcu **buckets;
+ int i;
+
+ ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
+ if (!ret)
+ return NULL;
+ if (size <= PAGE_SIZE) {
+ buckets = kzalloc(size, GFP_ATOMIC);
+ } else {
+ buckets = (struct neighbour __rcu **)
+ __get_free_pages(GFP_ATOMIC | __GFP_ZERO,
+ get_order(size));
+ kmemleak_alloc(buckets, size, 1, GFP_ATOMIC);
+ }
+ if (!buckets) {
+ kfree(ret);
+ return NULL;
+ }
+ ret->hash_buckets = buckets;
+ ret->hash_shift = shift;
+ for (i = 0; i < NEIGH_NUM_HASH_RND; i++)
+ neigh_get_hash_rnd(&ret->hash_rnd[i]);
+ return ret;
+}
+
+static void neigh_hash_free_rcu(struct rcu_head *head)
+{
+ struct neigh_hash_table *nht = container_of(head,
+ struct neigh_hash_table,
+ rcu);
+ size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *);
+ struct neighbour __rcu **buckets = nht->hash_buckets;
+
+ if (size <= PAGE_SIZE) {
+ kfree(buckets);
+ } else {
+ kmemleak_free(buckets);
+ free_pages((unsigned long)buckets, get_order(size));
+ }
+ kfree(nht);
+}
+
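+/* Rehash all entries into a new table of 2^new_shift buckets.  Called with
+ * tbl->lock held for writing; the new table is published with
+ * rcu_assign_pointer() and the old one is freed after a grace period, so
+ * lockless readers keep working throughout.  Returns the old table if the
+ * allocation fails.
+ */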
+static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
+ unsigned long new_shift)
+{
+ unsigned int i, hash;
+ struct neigh_hash_table *new_nht, *old_nht;
+
+ NEIGH_CACHE_STAT_INC(tbl, hash_grows);
+
+ old_nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+ new_nht = neigh_hash_alloc(new_shift);
+ if (!new_nht)
+ return old_nht;
+
+ for (i = 0; i < (1 << old_nht->hash_shift); i++) {
+ struct neighbour *n, *next;
+
+ for (n = rcu_dereference_protected(old_nht->hash_buckets[i],
+ lockdep_is_held(&tbl->lock));
+ n != NULL;
+ n = next) {
+ hash = tbl->hash(n->primary_key, n->dev,
+ new_nht->hash_rnd);
+
+ hash >>= (32 - new_nht->hash_shift);
+ next = rcu_dereference_protected(n->next,
+ lockdep_is_held(&tbl->lock));
+
+ rcu_assign_pointer(n->next,
+ rcu_dereference_protected(
+ new_nht->hash_buckets[hash],
+ lockdep_is_held(&tbl->lock)));
+ rcu_assign_pointer(new_nht->hash_buckets[hash], n);
+ }
+ }
+
+ rcu_assign_pointer(tbl->nht, new_nht);
+ call_rcu(&old_nht->rcu, neigh_hash_free_rcu);
+ return new_nht;
+}
+
+struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
+ struct net_device *dev)
+{
+ struct neighbour *n;
+
+ NEIGH_CACHE_STAT_INC(tbl, lookups);
+
+ rcu_read_lock_bh();
+ n = __neigh_lookup_noref(tbl, pkey, dev);
+ if (n) {
+ if (!refcount_inc_not_zero(&n->refcnt))
+ n = NULL;
+ NEIGH_CACHE_STAT_INC(tbl, hits);
+ }
+
+ rcu_read_unlock_bh();
+ return n;
+}
+EXPORT_SYMBOL(neigh_lookup);
+
+struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
+ const void *pkey)
+{
+ struct neighbour *n;
+ unsigned int key_len = tbl->key_len;
+ u32 hash_val;
+ struct neigh_hash_table *nht;
+
+ NEIGH_CACHE_STAT_INC(tbl, lookups);
+
+ rcu_read_lock_bh();
+ nht = rcu_dereference_bh(tbl->nht);
+ hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) >> (32 - nht->hash_shift);
+
+ for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
+ n != NULL;
+ n = rcu_dereference_bh(n->next)) {
+ if (!memcmp(n->primary_key, pkey, key_len) &&
+ net_eq(dev_net(n->dev), net)) {
+ if (!refcount_inc_not_zero(&n->refcnt))
+ n = NULL;
+ NEIGH_CACHE_STAT_INC(tbl, hits);
+ break;
+ }
+ }
+
+ rcu_read_unlock_bh();
+ return n;
+}
+EXPORT_SYMBOL(neigh_lookup_nodev);
+
+struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
+ struct net_device *dev, bool want_ref)
+{
+ u32 hash_val;
+ unsigned int key_len = tbl->key_len;
+ int error;
+ struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
+ struct neigh_hash_table *nht;
+
+ if (!n) {
+ rc = ERR_PTR(-ENOBUFS);
+ goto out;
+ }
+
+ memcpy(n->primary_key, pkey, key_len);
+ n->dev = dev;
+ dev_hold(dev);
+
+ /* Protocol specific setup. */
+ if (tbl->constructor && (error = tbl->constructor(n)) < 0) {
+ rc = ERR_PTR(error);
+ goto out_neigh_release;
+ }
+
+ if (dev->netdev_ops->ndo_neigh_construct) {
+ error = dev->netdev_ops->ndo_neigh_construct(dev, n);
+ if (error < 0) {
+ rc = ERR_PTR(error);
+ goto out_neigh_release;
+ }
+ }
+
+ /* Device specific setup. */
+ if (n->parms->neigh_setup &&
+ (error = n->parms->neigh_setup(n)) < 0) {
+ rc = ERR_PTR(error);
+ goto out_neigh_release;
+ }
+
+ n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1);
+
+ write_lock_bh(&tbl->lock);
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+
+ if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))
+ nht = neigh_hash_grow(tbl, nht->hash_shift + 1);
+
+ hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
+
+ if (n->parms->dead) {
+ rc = ERR_PTR(-EINVAL);
+ goto out_tbl_unlock;
+ }
+
+ for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],
+ lockdep_is_held(&tbl->lock));
+ n1 != NULL;
+ n1 = rcu_dereference_protected(n1->next,
+ lockdep_is_held(&tbl->lock))) {
+ if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) {
+ if (want_ref)
+ neigh_hold(n1);
+ rc = n1;
+ goto out_tbl_unlock;
+ }
+ }
+
+ n->dead = 0;
+ if (want_ref)
+ neigh_hold(n);
+ rcu_assign_pointer(n->next,
+ rcu_dereference_protected(nht->hash_buckets[hash_val],
+ lockdep_is_held(&tbl->lock)));
+ rcu_assign_pointer(nht->hash_buckets[hash_val], n);
+ write_unlock_bh(&tbl->lock);
+ neigh_dbg(2, "neigh %p is created\n", n);
+ rc = n;
+out:
+ return rc;
+out_tbl_unlock:
+ write_unlock_bh(&tbl->lock);
+out_neigh_release:
+ neigh_release(n);
+ goto out;
+}
+EXPORT_SYMBOL(__neigh_create);
+
+static u32 pneigh_hash(const void *pkey, unsigned int key_len)
+{
+ u32 hash_val = *(u32 *)(pkey + key_len - 4);
+ hash_val ^= (hash_val >> 16);
+ hash_val ^= hash_val >> 8;
+ hash_val ^= hash_val >> 4;
+ hash_val &= PNEIGH_HASHMASK;
+ return hash_val;
+}
+
+static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n,
+ struct net *net,
+ const void *pkey,
+ unsigned int key_len,
+ struct net_device *dev)
+{
+ while (n) {
+ if (!memcmp(n->key, pkey, key_len) &&
+ net_eq(pneigh_net(n), net) &&
+ (n->dev == dev || !n->dev))
+ return n;
+ n = n->next;
+ }
+ return NULL;
+}
+
+struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl,
+ struct net *net, const void *pkey, struct net_device *dev)
+{
+ unsigned int key_len = tbl->key_len;
+ u32 hash_val = pneigh_hash(pkey, key_len);
+
+ return __pneigh_lookup_1(tbl->phash_buckets[hash_val],
+ net, pkey, key_len, dev);
+}
+EXPORT_SYMBOL_GPL(__pneigh_lookup);
+
+struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
+ struct net *net, const void *pkey,
+ struct net_device *dev, int creat)
+{
+ struct pneigh_entry *n;
+ unsigned int key_len = tbl->key_len;
+ u32 hash_val = pneigh_hash(pkey, key_len);
+
+ read_lock_bh(&tbl->lock);
+ n = __pneigh_lookup_1(tbl->phash_buckets[hash_val],
+ net, pkey, key_len, dev);
+ read_unlock_bh(&tbl->lock);
+
+ if (n || !creat)
+ goto out;
+
+ ASSERT_RTNL();
+
+ n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL);
+ if (!n)
+ goto out;
+
+ write_pnet(&n->net, net);
+ memcpy(n->key, pkey, key_len);
+ n->dev = dev;
+ if (dev)
+ dev_hold(dev);
+
+ if (tbl->pconstructor && tbl->pconstructor(n)) {
+ if (dev)
+ dev_put(dev);
+ kfree(n);
+ n = NULL;
+ goto out;
+ }
+
+ write_lock_bh(&tbl->lock);
+ n->next = tbl->phash_buckets[hash_val];
+ tbl->phash_buckets[hash_val] = n;
+ write_unlock_bh(&tbl->lock);
+out:
+ return n;
+}
+EXPORT_SYMBOL(pneigh_lookup);
+
+
+int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
+ struct net_device *dev)
+{
+ struct pneigh_entry *n, **np;
+ unsigned int key_len = tbl->key_len;
+ u32 hash_val = pneigh_hash(pkey, key_len);
+
+ write_lock_bh(&tbl->lock);
+ for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL;
+ np = &n->next) {
+ if (!memcmp(n->key, pkey, key_len) && n->dev == dev &&
+ net_eq(pneigh_net(n), net)) {
+ *np = n->next;
+ write_unlock_bh(&tbl->lock);
+ if (tbl->pdestructor)
+ tbl->pdestructor(n);
+ if (n->dev)
+ dev_put(n->dev);
+ kfree(n);
+ return 0;
+ }
+ }
+ write_unlock_bh(&tbl->lock);
+ return -ENOENT;
+}
+
+static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
+ struct net_device *dev)
+{
+ struct pneigh_entry *n, **np, *freelist = NULL;
+ u32 h;
+
+ for (h = 0; h <= PNEIGH_HASHMASK; h++) {
+ np = &tbl->phash_buckets[h];
+ while ((n = *np) != NULL) {
+ if (!dev || n->dev == dev) {
+ *np = n->next;
+ n->next = freelist;
+ freelist = n;
+ continue;
+ }
+ np = &n->next;
+ }
+ }
+ write_unlock_bh(&tbl->lock);
+ while ((n = freelist)) {
+ freelist = n->next;
+ n->next = NULL;
+ if (tbl->pdestructor)
+ tbl->pdestructor(n);
+ if (n->dev)
+ dev_put(n->dev);
+ kfree(n);
+ }
+ return -ENOENT;
+}
+
+static void neigh_parms_destroy(struct neigh_parms *parms);
+
+static inline void neigh_parms_put(struct neigh_parms *parms)
+{
+ if (refcount_dec_and_test(&parms->refcnt))
+ neigh_parms_destroy(parms);
+}
+
+/*
+ *	The neighbour must already have been removed from the table.
+ */
+void neigh_destroy(struct neighbour *neigh)
+{
+ struct net_device *dev = neigh->dev;
+
+ NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);
+
+ if (!neigh->dead) {
+ pr_warn("Destroying alive neighbour %p\n", neigh);
+ dump_stack();
+ return;
+ }
+
+ if (neigh_del_timer(neigh))
+ pr_warn("Impossible event\n");
+
+ write_lock_bh(&neigh->lock);
+ __skb_queue_purge(&neigh->arp_queue);
+ write_unlock_bh(&neigh->lock);
+ neigh->arp_queue_len_bytes = 0;
+
+ if (dev->netdev_ops->ndo_neigh_destroy)
+ dev->netdev_ops->ndo_neigh_destroy(dev, neigh);
+
+ dev_put(dev);
+ neigh_parms_put(neigh->parms);
+
+ neigh_dbg(2, "neigh %p is destroyed\n", neigh);
+
+ atomic_dec(&neigh->tbl->entries);
+ kfree_rcu(neigh, rcu);
+}
+EXPORT_SYMBOL(neigh_destroy);
+
+/* Neighbour state is suspicious;
+   disable the fast path.
+
+   Called with neigh->lock held for writing.
+ */
+static void neigh_suspect(struct neighbour *neigh)
+{
+ neigh_dbg(2, "neigh %p is suspected\n", neigh);
+
+ neigh->output = neigh->ops->output;
+}
+
+/* Neighbour state is OK;
+   enable the fast path.
+
+   Called with neigh->lock held for writing.
+ */
+static void neigh_connect(struct neighbour *neigh)
+{
+ neigh_dbg(2, "neigh %p is connected\n", neigh);
+
+ neigh->output = neigh->ops->connected_output;
+}
+
+static void neigh_periodic_work(struct work_struct *work)
+{
+ struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
+ struct neighbour *n;
+ struct neighbour __rcu **np;
+ unsigned int i;
+ struct neigh_hash_table *nht;
+
+ NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
+
+ write_lock_bh(&tbl->lock);
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+
+ /*
+ * periodically recompute ReachableTime from random function
+ */
+
+ if (time_after(jiffies, tbl->last_rand + 300 * HZ)) {
+ struct neigh_parms *p;
+ tbl->last_rand = jiffies;
+ list_for_each_entry(p, &tbl->parms_list, list)
+ p->reachable_time =
+ neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+ }
+
+ if (atomic_read(&tbl->entries) < tbl->gc_thresh1)
+ goto out;
+
+ for (i = 0 ; i < (1 << nht->hash_shift); i++) {
+ np = &nht->hash_buckets[i];
+
+ while ((n = rcu_dereference_protected(*np,
+ lockdep_is_held(&tbl->lock))) != NULL) {
+ unsigned int state;
+
+ write_lock(&n->lock);
+
+ state = n->nud_state;
+ if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) ||
+ (n->flags & NTF_EXT_LEARNED)) {
+ write_unlock(&n->lock);
+ goto next_elt;
+ }
+
+ if (time_before(n->used, n->confirmed))
+ n->used = n->confirmed;
+
+ if (refcount_read(&n->refcnt) == 1 &&
+ (state == NUD_FAILED ||
+ time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
+ *np = n->next;
+ n->dead = 1;
+ write_unlock(&n->lock);
+ neigh_cleanup_and_release(n);
+ continue;
+ }
+ write_unlock(&n->lock);
+
+next_elt:
+ np = &n->next;
+ }
+ /*
+ * It's fine to release lock here, even if hash table
+ * grows while we are preempted.
+ */
+ write_unlock_bh(&tbl->lock);
+ cond_resched();
+ write_lock_bh(&tbl->lock);
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+ }
+out:
+ /* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks.
+ * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2
+ * BASE_REACHABLE_TIME.
+ */
+ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
+ NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1);
+ write_unlock_bh(&tbl->lock);
+}
+
+static __inline__ int neigh_max_probes(struct neighbour *n)
+{
+ struct neigh_parms *p = n->parms;
+ return NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) +
+ (n->nud_state & NUD_PROBE ? NEIGH_VAR(p, MCAST_REPROBES) :
+ NEIGH_VAR(p, MCAST_PROBES));
+}
+
+static void neigh_invalidate(struct neighbour *neigh)
+ __releases(neigh->lock)
+ __acquires(neigh->lock)
+{
+ struct sk_buff *skb;
+
+ NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed);
+ neigh_dbg(2, "neigh %p is failed\n", neigh);
+ neigh->updated = jiffies;
+
+	/* This is a delicate spot: the error_report() callback is a
+	   complicated routine and may, in particular, end up touching
+	   this very neighbour entry again.
+
+	   So be careful here and avoid an infinite loop. --ANK
+	 */
+ while (neigh->nud_state == NUD_FAILED &&
+ (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
+ write_unlock(&neigh->lock);
+ neigh->ops->error_report(neigh, skb);
+ write_lock(&neigh->lock);
+ }
+ __skb_queue_purge(&neigh->arp_queue);
+ neigh->arp_queue_len_bytes = 0;
+}
+
+static void neigh_probe(struct neighbour *neigh)
+ __releases(neigh->lock)
+{
+ struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue);
+ /* keep skb alive even if arp_queue overflows */
+ if (skb)
+ skb = skb_clone(skb, GFP_ATOMIC);
+ write_unlock(&neigh->lock);
+ if (neigh->ops->solicit)
+ neigh->ops->solicit(neigh, skb);
+ atomic_inc(&neigh->probes);
+ kfree_skb(skb);
+}
+
+/* Called when a timer expires for a neighbour entry. */
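+/* The timer drives the NUD state machine: REACHABLE entries fall back to
+ * DELAY or STALE once their confirmation ages out, DELAY either returns to
+ * REACHABLE or advances to PROBE, and INCOMPLETE/PROBE entries are
+ * re-solicited until neigh_max_probes() is exceeded, at which point the
+ * entry becomes FAILED and its queued packets are flushed.
+ */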
+
+static void neigh_timer_handler(struct timer_list *t)
+{
+ unsigned long now, next;
+ struct neighbour *neigh = from_timer(neigh, t, timer);
+ unsigned int state;
+ int notify = 0;
+
+ write_lock(&neigh->lock);
+
+ state = neigh->nud_state;
+ now = jiffies;
+ next = now + HZ;
+
+ if (!(state & NUD_IN_TIMER))
+ goto out;
+
+ if (state & NUD_REACHABLE) {
+ if (time_before_eq(now,
+ neigh->confirmed + neigh->parms->reachable_time)) {
+ neigh_dbg(2, "neigh %p is still alive\n", neigh);
+ next = neigh->confirmed + neigh->parms->reachable_time;
+ } else if (time_before_eq(now,
+ neigh->used +
+ NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
+ neigh_dbg(2, "neigh %p is delayed\n", neigh);
+ neigh->nud_state = NUD_DELAY;
+ neigh->updated = jiffies;
+ neigh_suspect(neigh);
+ next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME);
+ } else {
+ neigh_dbg(2, "neigh %p is suspected\n", neigh);
+ neigh->nud_state = NUD_STALE;
+ neigh->updated = jiffies;
+ neigh_suspect(neigh);
+ notify = 1;
+ }
+ } else if (state & NUD_DELAY) {
+ if (time_before_eq(now,
+ neigh->confirmed +
+ NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
+ neigh_dbg(2, "neigh %p is now reachable\n", neigh);
+ neigh->nud_state = NUD_REACHABLE;
+ neigh->updated = jiffies;
+ neigh_connect(neigh);
+ notify = 1;
+ next = neigh->confirmed + neigh->parms->reachable_time;
+ } else {
+ neigh_dbg(2, "neigh %p is probed\n", neigh);
+ neigh->nud_state = NUD_PROBE;
+ neigh->updated = jiffies;
+ atomic_set(&neigh->probes, 0);
+ notify = 1;
+ next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
+ }
+ } else {
+ /* NUD_PROBE|NUD_INCOMPLETE */
+ next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
+ }
+
+ if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
+ atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) {
+ neigh->nud_state = NUD_FAILED;
+ notify = 1;
+ neigh_invalidate(neigh);
+ goto out;
+ }
+
+ if (neigh->nud_state & NUD_IN_TIMER) {
+ if (time_before(next, jiffies + HZ/2))
+ next = jiffies + HZ/2;
+ if (!mod_timer(&neigh->timer, next))
+ neigh_hold(neigh);
+ }
+ if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
+ neigh_probe(neigh);
+ } else {
+out:
+ write_unlock(&neigh->lock);
+ }
+
+ if (notify)
+ neigh_update_notify(neigh, 0);
+
+ neigh_release(neigh);
+}
+
+int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
+{
+ int rc;
+ bool immediate_probe = false;
+
+ write_lock_bh(&neigh->lock);
+
+ rc = 0;
+ if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
+ goto out_unlock_bh;
+ if (neigh->dead)
+ goto out_dead;
+
+ if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
+ if (NEIGH_VAR(neigh->parms, MCAST_PROBES) +
+ NEIGH_VAR(neigh->parms, APP_PROBES)) {
+ unsigned long next, now = jiffies;
+
+ atomic_set(&neigh->probes,
+ NEIGH_VAR(neigh->parms, UCAST_PROBES));
+ neigh_del_timer(neigh);
+ neigh->nud_state = NUD_INCOMPLETE;
+ neigh->updated = now;
+ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
+ HZ/2);
+ neigh_add_timer(neigh, next);
+ immediate_probe = true;
+ } else {
+ neigh->nud_state = NUD_FAILED;
+ neigh->updated = jiffies;
+ write_unlock_bh(&neigh->lock);
+
+ kfree_skb(skb);
+ return 1;
+ }
+ } else if (neigh->nud_state & NUD_STALE) {
+ neigh_dbg(2, "neigh %p is delayed\n", neigh);
+ neigh_del_timer(neigh);
+ neigh->nud_state = NUD_DELAY;
+ neigh->updated = jiffies;
+ neigh_add_timer(neigh, jiffies +
+ NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME));
+ }
+
+ if (neigh->nud_state == NUD_INCOMPLETE) {
+ if (skb) {
+ while (neigh->arp_queue_len_bytes + skb->truesize >
+ NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) {
+ struct sk_buff *buff;
+
+ buff = __skb_dequeue(&neigh->arp_queue);
+ if (!buff)
+ break;
+ neigh->arp_queue_len_bytes -= buff->truesize;
+ kfree_skb(buff);
+ NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
+ }
+ skb_dst_force(skb);
+ __skb_queue_tail(&neigh->arp_queue, skb);
+ neigh->arp_queue_len_bytes += skb->truesize;
+ }
+ rc = 1;
+ }
+out_unlock_bh:
+ if (immediate_probe)
+ neigh_probe(neigh);
+ else
+ write_unlock(&neigh->lock);
+ local_bh_enable();
+ return rc;
+
+out_dead:
+ if (neigh->nud_state & NUD_STALE)
+ goto out_unlock_bh;
+ write_unlock_bh(&neigh->lock);
+ kfree_skb(skb);
+ return 1;
+}
+EXPORT_SYMBOL(__neigh_event_send);
+
+static void neigh_update_hhs(struct neighbour *neigh)
+{
+ struct hh_cache *hh;
+ void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *)
+ = NULL;
+
+ if (neigh->dev->header_ops)
+ update = neigh->dev->header_ops->cache_update;
+
+ if (update) {
+ hh = &neigh->hh;
+ if (READ_ONCE(hh->hh_len)) {
+ write_seqlock_bh(&hh->hh_lock);
+ update(hh, neigh->dev, neigh->ha);
+ write_sequnlock_bh(&hh->hh_lock);
+ }
+ }
+}
+
+
+
+/* Generic update routine.
+   -- lladdr is the new lladdr, or NULL if none is supplied.
+   -- new    is the new state.
+   -- flags
+	NEIGH_UPDATE_F_OVERRIDE allows overriding the existing lladdr
+				if it differs.
+	NEIGH_UPDATE_F_WEAK_OVERRIDE marks an existing "connected" lladdr
+				as suspect instead of overriding it
+				if it differs.
+	NEIGH_UPDATE_F_ADMIN	means that the change is administrative.
+
+	NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows overriding the existing
+				NTF_ROUTER flag.
+	NEIGH_UPDATE_F_ISROUTER	indicates that the neighbour is known to
+				be a router.
+
+   The caller MUST hold a reference on the entry.
+ */
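+/* For reference: neigh_event_ns() below calls this with NUD_STALE and
+ * NEIGH_UPDATE_F_OVERRIDE when a probe reveals a link-layer address, and the
+ * netlink handlers neigh_add()/neigh_delete() always pass
+ * NEIGH_UPDATE_F_ADMIN for administrative changes.
+ */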
+
+int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
+ u32 flags, u32 nlmsg_pid)
+{
+ u8 old;
+ int err;
+ int notify = 0;
+ struct net_device *dev;
+ int update_isrouter = 0;
+
+ write_lock_bh(&neigh->lock);
+
+ dev = neigh->dev;
+ old = neigh->nud_state;
+ err = -EPERM;
+
+ if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
+ (old & (NUD_NOARP | NUD_PERMANENT)))
+ goto out;
+ if (neigh->dead)
+ goto out;
+
+ neigh_update_ext_learned(neigh, flags, &notify);
+
+ if (!(new & NUD_VALID)) {
+ neigh_del_timer(neigh);
+ if (old & NUD_CONNECTED)
+ neigh_suspect(neigh);
+ neigh->nud_state = new;
+ err = 0;
+ notify = old & NUD_VALID;
+ if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&
+ (new & NUD_FAILED)) {
+ neigh_invalidate(neigh);
+ notify = 1;
+ }
+ goto out;
+ }
+
+ /* Compare new lladdr with cached one */
+ if (!dev->addr_len) {
+ /* First case: device needs no address. */
+ lladdr = neigh->ha;
+ } else if (lladdr) {
+ /* The second case: if something is already cached
+ and a new address is proposed:
+ - compare new & old
+ - if they are different, check override flag
+ */
+ if ((old & NUD_VALID) &&
+ !memcmp(lladdr, neigh->ha, dev->addr_len))
+ lladdr = neigh->ha;
+ } else {
+ /* No address is supplied; if we know something,
+ use it, otherwise discard the request.
+ */
+ err = -EINVAL;
+ if (!(old & NUD_VALID))
+ goto out;
+ lladdr = neigh->ha;
+ }
+
+	/* Update the confirmed timestamp of the neighbour entry when we
+	 * receive an ARP packet, even if it does not change the IP-to-MAC
+	 * binding.
+	 */
+ if (new & NUD_CONNECTED)
+ neigh->confirmed = jiffies;
+
+	/* If the entry was valid and the address has not changed,
+	   do not change the entry state when the new state is STALE.
+	 */
+ err = 0;
+ update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER;
+ if (old & NUD_VALID) {
+ if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) {
+ update_isrouter = 0;
+ if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) &&
+ (old & NUD_CONNECTED)) {
+ lladdr = neigh->ha;
+ new = NUD_STALE;
+ } else
+ goto out;
+ } else {
+ if (lladdr == neigh->ha && new == NUD_STALE &&
+ !(flags & NEIGH_UPDATE_F_ADMIN))
+ new = old;
+ }
+ }
+
+	/* Update the timestamp only once we know we will make a change to
+	 * the neighbour entry. Otherwise we risk moving the locktime window
+	 * with no-op updates and ignoring relevant ARP updates.
+	 */
+ if (new != old || lladdr != neigh->ha)
+ neigh->updated = jiffies;
+
+ if (new != old) {
+ neigh_del_timer(neigh);
+ if (new & NUD_PROBE)
+ atomic_set(&neigh->probes, 0);
+ if (new & NUD_IN_TIMER)
+ neigh_add_timer(neigh, (jiffies +
+ ((new & NUD_REACHABLE) ?
+ neigh->parms->reachable_time :
+ 0)));
+ neigh->nud_state = new;
+ notify = 1;
+ }
+
+ if (lladdr != neigh->ha) {
+ write_seqlock(&neigh->ha_lock);
+ memcpy(&neigh->ha, lladdr, dev->addr_len);
+ write_sequnlock(&neigh->ha_lock);
+ neigh_update_hhs(neigh);
+ if (!(new & NUD_CONNECTED))
+ neigh->confirmed = jiffies -
+ (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1);
+ notify = 1;
+ }
+ if (new == old)
+ goto out;
+ if (new & NUD_CONNECTED)
+ neigh_connect(neigh);
+ else
+ neigh_suspect(neigh);
+ if (!(old & NUD_VALID)) {
+ struct sk_buff *skb;
+
+		/* Again: avoid an infinite loop if something went wrong */
+
+ while (neigh->nud_state & NUD_VALID &&
+ (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
+ struct dst_entry *dst = skb_dst(skb);
+ struct neighbour *n2, *n1 = neigh;
+ write_unlock_bh(&neigh->lock);
+
+ rcu_read_lock();
+
+ /* Why not just use 'neigh' as-is? The problem is that
+ * things such as shaper, eql, and sch_teql can end up
+ * using alternative, different, neigh objects to output
+ * the packet in the output path. So what we need to do
+ * here is re-lookup the top-level neigh in the path so
+ * we can reinject the packet there.
+ */
+ n2 = NULL;
+ if (dst && dst->obsolete != DST_OBSOLETE_DEAD) {
+ n2 = dst_neigh_lookup_skb(dst, skb);
+ if (n2)
+ n1 = n2;
+ }
+ n1->output(n1, skb);
+ if (n2)
+ neigh_release(n2);
+ rcu_read_unlock();
+
+ write_lock_bh(&neigh->lock);
+ }
+ __skb_queue_purge(&neigh->arp_queue);
+ neigh->arp_queue_len_bytes = 0;
+ }
+out:
+ if (update_isrouter) {
+ neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ?
+ (neigh->flags | NTF_ROUTER) :
+ (neigh->flags & ~NTF_ROUTER);
+ }
+ write_unlock_bh(&neigh->lock);
+
+ if (notify)
+ neigh_update_notify(neigh, nlmsg_pid);
+
+ return err;
+}
+EXPORT_SYMBOL(neigh_update);
+
+/* Update the neigh to listen temporarily for probe responses, even if it is
+ * in a NUD_FAILED state. The caller has to hold neigh->lock for writing.
+ */
+void __neigh_set_probe_once(struct neighbour *neigh)
+{
+ if (neigh->dead)
+ return;
+ neigh->updated = jiffies;
+ if (!(neigh->nud_state & NUD_FAILED))
+ return;
+ neigh->nud_state = NUD_INCOMPLETE;
+ atomic_set(&neigh->probes, neigh_max_probes(neigh));
+ neigh_add_timer(neigh,
+ jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME));
+}
+EXPORT_SYMBOL(__neigh_set_probe_once);
+
+struct neighbour *neigh_event_ns(struct neigh_table *tbl,
+ u8 *lladdr, void *saddr,
+ struct net_device *dev)
+{
+ struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev,
+ lladdr || !dev->addr_len);
+ if (neigh)
+ neigh_update(neigh, lladdr, NUD_STALE,
+ NEIGH_UPDATE_F_OVERRIDE, 0);
+ return neigh;
+}
+EXPORT_SYMBOL(neigh_event_ns);
+
+/* Takes n->lock for writing to initialize the hh_cache entry exactly once. */
+static void neigh_hh_init(struct neighbour *n)
+{
+ struct net_device *dev = n->dev;
+ __be16 prot = n->tbl->protocol;
+ struct hh_cache *hh = &n->hh;
+
+ write_lock_bh(&n->lock);
+
+ /* Only one thread can come in here and initialize the
+ * hh_cache entry.
+ */
+ if (!hh->hh_len)
+ dev->header_ops->cache(n, hh, prot);
+
+ write_unlock_bh(&n->lock);
+}
+
+/* Slow and careful. */
+
+int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
+{
+ int rc = 0;
+
+ if (!neigh_event_send(neigh, skb)) {
+ int err;
+ struct net_device *dev = neigh->dev;
+ unsigned int seq;
+
+ if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len))
+ neigh_hh_init(neigh);
+
+ do {
+ __skb_pull(skb, skb_network_offset(skb));
+ seq = read_seqbegin(&neigh->ha_lock);
+ err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+ neigh->ha, NULL, skb->len);
+ } while (read_seqretry(&neigh->ha_lock, seq));
+
+ if (err >= 0)
+ rc = dev_queue_xmit(skb);
+ else
+ goto out_kfree_skb;
+ }
+out:
+ return rc;
+out_kfree_skb:
+ rc = -EINVAL;
+ kfree_skb(skb);
+ goto out;
+}
+EXPORT_SYMBOL(neigh_resolve_output);
+
+/* As fast as possible without hh cache */
+
+int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb)
+{
+ struct net_device *dev = neigh->dev;
+ unsigned int seq;
+ int err;
+
+ do {
+ __skb_pull(skb, skb_network_offset(skb));
+ seq = read_seqbegin(&neigh->ha_lock);
+ err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+ neigh->ha, NULL, skb->len);
+ } while (read_seqretry(&neigh->ha_lock, seq));
+
+ if (err >= 0)
+ err = dev_queue_xmit(skb);
+ else {
+ err = -EINVAL;
+ kfree_skb(skb);
+ }
+ return err;
+}
+EXPORT_SYMBOL(neigh_connected_output);
+
+int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb)
+{
+ return dev_queue_xmit(skb);
+}
+EXPORT_SYMBOL(neigh_direct_output);
+
+static void neigh_proxy_process(struct timer_list *t)
+{
+ struct neigh_table *tbl = from_timer(tbl, t, proxy_timer);
+ long sched_next = 0;
+ unsigned long now = jiffies;
+ struct sk_buff *skb, *n;
+
+ spin_lock(&tbl->proxy_queue.lock);
+
+ skb_queue_walk_safe(&tbl->proxy_queue, skb, n) {
+ long tdif = NEIGH_CB(skb)->sched_next - now;
+
+ if (tdif <= 0) {
+ struct net_device *dev = skb->dev;
+
+ __skb_unlink(skb, &tbl->proxy_queue);
+ if (tbl->proxy_redo && netif_running(dev)) {
+ rcu_read_lock();
+ tbl->proxy_redo(skb);
+ rcu_read_unlock();
+ } else {
+ kfree_skb(skb);
+ }
+
+ dev_put(dev);
+ } else if (!sched_next || tdif < sched_next)
+ sched_next = tdif;
+ }
+ del_timer(&tbl->proxy_timer);
+ if (sched_next)
+ mod_timer(&tbl->proxy_timer, jiffies + sched_next);
+ spin_unlock(&tbl->proxy_queue.lock);
+}
+
+void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
+ struct sk_buff *skb)
+{
+ unsigned long now = jiffies;
+
+ unsigned long sched_next = now + (prandom_u32() %
+ NEIGH_VAR(p, PROXY_DELAY));
+
+ if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) {
+ kfree_skb(skb);
+ return;
+ }
+
+ NEIGH_CB(skb)->sched_next = sched_next;
+ NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED;
+
+ spin_lock(&tbl->proxy_queue.lock);
+ if (del_timer(&tbl->proxy_timer)) {
+ if (time_before(tbl->proxy_timer.expires, sched_next))
+ sched_next = tbl->proxy_timer.expires;
+ }
+ skb_dst_drop(skb);
+ dev_hold(skb->dev);
+ __skb_queue_tail(&tbl->proxy_queue, skb);
+ mod_timer(&tbl->proxy_timer, sched_next);
+ spin_unlock(&tbl->proxy_queue.lock);
+}
+EXPORT_SYMBOL(pneigh_enqueue);
+
+static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl,
+ struct net *net, int ifindex)
+{
+ struct neigh_parms *p;
+
+ list_for_each_entry(p, &tbl->parms_list, list) {
+ if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) ||
+ (!p->dev && !ifindex && net_eq(net, &init_net)))
+ return p;
+ }
+
+ return NULL;
+}
+
+struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
+ struct neigh_table *tbl)
+{
+ struct neigh_parms *p;
+ struct net *net = dev_net(dev);
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL);
+ if (p) {
+ p->tbl = tbl;
+ refcount_set(&p->refcnt, 1);
+ p->reachable_time =
+ neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+ dev_hold(dev);
+ p->dev = dev;
+ write_pnet(&p->net, net);
+ p->sysctl_table = NULL;
+
+ if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) {
+ dev_put(dev);
+ kfree(p);
+ return NULL;
+ }
+
+ write_lock_bh(&tbl->lock);
+ list_add(&p->list, &tbl->parms.list);
+ write_unlock_bh(&tbl->lock);
+
+ neigh_parms_data_state_cleanall(p);
+ }
+ return p;
+}
+EXPORT_SYMBOL(neigh_parms_alloc);
+
+static void neigh_rcu_free_parms(struct rcu_head *head)
+{
+ struct neigh_parms *parms =
+ container_of(head, struct neigh_parms, rcu_head);
+
+ neigh_parms_put(parms);
+}
+
+void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
+{
+ if (!parms || parms == &tbl->parms)
+ return;
+ write_lock_bh(&tbl->lock);
+ list_del(&parms->list);
+ parms->dead = 1;
+ write_unlock_bh(&tbl->lock);
+ if (parms->dev)
+ dev_put(parms->dev);
+ call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
+}
+EXPORT_SYMBOL(neigh_parms_release);
+
+static void neigh_parms_destroy(struct neigh_parms *parms)
+{
+ kfree(parms);
+}
+
+static struct lock_class_key neigh_table_proxy_queue_class;
+
+static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly;
+
+void neigh_table_init(int index, struct neigh_table *tbl)
+{
+ unsigned long now = jiffies;
+ unsigned long phsize;
+
+ INIT_LIST_HEAD(&tbl->parms_list);
+ list_add(&tbl->parms.list, &tbl->parms_list);
+ write_pnet(&tbl->parms.net, &init_net);
+ refcount_set(&tbl->parms.refcnt, 1);
+ tbl->parms.reachable_time =
+ neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME));
+
+ tbl->stats = alloc_percpu(struct neigh_statistics);
+ if (!tbl->stats)
+ panic("cannot create neighbour cache statistics");
+
+#ifdef CONFIG_PROC_FS
+ if (!proc_create_seq_data(tbl->id, 0, init_net.proc_net_stat,
+ &neigh_stat_seq_ops, tbl))
+ panic("cannot create neighbour proc dir entry");
+#endif
+
+ RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3));
+
+ phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
+ tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);
+
+ if (!tbl->nht || !tbl->phash_buckets)
+ panic("cannot allocate neighbour cache hashes");
+
+ if (!tbl->entry_size)
+ tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) +
+ tbl->key_len, NEIGH_PRIV_ALIGN);
+ else
+ WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN);
+
+ rwlock_init(&tbl->lock);
+ INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);
+ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
+ tbl->parms.reachable_time);
+ timer_setup(&tbl->proxy_timer, neigh_proxy_process, 0);
+ skb_queue_head_init_class(&tbl->proxy_queue,
+ &neigh_table_proxy_queue_class);
+
+ tbl->last_flush = now;
+ tbl->last_rand = now + tbl->parms.reachable_time * 20;
+
+ neigh_tables[index] = tbl;
+}
+EXPORT_SYMBOL(neigh_table_init);
+
+int neigh_table_clear(int index, struct neigh_table *tbl)
+{
+ neigh_tables[index] = NULL;
+	/* This is not clean... fix it so the IPv6 module can be unloaded safely */
+ cancel_delayed_work_sync(&tbl->gc_work);
+ del_timer_sync(&tbl->proxy_timer);
+ pneigh_queue_purge(&tbl->proxy_queue);
+ neigh_ifdown(tbl, NULL);
+ if (atomic_read(&tbl->entries))
+ pr_crit("neighbour leakage\n");
+
+ call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu,
+ neigh_hash_free_rcu);
+ tbl->nht = NULL;
+
+ kfree(tbl->phash_buckets);
+ tbl->phash_buckets = NULL;
+
+ remove_proc_entry(tbl->id, init_net.proc_net_stat);
+
+ free_percpu(tbl->stats);
+ tbl->stats = NULL;
+
+ return 0;
+}
+EXPORT_SYMBOL(neigh_table_clear);
+
+static struct neigh_table *neigh_find_table(int family)
+{
+ struct neigh_table *tbl = NULL;
+
+ switch (family) {
+ case AF_INET:
+ tbl = neigh_tables[NEIGH_ARP_TABLE];
+ break;
+ case AF_INET6:
+ tbl = neigh_tables[NEIGH_ND_TABLE];
+ break;
+ case AF_DECnet:
+ tbl = neigh_tables[NEIGH_DN_TABLE];
+ break;
+ }
+
+ return tbl;
+}
+
+static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ndmsg *ndm;
+ struct nlattr *dst_attr;
+ struct neigh_table *tbl;
+ struct neighbour *neigh;
+ struct net_device *dev = NULL;
+ int err = -EINVAL;
+
+ ASSERT_RTNL();
+ if (nlmsg_len(nlh) < sizeof(*ndm))
+ goto out;
+
+ dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST);
+ if (dst_attr == NULL)
+ goto out;
+
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_ifindex) {
+ dev = __dev_get_by_index(net, ndm->ndm_ifindex);
+ if (dev == NULL) {
+ err = -ENODEV;
+ goto out;
+ }
+ }
+
+ tbl = neigh_find_table(ndm->ndm_family);
+ if (tbl == NULL)
+ return -EAFNOSUPPORT;
+
+ if (nla_len(dst_attr) < (int)tbl->key_len)
+ goto out;
+
+ if (ndm->ndm_flags & NTF_PROXY) {
+ err = pneigh_delete(tbl, net, nla_data(dst_attr), dev);
+ goto out;
+ }
+
+ if (dev == NULL)
+ goto out;
+
+ neigh = neigh_lookup(tbl, nla_data(dst_attr), dev);
+ if (neigh == NULL) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ err = neigh_update(neigh, NULL, NUD_FAILED,
+ NEIGH_UPDATE_F_OVERRIDE |
+ NEIGH_UPDATE_F_ADMIN,
+ NETLINK_CB(skb).portid);
+ write_lock_bh(&tbl->lock);
+ neigh_release(neigh);
+ neigh_remove_one(neigh, tbl);
+ write_unlock_bh(&tbl->lock);
+
+out:
+ return err;
+}
+
+static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE;
+ struct net *net = sock_net(skb->sk);
+ struct ndmsg *ndm;
+ struct nlattr *tb[NDA_MAX+1];
+ struct neigh_table *tbl;
+ struct net_device *dev = NULL;
+ struct neighbour *neigh;
+ void *dst, *lladdr;
+ int err;
+
+ ASSERT_RTNL();
+ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack);
+ if (err < 0)
+ goto out;
+
+ err = -EINVAL;
+ if (tb[NDA_DST] == NULL)
+ goto out;
+
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_ifindex) {
+ dev = __dev_get_by_index(net, ndm->ndm_ifindex);
+ if (dev == NULL) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len)
+ goto out;
+ }
+
+ tbl = neigh_find_table(ndm->ndm_family);
+ if (tbl == NULL)
+ return -EAFNOSUPPORT;
+
+ if (nla_len(tb[NDA_DST]) < (int)tbl->key_len)
+ goto out;
+ dst = nla_data(tb[NDA_DST]);
+ lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
+
+ if (ndm->ndm_flags & NTF_PROXY) {
+ struct pneigh_entry *pn;
+
+ err = -ENOBUFS;
+ pn = pneigh_lookup(tbl, net, dst, dev, 1);
+ if (pn) {
+ pn->flags = ndm->ndm_flags;
+ err = 0;
+ }
+ goto out;
+ }
+
+ if (dev == NULL)
+ goto out;
+
+ neigh = neigh_lookup(tbl, dst, dev);
+ if (neigh == NULL) {
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ neigh = __neigh_lookup_errno(tbl, dst, dev);
+ if (IS_ERR(neigh)) {
+ err = PTR_ERR(neigh);
+ goto out;
+ }
+ } else {
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ err = -EEXIST;
+ neigh_release(neigh);
+ goto out;
+ }
+
+ if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
+ flags &= ~NEIGH_UPDATE_F_OVERRIDE;
+ }
+
+ if (ndm->ndm_flags & NTF_EXT_LEARNED)
+ flags |= NEIGH_UPDATE_F_EXT_LEARNED;
+
+ if (ndm->ndm_flags & NTF_USE) {
+ neigh_event_send(neigh, NULL);
+ err = 0;
+ } else
+ err = neigh_update(neigh, lladdr, ndm->ndm_state, flags,
+ NETLINK_CB(skb).portid);
+ neigh_release(neigh);
+
+out:
+ return err;
+}
+
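+/* Emit the nested NDTA_PARMS attribute describing one neigh_parms instance (table default or per-device). */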
+static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, NDTA_PARMS);
+ if (nest == NULL)
+ return -ENOBUFS;
+
+ if ((parms->dev &&
+ nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) ||
+ nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) ||
+ nla_put_u32(skb, NDTPA_QUEUE_LENBYTES,
+ NEIGH_VAR(parms, QUEUE_LEN_BYTES)) ||
+ /* approximative value for deprecated QUEUE_LEN (in packets) */
+ nla_put_u32(skb, NDTPA_QUEUE_LEN,
+ NEIGH_VAR(parms, QUEUE_LEN_BYTES) / SKB_TRUESIZE(ETH_FRAME_LEN)) ||
+ nla_put_u32(skb, NDTPA_PROXY_QLEN, NEIGH_VAR(parms, PROXY_QLEN)) ||
+ nla_put_u32(skb, NDTPA_APP_PROBES, NEIGH_VAR(parms, APP_PROBES)) ||
+ nla_put_u32(skb, NDTPA_UCAST_PROBES,
+ NEIGH_VAR(parms, UCAST_PROBES)) ||
+ nla_put_u32(skb, NDTPA_MCAST_PROBES,
+ NEIGH_VAR(parms, MCAST_PROBES)) ||
+ nla_put_u32(skb, NDTPA_MCAST_REPROBES,
+ NEIGH_VAR(parms, MCAST_REPROBES)) ||
+ nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time,
+ NDTPA_PAD) ||
+ nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME,
+ NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) ||
+ nla_put_msecs(skb, NDTPA_GC_STALETIME,
+ NEIGH_VAR(parms, GC_STALETIME), NDTPA_PAD) ||
+ nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME,
+ NEIGH_VAR(parms, DELAY_PROBE_TIME), NDTPA_PAD) ||
+ nla_put_msecs(skb, NDTPA_RETRANS_TIME,
+ NEIGH_VAR(parms, RETRANS_TIME), NDTPA_PAD) ||
+ nla_put_msecs(skb, NDTPA_ANYCAST_DELAY,
+ NEIGH_VAR(parms, ANYCAST_DELAY), NDTPA_PAD) ||
+ nla_put_msecs(skb, NDTPA_PROXY_DELAY,
+ NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) ||
+ nla_put_msecs(skb, NDTPA_LOCKTIME,
+ NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD))
+ goto nla_put_failure;
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
+ u32 pid, u32 seq, int type, int flags)
+{
+ struct nlmsghdr *nlh;
+ struct ndtmsg *ndtmsg;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ndtmsg = nlmsg_data(nlh);
+
+ read_lock_bh(&tbl->lock);
+ ndtmsg->ndtm_family = tbl->family;
+ ndtmsg->ndtm_pad1 = 0;
+ ndtmsg->ndtm_pad2 = 0;
+
+ if (nla_put_string(skb, NDTA_NAME, tbl->id) ||
+ nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval, NDTA_PAD) ||
+ nla_put_u32(skb, NDTA_THRESH1, tbl->gc_thresh1) ||
+ nla_put_u32(skb, NDTA_THRESH2, tbl->gc_thresh2) ||
+ nla_put_u32(skb, NDTA_THRESH3, tbl->gc_thresh3))
+ goto nla_put_failure;
+ {
+ unsigned long now = jiffies;
+ long flush_delta = now - tbl->last_flush;
+ long rand_delta = now - tbl->last_rand;
+ struct neigh_hash_table *nht;
+ struct ndt_config ndc = {
+ .ndtc_key_len = tbl->key_len,
+ .ndtc_entry_size = tbl->entry_size,
+ .ndtc_entries = atomic_read(&tbl->entries),
+ .ndtc_last_flush = jiffies_to_msecs(flush_delta),
+ .ndtc_last_rand = jiffies_to_msecs(rand_delta),
+ .ndtc_proxy_qlen = tbl->proxy_queue.qlen,
+ };
+
+ rcu_read_lock_bh();
+ nht = rcu_dereference_bh(tbl->nht);
+ ndc.ndtc_hash_rnd = nht->hash_rnd[0];
+ ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1);
+ rcu_read_unlock_bh();
+
+ if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc))
+ goto nla_put_failure;
+ }
+
+ {
+ int cpu;
+ struct ndt_stats ndst;
+
+ memset(&ndst, 0, sizeof(ndst));
+
+ for_each_possible_cpu(cpu) {
+ struct neigh_statistics *st;
+
+ st = per_cpu_ptr(tbl->stats, cpu);
+ ndst.ndts_allocs += st->allocs;
+ ndst.ndts_destroys += st->destroys;
+ ndst.ndts_hash_grows += st->hash_grows;
+ ndst.ndts_res_failed += st->res_failed;
+ ndst.ndts_lookups += st->lookups;
+ ndst.ndts_hits += st->hits;
+ ndst.ndts_rcv_probes_mcast += st->rcv_probes_mcast;
+ ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast;
+ ndst.ndts_periodic_gc_runs += st->periodic_gc_runs;
+ ndst.ndts_forced_gc_runs += st->forced_gc_runs;
+ ndst.ndts_table_fulls += st->table_fulls;
+ }
+
+ if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst,
+ NDTA_PAD))
+ goto nla_put_failure;
+ }
+
+ BUG_ON(tbl->parms.dev);
+ if (neightbl_fill_parms(skb, &tbl->parms) < 0)
+ goto nla_put_failure;
+
+ read_unlock_bh(&tbl->lock);
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ read_unlock_bh(&tbl->lock);
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int neightbl_fill_param_info(struct sk_buff *skb,
+ struct neigh_table *tbl,
+ struct neigh_parms *parms,
+ u32 pid, u32 seq, int type,
+ unsigned int flags)
+{
+ struct ndtmsg *ndtmsg;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ndtmsg = nlmsg_data(nlh);
+
+ read_lock_bh(&tbl->lock);
+ ndtmsg->ndtm_family = tbl->family;
+ ndtmsg->ndtm_pad1 = 0;
+ ndtmsg->ndtm_pad2 = 0;
+
+ if (nla_put_string(skb, NDTA_NAME, tbl->id) < 0 ||
+ neightbl_fill_parms(skb, parms) < 0)
+ goto errout;
+
+ read_unlock_bh(&tbl->lock);
+ nlmsg_end(skb, nlh);
+ return 0;
+errout:
+ read_unlock_bh(&tbl->lock);
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = {
+ [NDTA_NAME] = { .type = NLA_STRING },
+ [NDTA_THRESH1] = { .type = NLA_U32 },
+ [NDTA_THRESH2] = { .type = NLA_U32 },
+ [NDTA_THRESH3] = { .type = NLA_U32 },
+ [NDTA_GC_INTERVAL] = { .type = NLA_U64 },
+ [NDTA_PARMS] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
+ [NDTPA_IFINDEX] = { .type = NLA_U32 },
+ [NDTPA_QUEUE_LEN] = { .type = NLA_U32 },
+ [NDTPA_PROXY_QLEN] = { .type = NLA_U32 },
+ [NDTPA_APP_PROBES] = { .type = NLA_U32 },
+ [NDTPA_UCAST_PROBES] = { .type = NLA_U32 },
+ [NDTPA_MCAST_PROBES] = { .type = NLA_U32 },
+ [NDTPA_MCAST_REPROBES] = { .type = NLA_U32 },
+ [NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 },
+ [NDTPA_GC_STALETIME] = { .type = NLA_U64 },
+ [NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 },
+ [NDTPA_RETRANS_TIME] = { .type = NLA_U64 },
+ [NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 },
+ [NDTPA_PROXY_DELAY] = { .type = NLA_U64 },
+ [NDTPA_LOCKTIME] = { .type = NLA_U64 },
+};
+
+static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct neigh_table *tbl;
+ struct ndtmsg *ndtmsg;
+ struct nlattr *tb[NDTA_MAX+1];
+ bool found = false;
+ int err, tidx;
+
+ err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX,
+ nl_neightbl_policy, extack);
+ if (err < 0)
+ goto errout;
+
+ if (tb[NDTA_NAME] == NULL) {
+ err = -EINVAL;
+ goto errout;
+ }
+
+ ndtmsg = nlmsg_data(nlh);
+
+ for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
+ tbl = neigh_tables[tidx];
+ if (!tbl)
+ continue;
+ if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
+ continue;
+ if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
+ return -ENOENT;
+
+ /*
+ * We acquire tbl->lock to be nice to the periodic timers and
+ * make sure they always see a consistent set of values.
+ */
+ write_lock_bh(&tbl->lock);
+
+ if (tb[NDTA_PARMS]) {
+ struct nlattr *tbp[NDTPA_MAX+1];
+ struct neigh_parms *p;
+ int i, ifindex = 0;
+
+ err = nla_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS],
+ nl_ntbl_parm_policy, extack);
+ if (err < 0)
+ goto errout_tbl_lock;
+
+ if (tbp[NDTPA_IFINDEX])
+ ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]);
+
+ p = lookup_neigh_parms(tbl, net, ifindex);
+ if (p == NULL) {
+ err = -ENOENT;
+ goto errout_tbl_lock;
+ }
+
+ for (i = 1; i <= NDTPA_MAX; i++) {
+ if (tbp[i] == NULL)
+ continue;
+
+ switch (i) {
+ case NDTPA_QUEUE_LEN:
+ NEIGH_VAR_SET(p, QUEUE_LEN_BYTES,
+ nla_get_u32(tbp[i]) *
+ SKB_TRUESIZE(ETH_FRAME_LEN));
+ break;
+ case NDTPA_QUEUE_LENBYTES:
+ NEIGH_VAR_SET(p, QUEUE_LEN_BYTES,
+ nla_get_u32(tbp[i]));
+ break;
+ case NDTPA_PROXY_QLEN:
+ NEIGH_VAR_SET(p, PROXY_QLEN,
+ nla_get_u32(tbp[i]));
+ break;
+ case NDTPA_APP_PROBES:
+ NEIGH_VAR_SET(p, APP_PROBES,
+ nla_get_u32(tbp[i]));
+ break;
+ case NDTPA_UCAST_PROBES:
+ NEIGH_VAR_SET(p, UCAST_PROBES,
+ nla_get_u32(tbp[i]));
+ break;
+ case NDTPA_MCAST_PROBES:
+ NEIGH_VAR_SET(p, MCAST_PROBES,
+ nla_get_u32(tbp[i]));
+ break;
+ case NDTPA_MCAST_REPROBES:
+ NEIGH_VAR_SET(p, MCAST_REPROBES,
+ nla_get_u32(tbp[i]));
+ break;
+ case NDTPA_BASE_REACHABLE_TIME:
+ NEIGH_VAR_SET(p, BASE_REACHABLE_TIME,
+ nla_get_msecs(tbp[i]));
+ /* update reachable_time as well, otherwise, the change will
+ * only be effective after the next time neigh_periodic_work
+ * decides to recompute it (can be multiple minutes)
+ */
+ p->reachable_time =
+ neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+ break;
+ case NDTPA_GC_STALETIME:
+ NEIGH_VAR_SET(p, GC_STALETIME,
+ nla_get_msecs(tbp[i]));
+ break;
+ case NDTPA_DELAY_PROBE_TIME:
+ NEIGH_VAR_SET(p, DELAY_PROBE_TIME,
+ nla_get_msecs(tbp[i]));
+ call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
+ break;
+ case NDTPA_RETRANS_TIME:
+ NEIGH_VAR_SET(p, RETRANS_TIME,
+ nla_get_msecs(tbp[i]));
+ break;
+ case NDTPA_ANYCAST_DELAY:
+ NEIGH_VAR_SET(p, ANYCAST_DELAY,
+ nla_get_msecs(tbp[i]));
+ break;
+ case NDTPA_PROXY_DELAY:
+ NEIGH_VAR_SET(p, PROXY_DELAY,
+ nla_get_msecs(tbp[i]));
+ break;
+ case NDTPA_LOCKTIME:
+ NEIGH_VAR_SET(p, LOCKTIME,
+ nla_get_msecs(tbp[i]));
+ break;
+ }
+ }
+ }
+
+ err = -ENOENT;
+ if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] ||
+ tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) &&
+ !net_eq(net, &init_net))
+ goto errout_tbl_lock;
+
+ if (tb[NDTA_THRESH1])
+ tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]);
+
+ if (tb[NDTA_THRESH2])
+ tbl->gc_thresh2 = nla_get_u32(tb[NDTA_THRESH2]);
+
+ if (tb[NDTA_THRESH3])
+ tbl->gc_thresh3 = nla_get_u32(tb[NDTA_THRESH3]);
+
+ if (tb[NDTA_GC_INTERVAL])
+ tbl->gc_interval = nla_get_msecs(tb[NDTA_GC_INTERVAL]);
+
+ err = 0;
+
+errout_tbl_lock:
+ write_unlock_bh(&tbl->lock);
+errout:
+ return err;
+}
+
+static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int family, tidx, nidx = 0;
+ int tbl_skip = cb->args[0];
+ int neigh_skip = cb->args[1];
+ struct neigh_table *tbl;
+
+ family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
+
+ for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
+ struct neigh_parms *p;
+
+ tbl = neigh_tables[tidx];
+ if (!tbl)
+ continue;
+
+ if (tidx < tbl_skip || (family && tbl->family != family))
+ continue;
+
+ if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL,
+ NLM_F_MULTI) < 0)
+ break;
+
+ nidx = 0;
+ p = list_next_entry(&tbl->parms, list);
+ list_for_each_entry_from(p, &tbl->parms_list, list) {
+ if (!net_eq(neigh_parms_net(p), net))
+ continue;
+
+ if (nidx < neigh_skip)
+ goto next;
+
+ if (neightbl_fill_param_info(skb, tbl, p,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGHTBL,
+ NLM_F_MULTI) < 0)
+ goto out;
+ next:
+ nidx++;
+ }
+
+ neigh_skip = 0;
+ }
+out:
+ cb->args[0] = tidx;
+ cb->args[1] = nidx;
+
+ return skb->len;
+}
+
+static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
+ u32 pid, u32 seq, int type, unsigned int flags)
+{
+ unsigned long now = jiffies;
+ struct nda_cacheinfo ci;
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ndm = nlmsg_data(nlh);
+ ndm->ndm_family = neigh->ops->family;
+ ndm->ndm_pad1 = 0;
+ ndm->ndm_pad2 = 0;
+ ndm->ndm_flags = neigh->flags;
+ ndm->ndm_type = neigh->type;
+ ndm->ndm_ifindex = neigh->dev->ifindex;
+
+ if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key))
+ goto nla_put_failure;
+
+ read_lock_bh(&neigh->lock);
+ ndm->ndm_state = neigh->nud_state;
+ if (neigh->nud_state & NUD_VALID) {
+ char haddr[MAX_ADDR_LEN];
+
+ neigh_ha_snapshot(haddr, neigh, neigh->dev);
+ if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) {
+ read_unlock_bh(&neigh->lock);
+ goto nla_put_failure;
+ }
+ }
+
+ ci.ndm_used = jiffies_to_clock_t(now - neigh->used);
+ ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed);
+ ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated);
+ ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1;
+ read_unlock_bh(&neigh->lock);
+
+ if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) ||
+ nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
+ u32 pid, u32 seq, int type, unsigned int flags,
+ struct neigh_table *tbl)
+{
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ndm = nlmsg_data(nlh);
+ ndm->ndm_family = tbl->family;
+ ndm->ndm_pad1 = 0;
+ ndm->ndm_pad2 = 0;
+ ndm->ndm_flags = pn->flags | NTF_PROXY;
+ ndm->ndm_type = RTN_UNICAST;
+ ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0;
+ ndm->ndm_state = NUD_NONE;
+
+ if (nla_put(skb, NDA_DST, tbl->key_len, pn->key))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid)
+{
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
+ __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid);
+}
+
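+/* Dump-time filters: skip neighbours whose device does not match the optional NDA_MASTER / NDA_IFINDEX attributes. */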
+static bool neigh_master_filtered(struct net_device *dev, int master_idx)
+{
+ struct net_device *master;
+
+ if (!master_idx)
+ return false;
+
+ master = netdev_master_upper_dev_get(dev);
+ if (!master || master->ifindex != master_idx)
+ return true;
+
+ return false;
+}
+
+static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx)
+{
+ if (filter_idx && dev->ifindex != filter_idx)
+ return true;
+
+ return false;
+}
+
+static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct nlattr *tb[NDA_MAX + 1];
+ struct neighbour *n;
+ int rc, h, s_h = cb->args[1];
+ int idx, s_idx = idx = cb->args[2];
+ struct neigh_hash_table *nht;
+ int filter_master_idx = 0, filter_idx = 0;
+ unsigned int flags = NLM_F_MULTI;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, NULL);
+ if (!err) {
+ if (tb[NDA_IFINDEX]) {
+ if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
+ return -EINVAL;
+ filter_idx = nla_get_u32(tb[NDA_IFINDEX]);
+ }
+ if (tb[NDA_MASTER]) {
+ if (nla_len(tb[NDA_MASTER]) != sizeof(u32))
+ return -EINVAL;
+ filter_master_idx = nla_get_u32(tb[NDA_MASTER]);
+ }
+ if (filter_idx || filter_master_idx)
+ flags |= NLM_F_DUMP_FILTERED;
+ }
+
+ rcu_read_lock_bh();
+ nht = rcu_dereference_bh(tbl->nht);
+
+ for (h = s_h; h < (1 << nht->hash_shift); h++) {
+ if (h > s_h)
+ s_idx = 0;
+ for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
+ n != NULL;
+ n = rcu_dereference_bh(n->next)) {
+ if (idx < s_idx || !net_eq(dev_net(n->dev), net))
+ goto next;
+ if (neigh_ifindex_filtered(n->dev, filter_idx) ||
+ neigh_master_filtered(n->dev, filter_master_idx))
+ goto next;
+ if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH,
+ flags) < 0) {
+ rc = -1;
+ goto out;
+ }
+next:
+ idx++;
+ }
+ }
+ rc = skb->len;
+out:
+ rcu_read_unlock_bh();
+ cb->args[1] = h;
+ cb->args[2] = idx;
+ return rc;
+}
+
+static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct pneigh_entry *n;
+ struct net *net = sock_net(skb->sk);
+ int rc, h, s_h = cb->args[3];
+ int idx, s_idx = idx = cb->args[4];
+
+ read_lock_bh(&tbl->lock);
+
+ for (h = s_h; h <= PNEIGH_HASHMASK; h++) {
+ if (h > s_h)
+ s_idx = 0;
+ for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) {
+ if (idx < s_idx || pneigh_net(n) != net)
+ goto next;
+ if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH,
+ NLM_F_MULTI, tbl) < 0) {
+ read_unlock_bh(&tbl->lock);
+ rc = -1;
+ goto out;
+ }
+ next:
+ idx++;
+ }
+ }
+
+ read_unlock_bh(&tbl->lock);
+ rc = skb->len;
+out:
+ cb->args[3] = h;
+ cb->args[4] = idx;
+ return rc;
+
+}
+
+static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct neigh_table *tbl;
+ int t, family, s_t;
+ int proxy = 0;
+ int err;
+
+ family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
+
+	/* Check for the presence of a full ndmsg structure; the family
+	 * member is at the same offset in both ndmsg and rtgenmsg.
+	 */
+ if (nlmsg_len(cb->nlh) >= sizeof(struct ndmsg) &&
+ ((struct ndmsg *) nlmsg_data(cb->nlh))->ndm_flags == NTF_PROXY)
+ proxy = 1;
+
+ s_t = cb->args[0];
+
+ for (t = 0; t < NEIGH_NR_TABLES; t++) {
+ tbl = neigh_tables[t];
+
+ if (!tbl)
+ continue;
+ if (t < s_t || (family && tbl->family != family))
+ continue;
+ if (t > s_t)
+ memset(&cb->args[1], 0, sizeof(cb->args) -
+ sizeof(cb->args[0]));
+ if (proxy)
+ err = pneigh_dump_table(tbl, skb, cb);
+ else
+ err = neigh_dump_table(tbl, skb, cb);
+ if (err < 0)
+ break;
+ }
+
+ cb->args[0] = t;
+ return skb->len;
+}
+
+void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie)
+{
+ int chain;
+ struct neigh_hash_table *nht;
+
+ rcu_read_lock_bh();
+ nht = rcu_dereference_bh(tbl->nht);
+
+ read_lock(&tbl->lock); /* avoid resizes */
+ for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
+ struct neighbour *n;
+
+ for (n = rcu_dereference_bh(nht->hash_buckets[chain]);
+ n != NULL;
+ n = rcu_dereference_bh(n->next))
+ cb(n, cookie);
+ }
+ read_unlock(&tbl->lock);
+ rcu_read_unlock_bh();
+}
+EXPORT_SYMBOL(neigh_for_each);
+
+/* The tbl->lock must be held as a writer and BH disabled. */
+void __neigh_for_each_release(struct neigh_table *tbl,
+ int (*cb)(struct neighbour *))
+{
+ int chain;
+ struct neigh_hash_table *nht;
+
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+ for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
+ struct neighbour *n;
+ struct neighbour __rcu **np;
+
+ np = &nht->hash_buckets[chain];
+ while ((n = rcu_dereference_protected(*np,
+ lockdep_is_held(&tbl->lock))) != NULL) {
+ int release;
+
+ write_lock(&n->lock);
+ release = cb(n);
+ if (release) {
+ rcu_assign_pointer(*np,
+ rcu_dereference_protected(n->next,
+ lockdep_is_held(&tbl->lock)));
+ n->dead = 1;
+ } else
+ np = &n->next;
+ write_unlock(&n->lock);
+ if (release)
+ neigh_cleanup_and_release(n);
+ }
+ }
+}
+EXPORT_SYMBOL(__neigh_for_each_release);
+
+int neigh_xmit(int index, struct net_device *dev,
+ const void *addr, struct sk_buff *skb)
+{
+ int err = -EAFNOSUPPORT;
+ if (likely(index < NEIGH_NR_TABLES)) {
+ struct neigh_table *tbl;
+ struct neighbour *neigh;
+
+ tbl = neigh_tables[index];
+ if (!tbl)
+ goto out;
+ rcu_read_lock_bh();
+ if (index == NEIGH_ARP_TABLE) {
+ u32 key = *((u32 *)addr);
+
+ neigh = __ipv4_neigh_lookup_noref(dev, key);
+ } else {
+ neigh = __neigh_lookup_noref(tbl, addr, dev);
+ }
+ if (!neigh)
+ neigh = __neigh_create(tbl, addr, dev, false);
+ err = PTR_ERR(neigh);
+ if (IS_ERR(neigh)) {
+ rcu_read_unlock_bh();
+ goto out_kfree_skb;
+ }
+ err = neigh->output(neigh, skb);
+ rcu_read_unlock_bh();
+ }
+ else if (index == NEIGH_LINK_TABLE) {
+ err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+ addr, NULL, skb->len);
+ if (err < 0)
+ goto out_kfree_skb;
+ err = dev_queue_xmit(skb);
+ }
+out:
+ return err;
+out_kfree_skb:
+ kfree_skb(skb);
+ goto out;
+}
+EXPORT_SYMBOL(neigh_xmit);
+
+#ifdef CONFIG_PROC_FS
+
+static struct neighbour *neigh_get_first(struct seq_file *seq)
+{
+ struct neigh_seq_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct neigh_hash_table *nht = state->nht;
+ struct neighbour *n = NULL;
+ int bucket = state->bucket;
+
+ state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
+ for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) {
+ n = rcu_dereference_bh(nht->hash_buckets[bucket]);
+
+ while (n) {
+ if (!net_eq(dev_net(n->dev), net))
+ goto next;
+ if (state->neigh_sub_iter) {
+ loff_t fakep = 0;
+ void *v;
+
+ v = state->neigh_sub_iter(state, n, &fakep);
+ if (!v)
+ goto next;
+ }
+ if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
+ break;
+ if (n->nud_state & ~NUD_NOARP)
+ break;
+next:
+ n = rcu_dereference_bh(n->next);
+ }
+
+ if (n)
+ break;
+ }
+ state->bucket = bucket;
+
+ return n;
+}
+
+static struct neighbour *neigh_get_next(struct seq_file *seq,
+ struct neighbour *n,
+ loff_t *pos)
+{
+ struct neigh_seq_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct neigh_hash_table *nht = state->nht;
+
+ if (state->neigh_sub_iter) {
+ void *v = state->neigh_sub_iter(state, n, pos);
+ if (v)
+ return n;
+ }
+ n = rcu_dereference_bh(n->next);
+
+ while (1) {
+ while (n) {
+ if (!net_eq(dev_net(n->dev), net))
+ goto next;
+ if (state->neigh_sub_iter) {
+ void *v = state->neigh_sub_iter(state, n, pos);
+ if (v)
+ return n;
+ goto next;
+ }
+ if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
+ break;
+
+ if (n->nud_state & ~NUD_NOARP)
+ break;
+next:
+ n = rcu_dereference_bh(n->next);
+ }
+
+ if (n)
+ break;
+
+ if (++state->bucket >= (1 << nht->hash_shift))
+ break;
+
+ n = rcu_dereference_bh(nht->hash_buckets[state->bucket]);
+ }
+
+ if (n && pos)
+ --(*pos);
+ return n;
+}
+
+static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos)
+{
+ struct neighbour *n = neigh_get_first(seq);
+
+ if (n) {
+ --(*pos);
+ while (*pos) {
+ n = neigh_get_next(seq, n, pos);
+ if (!n)
+ break;
+ }
+ }
+ return *pos ? NULL : n;
+}
+
+static struct pneigh_entry *pneigh_get_first(struct seq_file *seq)
+{
+ struct neigh_seq_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct neigh_table *tbl = state->tbl;
+ struct pneigh_entry *pn = NULL;
+ int bucket = state->bucket;
+
+ state->flags |= NEIGH_SEQ_IS_PNEIGH;
+ for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) {
+ pn = tbl->phash_buckets[bucket];
+ while (pn && !net_eq(pneigh_net(pn), net))
+ pn = pn->next;
+ if (pn)
+ break;
+ }
+ state->bucket = bucket;
+
+ return pn;
+}
+
+static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,
+ struct pneigh_entry *pn,
+ loff_t *pos)
+{
+ struct neigh_seq_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct neigh_table *tbl = state->tbl;
+
+ do {
+ pn = pn->next;
+ } while (pn && !net_eq(pneigh_net(pn), net));
+
+ while (!pn) {
+ if (++state->bucket > PNEIGH_HASHMASK)
+ break;
+ pn = tbl->phash_buckets[state->bucket];
+ while (pn && !net_eq(pneigh_net(pn), net))
+ pn = pn->next;
+ if (pn)
+ break;
+ }
+
+ if (pn && pos)
+ --(*pos);
+
+ return pn;
+}
+
+static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos)
+{
+ struct pneigh_entry *pn = pneigh_get_first(seq);
+
+ if (pn) {
+ --(*pos);
+ while (*pos) {
+ pn = pneigh_get_next(seq, pn, pos);
+ if (!pn)
+ break;
+ }
+ }
+ return *pos ? NULL : pn;
+}
+
+static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos)
+{
+ struct neigh_seq_state *state = seq->private;
+ void *rc;
+ loff_t idxpos = *pos;
+
+ rc = neigh_get_idx(seq, &idxpos);
+ if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY))
+ rc = pneigh_get_idx(seq, &idxpos);
+
+ return rc;
+}
+
+void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags)
+ __acquires(tbl->lock)
+ __acquires(rcu_bh)
+{
+ struct neigh_seq_state *state = seq->private;
+
+ state->tbl = tbl;
+ state->bucket = 0;
+ state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
+
+ rcu_read_lock_bh();
+ state->nht = rcu_dereference_bh(tbl->nht);
+ read_lock(&tbl->lock);
+
+ return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN;
+}
+EXPORT_SYMBOL(neigh_seq_start);
+
+void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct neigh_seq_state *state;
+ void *rc;
+
+ if (v == SEQ_START_TOKEN) {
+ rc = neigh_get_first(seq);
+ goto out;
+ }
+
+ state = seq->private;
+ if (!(state->flags & NEIGH_SEQ_IS_PNEIGH)) {
+ rc = neigh_get_next(seq, v, NULL);
+ if (rc)
+ goto out;
+ if (!(state->flags & NEIGH_SEQ_NEIGH_ONLY))
+ rc = pneigh_get_first(seq);
+ } else {
+ BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY);
+ rc = pneigh_get_next(seq, v, NULL);
+ }
+out:
+ ++(*pos);
+ return rc;
+}
+EXPORT_SYMBOL(neigh_seq_next);
+
+void neigh_seq_stop(struct seq_file *seq, void *v)
+ __releases(tbl->lock)
+ __releases(rcu_bh)
+{
+ struct neigh_seq_state *state = seq->private;
+ struct neigh_table *tbl = state->tbl;
+
+ read_unlock(&tbl->lock);
+ rcu_read_unlock_bh();
+}
+EXPORT_SYMBOL(neigh_seq_stop);
+
+/* statistics via seq_file */
+
+static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct neigh_table *tbl = PDE_DATA(file_inode(seq->file));
+ int cpu;
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu+1;
+ return per_cpu_ptr(tbl->stats, cpu);
+ }
+ return NULL;
+}
+
+static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct neigh_table *tbl = PDE_DATA(file_inode(seq->file));
+ int cpu;
+
+ for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu+1;
+ return per_cpu_ptr(tbl->stats, cpu);
+ }
+ (*pos)++;
+ return NULL;
+}
+
+static void neigh_stat_seq_stop(struct seq_file *seq, void *v)
+{
+
+}
+
+static int neigh_stat_seq_show(struct seq_file *seq, void *v)
+{
+ struct neigh_table *tbl = PDE_DATA(file_inode(seq->file));
+ struct neigh_statistics *st = v;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n");
+ return 0;
+ }
+
+ seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx "
+ "%08lx %08lx %08lx %08lx %08lx %08lx\n",
+ atomic_read(&tbl->entries),
+
+ st->allocs,
+ st->destroys,
+ st->hash_grows,
+
+ st->lookups,
+ st->hits,
+
+ st->res_failed,
+
+ st->rcv_probes_mcast,
+ st->rcv_probes_ucast,
+
+ st->periodic_gc_runs,
+ st->forced_gc_runs,
+ st->unres_discards,
+ st->table_fulls
+ );
+
+ return 0;
+}
+
+static const struct seq_operations neigh_stat_seq_ops = {
+ .start = neigh_stat_seq_start,
+ .next = neigh_stat_seq_next,
+ .stop = neigh_stat_seq_stop,
+ .show = neigh_stat_seq_show,
+};
+#endif /* CONFIG_PROC_FS */
+
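+/* Worst-case payload size of a single neighbour netlink notification. */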
+static inline size_t neigh_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ndmsg))
+ + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */
+ + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */
+ + nla_total_size(sizeof(struct nda_cacheinfo))
+ + nla_total_size(4); /* NDA_PROBES */
+}
+
+static void __neigh_notify(struct neighbour *n, int type, int flags,
+ u32 pid)
+{
+ struct net *net = dev_net(n->dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC);
+ if (skb == NULL)
+ goto errout;
+
+ err = neigh_fill_info(skb, n, pid, 0, type, flags);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+}
+
+void neigh_app_ns(struct neighbour *n)
+{
+ __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0);
+}
+EXPORT_SYMBOL(neigh_app_ns);
+
+#ifdef CONFIG_SYSCTL
+static int zero;
+static int int_max = INT_MAX;
+static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);
+
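+/* "unres_qlen" is exposed to userspace in packets but stored internally in bytes
+ * (QUEUE_LEN_BYTES); convert using SKB_TRUESIZE(ETH_FRAME_LEN).
+ */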
+static int proc_unres_qlen(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int size, ret;
+ struct ctl_table tmp = *ctl;
+
+ tmp.extra1 = &zero;
+ tmp.extra2 = &unres_qlen_max;
+ tmp.data = &size;
+
+ size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN);
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && !ret)
+ *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN);
+ return ret;
+}
+
+static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev,
+ int family)
+{
+ switch (family) {
+ case AF_INET:
+ return __in_dev_arp_parms_get_rcu(dev);
+ case AF_INET6:
+ return __in6_dev_nd_parms_get_rcu(dev);
+ }
+ return NULL;
+}
+
+static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p,
+ int index)
+{
+ struct net_device *dev;
+ int family = neigh_parms_family(p);
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ struct neigh_parms *dst_p =
+ neigh_get_dev_parms_rcu(dev, family);
+
+ if (dst_p && !test_bit(index, dst_p->data_state))
+ dst_p->data[index] = p->data[index];
+ }
+ rcu_read_unlock();
+}
+
+static void neigh_proc_update(struct ctl_table *ctl, int write)
+{
+ struct net_device *dev = ctl->extra1;
+ struct neigh_parms *p = ctl->extra2;
+ struct net *net = neigh_parms_net(p);
+ int index = (int *) ctl->data - p->data;
+
+ if (!write)
+ return;
+
+ set_bit(index, p->data_state);
+ if (index == NEIGH_VAR_DELAY_PROBE_TIME)
+ call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
+ if (!dev) /* NULL dev means this is default value */
+ neigh_copy_dflt_parms(net, p, index);
+}
+
+static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table tmp = *ctl;
+ int ret;
+
+ tmp.extra1 = &zero;
+ tmp.extra2 = &int_max;
+
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+
+int neigh_proc_dointvec(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+EXPORT_SYMBOL(neigh_proc_dointvec);
+
+int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+EXPORT_SYMBOL(neigh_proc_dointvec_jiffies);
+
+static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+
+int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies);
+
+static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+
+static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct neigh_parms *p = ctl->extra2;
+ int ret;
+
+ if (strcmp(ctl->procname, "base_reachable_time") == 0)
+ ret = neigh_proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
+ else if (strcmp(ctl->procname, "base_reachable_time_ms") == 0)
+ ret = neigh_proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos);
+ else
+ ret = -1;
+
+ if (write && ret == 0) {
+ /* update reachable_time as well, otherwise, the change will
+ * only be effective after the next time neigh_periodic_work
+ * decides to recompute it
+ */
+ p->reachable_time =
+ neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+ }
+ return ret;
+}
+
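+/* Sysctl entries initially store only the offset of data[index]; the real
+ * neigh_parms pointer is added in neigh_sysctl_register().
+ */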
+#define NEIGH_PARMS_DATA_OFFSET(index) \
+ (&((struct neigh_parms *) 0)->data[index])
+
+#define NEIGH_SYSCTL_ENTRY(attr, data_attr, name, mval, proc) \
+ [NEIGH_VAR_ ## attr] = { \
+ .procname = name, \
+ .data = NEIGH_PARMS_DATA_OFFSET(NEIGH_VAR_ ## data_attr), \
+ .maxlen = sizeof(int), \
+ .mode = mval, \
+ .proc_handler = proc, \
+ }
+
+#define NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_zero_intmax)
+
+#define NEIGH_SYSCTL_JIFFIES_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_jiffies)
+
+#define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies)
+
+#define NEIGH_SYSCTL_MS_JIFFIES_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
+
+#define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
+
+#define NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(attr, data_attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_unres_qlen)
+
+static struct neigh_sysctl_table {
+ struct ctl_table_header *sysctl_header;
+ struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1];
+} neigh_sysctl_template __read_mostly = {
+ .neigh_vars = {
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_REPROBES, "mcast_resolicit"),
+ NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"),
+ NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"),
+ NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"),
+ NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"),
+ NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"),
+ NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"),
+ NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"),
+ NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"),
+ NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"),
+ NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"),
+ [NEIGH_VAR_GC_INTERVAL] = {
+ .procname = "gc_interval",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NEIGH_VAR_GC_THRESH1] = {
+ .procname = "gc_thresh1",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .extra1 = &zero,
+ .extra2 = &int_max,
+ .proc_handler = proc_dointvec_minmax,
+ },
+ [NEIGH_VAR_GC_THRESH2] = {
+ .procname = "gc_thresh2",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .extra1 = &zero,
+ .extra2 = &int_max,
+ .proc_handler = proc_dointvec_minmax,
+ },
+ [NEIGH_VAR_GC_THRESH3] = {
+ .procname = "gc_thresh3",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .extra1 = &zero,
+ .extra2 = &int_max,
+ .proc_handler = proc_dointvec_minmax,
+ },
+ {},
+ },
+};
+
+int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
+ proc_handler *handler)
+{
+ int i;
+ struct neigh_sysctl_table *t;
+ const char *dev_name_source;
+ char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ];
+ char *p_name;
+
+ t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL);
+ if (!t)
+ goto err;
+
+ for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) {
+ t->neigh_vars[i].data += (long) p;
+ t->neigh_vars[i].extra1 = dev;
+ t->neigh_vars[i].extra2 = p;
+ }
+
+ if (dev) {
+ dev_name_source = dev->name;
+ /* Terminate the table early */
+ memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0,
+ sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));
+ } else {
+ struct neigh_table *tbl = p->tbl;
+ dev_name_source = "default";
+ t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3;
+ }
+
+ if (handler) {
+ /* RetransTime */
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler;
+ /* ReachableTime */
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler;
+ /* RetransTime (in milliseconds)*/
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler;
+ /* ReachableTime (in milliseconds) */
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler;
+ } else {
+		/* These handlers update p->reachable_time after
+		 * base_reachable_time(_ms) is set, so that the new timer takes
+		 * effect after the next neighbour update instead of waiting for
+		 * neigh_periodic_work to recompute it (which can take multiple
+		 * minutes). Any handler that replaces them should do the same.
+		 */
+ /* ReachableTime */
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler =
+ neigh_proc_base_reachable_time;
+ /* ReachableTime (in milliseconds) */
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler =
+ neigh_proc_base_reachable_time;
+ }
+
+ /* Don't export sysctls to unprivileged users */
+ if (neigh_parms_net(p)->user_ns != &init_user_ns)
+ t->neigh_vars[0].procname = NULL;
+
+ switch (neigh_parms_family(p)) {
+ case AF_INET:
+ p_name = "ipv4";
+ break;
+ case AF_INET6:
+ p_name = "ipv6";
+ break;
+ default:
+ BUG();
+ }
+
+ snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s",
+ p_name, dev_name_source);
+ t->sysctl_header =
+ register_net_sysctl(neigh_parms_net(p), neigh_path, t->neigh_vars);
+ if (!t->sysctl_header)
+ goto free;
+
+ p->sysctl_table = t;
+ return 0;
+
+free:
+ kfree(t);
+err:
+ return -ENOBUFS;
+}
+EXPORT_SYMBOL(neigh_sysctl_register);
+
+void neigh_sysctl_unregister(struct neigh_parms *p)
+{
+ if (p->sysctl_table) {
+ struct neigh_sysctl_table *t = p->sysctl_table;
+ p->sysctl_table = NULL;
+ unregister_net_sysctl_table(t->sysctl_header);
+ kfree(t);
+ }
+}
+EXPORT_SYMBOL(neigh_sysctl_unregister);
+
+#endif /* CONFIG_SYSCTL */
+
+static int __init neigh_init(void)
+{
+ rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, 0);
+
+ rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info,
+ 0);
+ rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, 0);
+
+ return 0;
+}
+
+subsys_initcall(neigh_init);
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
new file mode 100644
index 000000000..2808c5f9c
--- /dev/null
+++ b/net/core/net-procfs.c
@@ -0,0 +1,397 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/wext.h>
+
+#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
+
+#define get_bucket(x) ((x) >> BUCKET_SPACE)
+#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
+#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
+
+extern struct list_head ptype_all __read_mostly;
+extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
+
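+/* The seq_file position packs a name-hash bucket into the high bits and a
+ * 1-based offset within that bucket into the low BUCKET_SPACE bits.
+ */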
+static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ struct net_device *dev;
+ struct hlist_head *h;
+ unsigned int count = 0, offset = get_offset(*pos);
+
+ h = &net->dev_name_head[get_bucket(*pos)];
+ hlist_for_each_entry_rcu(dev, h, name_hlist) {
+ if (++count == offset)
+ return dev;
+ }
+
+ return NULL;
+}
+
+static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
+{
+ struct net_device *dev;
+ unsigned int bucket;
+
+ do {
+ dev = dev_from_same_bucket(seq, pos);
+ if (dev)
+ return dev;
+
+ bucket = get_bucket(*pos) + 1;
+ *pos = set_bucket_offset(bucket, 1);
+ } while (bucket < NETDEV_HASHENTRIES);
+
+ return NULL;
+}
+
+/*
+ * This is invoked by the /proc filesystem handler to display a device
+ * in detail.
+ */
+static void *dev_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ if (!*pos)
+ return SEQ_START_TOKEN;
+
+ if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
+ return NULL;
+
+ return dev_from_bucket(seq, pos);
+}
+
+static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return dev_from_bucket(seq, pos);
+}
+
+static void dev_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
+{
+ struct rtnl_link_stats64 temp;
+ const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
+
+ seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
+ "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
+ dev->name, stats->rx_bytes, stats->rx_packets,
+ stats->rx_errors,
+ stats->rx_dropped + stats->rx_missed_errors,
+ stats->rx_fifo_errors,
+ stats->rx_length_errors + stats->rx_over_errors +
+ stats->rx_crc_errors + stats->rx_frame_errors,
+ stats->rx_compressed, stats->multicast,
+ stats->tx_bytes, stats->tx_packets,
+ stats->tx_errors, stats->tx_dropped,
+ stats->tx_fifo_errors, stats->collisions,
+ stats->tx_carrier_errors +
+ stats->tx_aborted_errors +
+ stats->tx_window_errors +
+ stats->tx_heartbeat_errors,
+ stats->tx_compressed);
+}
+
+/*
+ * Called from the PROCfs module. This now uses the new arbitrary sized
+ * /proc/net interface to create /proc/net/dev
+ */
+static int dev_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "Inter-| Receive "
+ " | Transmit\n"
+ " face |bytes packets errs drop fifo frame "
+ "compressed multicast|bytes packets errs "
+ "drop fifo colls carrier compressed\n");
+ else
+ dev_seq_printf_stats(seq, v);
+ return 0;
+}
+
+static struct softnet_data *softnet_get_online(loff_t *pos)
+{
+ struct softnet_data *sd = NULL;
+
+ while (*pos < nr_cpu_ids)
+ if (cpu_online(*pos)) {
+ sd = &per_cpu(softnet_data, *pos);
+ break;
+ } else
+ ++*pos;
+ return sd;
+}
+
+static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return softnet_get_online(pos);
+}
+
+static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return softnet_get_online(pos);
+}
+
+static void softnet_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
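+/* Format one /proc/net/softnet_stat row per online CPU; the obsolete
+ * fastroute and cpu_collision columns are printed as zero.
+ */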
+static int softnet_seq_show(struct seq_file *seq, void *v)
+{
+ struct softnet_data *sd = v;
+ unsigned int flow_limit_count = 0;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+ struct sd_flow_limit *fl;
+
+ rcu_read_lock();
+ fl = rcu_dereference(sd->flow_limit);
+ if (fl)
+ flow_limit_count = fl->count;
+ rcu_read_unlock();
+#endif
+
+ seq_printf(seq,
+ "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ sd->processed, sd->dropped, sd->time_squeeze, 0,
+ 0, 0, 0, 0, /* was fastroute */
+ 0, /* was cpu_collision */
+ sd->received_rps, flow_limit_count);
+ return 0;
+}
+
+static const struct seq_operations dev_seq_ops = {
+ .start = dev_seq_start,
+ .next = dev_seq_next,
+ .stop = dev_seq_stop,
+ .show = dev_seq_show,
+};
+
+static const struct seq_operations softnet_seq_ops = {
+ .start = softnet_seq_start,
+ .next = softnet_seq_next,
+ .stop = softnet_seq_stop,
+ .show = softnet_seq_show,
+};
+
+static void *ptype_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct list_head *ptype_list = NULL;
+ struct packet_type *pt = NULL;
+ struct net_device *dev;
+ loff_t i = 0;
+ int t;
+
+ for_each_netdev_rcu(seq_file_net(seq), dev) {
+ ptype_list = &dev->ptype_all;
+ list_for_each_entry_rcu(pt, ptype_list, list) {
+ if (i == pos)
+ return pt;
+ ++i;
+ }
+ }
+
+ list_for_each_entry_rcu(pt, &ptype_all, list) {
+ if (i == pos)
+ return pt;
+ ++i;
+ }
+
+ for (t = 0; t < PTYPE_HASH_SIZE; t++) {
+ list_for_each_entry_rcu(pt, &ptype_base[t], list) {
+ if (i == pos)
+ return pt;
+ ++i;
+ }
+ }
+ return NULL;
+}
+
+static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return *pos ? ptype_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct net_device *dev;
+ struct packet_type *pt;
+ struct list_head *nxt;
+ int hash;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return ptype_get_idx(seq, 0);
+
+ pt = v;
+ nxt = pt->list.next;
+ if (pt->dev) {
+ if (nxt != &pt->dev->ptype_all)
+ goto found;
+
+ dev = pt->dev;
+ for_each_netdev_continue_rcu(seq_file_net(seq), dev) {
+ if (!list_empty(&dev->ptype_all)) {
+ nxt = dev->ptype_all.next;
+ goto found;
+ }
+ }
+
+ nxt = ptype_all.next;
+ goto ptype_all;
+ }
+
+ if (pt->type == htons(ETH_P_ALL)) {
+ptype_all:
+ if (nxt != &ptype_all)
+ goto found;
+ hash = 0;
+ nxt = ptype_base[0].next;
+ } else
+ hash = ntohs(pt->type) & PTYPE_HASH_MASK;
+
+ while (nxt == &ptype_base[hash]) {
+ if (++hash >= PTYPE_HASH_SIZE)
+ return NULL;
+ nxt = ptype_base[hash].next;
+ }
+found:
+ return list_entry(nxt, struct packet_type, list);
+}
+
+static void ptype_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int ptype_seq_show(struct seq_file *seq, void *v)
+{
+ struct packet_type *pt = v;
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "Type Device Function\n");
+ else if ((!pt->af_packet_net || net_eq(pt->af_packet_net, seq_file_net(seq))) &&
+ (!pt->dev || net_eq(dev_net(pt->dev), seq_file_net(seq)))) {
+ if (pt->type == htons(ETH_P_ALL))
+ seq_puts(seq, "ALL ");
+ else
+ seq_printf(seq, "%04x", ntohs(pt->type));
+
+ seq_printf(seq, " %-8s %pf\n",
+ pt->dev ? pt->dev->name : "", pt->func);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations ptype_seq_ops = {
+ .start = ptype_seq_start,
+ .next = ptype_seq_next,
+ .stop = ptype_seq_stop,
+ .show = ptype_seq_show,
+};
+
+static int __net_init dev_proc_net_init(struct net *net)
+{
+ int rc = -ENOMEM;
+
+ if (!proc_create_net("dev", 0444, net->proc_net, &dev_seq_ops,
+ sizeof(struct seq_net_private)))
+ goto out;
+ if (!proc_create_seq("softnet_stat", 0444, net->proc_net,
+ &softnet_seq_ops))
+ goto out_dev;
+ if (!proc_create_net("ptype", 0444, net->proc_net, &ptype_seq_ops,
+ sizeof(struct seq_net_private)))
+ goto out_softnet;
+
+ if (wext_proc_init(net))
+ goto out_ptype;
+ rc = 0;
+out:
+ return rc;
+out_ptype:
+ remove_proc_entry("ptype", net->proc_net);
+out_softnet:
+ remove_proc_entry("softnet_stat", net->proc_net);
+out_dev:
+ remove_proc_entry("dev", net->proc_net);
+ goto out;
+}
+
+static void __net_exit dev_proc_net_exit(struct net *net)
+{
+ wext_proc_exit(net);
+
+ remove_proc_entry("ptype", net->proc_net);
+ remove_proc_entry("softnet_stat", net->proc_net);
+ remove_proc_entry("dev", net->proc_net);
+}
+
+static struct pernet_operations __net_initdata dev_proc_ops = {
+ .init = dev_proc_net_init,
+ .exit = dev_proc_net_exit,
+};
+
+static int dev_mc_seq_show(struct seq_file *seq, void *v)
+{
+ struct netdev_hw_addr *ha;
+ struct net_device *dev = v;
+
+ if (v == SEQ_START_TOKEN)
+ return 0;
+
+ netif_addr_lock_bh(dev);
+ netdev_for_each_mc_addr(ha, dev) {
+ seq_printf(seq, "%-4d %-15s %-5d %-5d %*phN\n",
+ dev->ifindex, dev->name,
+ ha->refcount, ha->global_use,
+ (int)dev->addr_len, ha->addr);
+ }
+ netif_addr_unlock_bh(dev);
+ return 0;
+}
+
+static const struct seq_operations dev_mc_seq_ops = {
+ .start = dev_seq_start,
+ .next = dev_seq_next,
+ .stop = dev_seq_stop,
+ .show = dev_mc_seq_show,
+};
+
+static int __net_init dev_mc_net_init(struct net *net)
+{
+ if (!proc_create_net("dev_mcast", 0, net->proc_net, &dev_mc_seq_ops,
+ sizeof(struct seq_net_private)))
+ return -ENOMEM;
+ return 0;
+}
+
+static void __net_exit dev_mc_net_exit(struct net *net)
+{
+ remove_proc_entry("dev_mcast", net->proc_net);
+}
+
+static struct pernet_operations __net_initdata dev_mc_net_ops = {
+ .init = dev_mc_net_init,
+ .exit = dev_mc_net_exit,
+};
+
+int __init dev_proc_init(void)
+{
+ int ret = register_pernet_subsys(&dev_proc_ops);
+ if (!ret)
+ return register_pernet_subsys(&dev_mc_net_ops);
+ return ret;
+}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
new file mode 100644
index 000000000..7a11b2d90
--- /dev/null
+++ b/net/core/net-sysfs.c
@@ -0,0 +1,1842 @@
+/*
+ * net-sysfs.c - network device class and attributes
+ *
+ * Copyright (c) 2003 Stephen Hemminger <shemminger@osdl.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <net/switchdev.h>
+#include <linux/if_arp.h>
+#include <linux/slab.h>
+#include <linux/sched/signal.h>
+#include <linux/nsproxy.h>
+#include <net/sock.h>
+#include <net/net_namespace.h>
+#include <linux/rtnetlink.h>
+#include <linux/vmalloc.h>
+#include <linux/export.h>
+#include <linux/jiffies.h>
+#include <linux/pm_runtime.h>
+#include <linux/of.h>
+#include <linux/of_net.h>
+#include <linux/cpu.h>
+
+#include "net-sysfs.h"
+
+#ifdef CONFIG_SYSFS
+static const char fmt_hex[] = "%#x\n";
+static const char fmt_dec[] = "%d\n";
+static const char fmt_ulong[] = "%lu\n";
+static const char fmt_u64[] = "%llu\n";
+
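+/* A netdev remains readable through sysfs until it starts unregistering
+ * (reg_state <= NETREG_REGISTERED).
+ */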
+static inline int dev_isalive(const struct net_device *dev)
+{
+ return dev->reg_state <= NETREG_REGISTERED;
+}
+
+/* use same locking rules as GIF* ioctl's */
+static ssize_t netdev_show(const struct device *dev,
+ struct device_attribute *attr, char *buf,
+ ssize_t (*format)(const struct net_device *, char *))
+{
+ struct net_device *ndev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ read_lock(&dev_base_lock);
+ if (dev_isalive(ndev))
+ ret = (*format)(ndev, buf);
+ read_unlock(&dev_base_lock);
+
+ return ret;
+}
+
+/* generate a show function for simple field */
+#define NETDEVICE_SHOW(field, format_string) \
+static ssize_t format_##field(const struct net_device *dev, char *buf) \
+{ \
+ return sprintf(buf, format_string, dev->field); \
+} \
+static ssize_t field##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ return netdev_show(dev, attr, buf, format_##field); \
+} \
+
+#define NETDEVICE_SHOW_RO(field, format_string) \
+NETDEVICE_SHOW(field, format_string); \
+static DEVICE_ATTR_RO(field)
+
+#define NETDEVICE_SHOW_RW(field, format_string) \
+NETDEVICE_SHOW(field, format_string); \
+static DEVICE_ATTR_RW(field)
+
+/* use same locking and permission rules as SIF* ioctl's */
+static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len,
+ int (*set)(struct net_device *, unsigned long))
+{
+ struct net_device *netdev = to_net_dev(dev);
+ struct net *net = dev_net(netdev);
+ unsigned long new;
+ int ret = -EINVAL;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ ret = kstrtoul(buf, 0, &new);
+ if (ret)
+ goto err;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev_isalive(netdev)) {
+ ret = (*set)(netdev, new);
+ if (ret == 0)
+ ret = len;
+ }
+ rtnl_unlock();
+ err:
+ return ret;
+}
+
+NETDEVICE_SHOW_RO(dev_id, fmt_hex);
+NETDEVICE_SHOW_RO(dev_port, fmt_dec);
+NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec);
+NETDEVICE_SHOW_RO(addr_len, fmt_dec);
+NETDEVICE_SHOW_RO(ifindex, fmt_dec);
+NETDEVICE_SHOW_RO(type, fmt_dec);
+NETDEVICE_SHOW_RO(link_mode, fmt_dec);
+
+static ssize_t iflink_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *ndev = to_net_dev(dev);
+
+ return sprintf(buf, fmt_dec, dev_get_iflink(ndev));
+}
+static DEVICE_ATTR_RO(iflink);
+
+static ssize_t format_name_assign_type(const struct net_device *dev, char *buf)
+{
+ return sprintf(buf, fmt_dec, dev->name_assign_type);
+}
+
+static ssize_t name_assign_type_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *ndev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ if (ndev->name_assign_type != NET_NAME_UNKNOWN)
+ ret = netdev_show(dev, attr, buf, format_name_assign_type);
+
+ return ret;
+}
+static DEVICE_ATTR_RO(name_assign_type);
+
+/* use same locking rules as GIFHWADDR ioctl's */
+static ssize_t address_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *ndev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ read_lock(&dev_base_lock);
+ if (dev_isalive(ndev))
+ ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len);
+ read_unlock(&dev_base_lock);
+ return ret;
+}
+static DEVICE_ATTR_RO(address);
+
+static ssize_t broadcast_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *ndev = to_net_dev(dev);
+
+ if (dev_isalive(ndev))
+ return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len);
+ return -EINVAL;
+}
+static DEVICE_ATTR_RO(broadcast);
+
+static int change_carrier(struct net_device *dev, unsigned long new_carrier)
+{
+ if (!netif_running(dev))
+ return -EINVAL;
+ return dev_change_carrier(dev, (bool)new_carrier);
+}
+
+static ssize_t carrier_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_carrier);
+}
+
+static ssize_t carrier_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+
+ if (netif_running(netdev))
+ return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev));
+
+ return -EINVAL;
+}
+static DEVICE_ATTR_RW(carrier);
+
+static ssize_t speed_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ int ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (netif_running(netdev) && netif_device_present(netdev)) {
+ struct ethtool_link_ksettings cmd;
+
+ if (!__ethtool_get_link_ksettings(netdev, &cmd))
+ ret = sprintf(buf, fmt_dec, cmd.base.speed);
+ }
+ rtnl_unlock();
+ return ret;
+}
+static DEVICE_ATTR_RO(speed);
+
+static ssize_t duplex_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ int ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (netif_running(netdev)) {
+ struct ethtool_link_ksettings cmd;
+
+ if (!__ethtool_get_link_ksettings(netdev, &cmd)) {
+ const char *duplex;
+
+ switch (cmd.base.duplex) {
+ case DUPLEX_HALF:
+ duplex = "half";
+ break;
+ case DUPLEX_FULL:
+ duplex = "full";
+ break;
+ default:
+ duplex = "unknown";
+ break;
+ }
+ ret = sprintf(buf, "%s\n", duplex);
+ }
+ }
+ rtnl_unlock();
+ return ret;
+}
+static DEVICE_ATTR_RO(duplex);
+
+static ssize_t dormant_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+
+ if (netif_running(netdev))
+ return sprintf(buf, fmt_dec, !!netif_dormant(netdev));
+
+ return -EINVAL;
+}
+static DEVICE_ATTR_RO(dormant);
+
+static const char *const operstates[] = {
+ "unknown",
+ "notpresent", /* currently unused */
+ "down",
+ "lowerlayerdown",
+ "testing", /* currently unused */
+ "dormant",
+ "up"
+};
+
+static ssize_t operstate_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ const struct net_device *netdev = to_net_dev(dev);
+ unsigned char operstate;
+
+ read_lock(&dev_base_lock);
+ operstate = netdev->operstate;
+ if (!netif_running(netdev))
+ operstate = IF_OPER_DOWN;
+ read_unlock(&dev_base_lock);
+
+ if (operstate >= ARRAY_SIZE(operstates))
+ return -EINVAL; /* should not happen */
+
+ return sprintf(buf, "%s\n", operstates[operstate]);
+}
+static DEVICE_ATTR_RO(operstate);
+
+static ssize_t carrier_changes_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+
+ return sprintf(buf, fmt_dec,
+ atomic_read(&netdev->carrier_up_count) +
+ atomic_read(&netdev->carrier_down_count));
+}
+static DEVICE_ATTR_RO(carrier_changes);
+
+static ssize_t carrier_up_count_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+
+ return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_up_count));
+}
+static DEVICE_ATTR_RO(carrier_up_count);
+
+static ssize_t carrier_down_count_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+
+ return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_down_count));
+}
+static DEVICE_ATTR_RO(carrier_down_count);
+
+/* read-write attributes */
+
+static int change_mtu(struct net_device *dev, unsigned long new_mtu)
+{
+ return dev_set_mtu(dev, (int)new_mtu);
+}
+
+static ssize_t mtu_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_mtu);
+}
+NETDEVICE_SHOW_RW(mtu, fmt_dec);
+
+static int change_flags(struct net_device *dev, unsigned long new_flags)
+{
+ return dev_change_flags(dev, (unsigned int)new_flags);
+}
+
+static ssize_t flags_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_flags);
+}
+NETDEVICE_SHOW_RW(flags, fmt_hex);
+
+static ssize_t tx_queue_len_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ return netdev_store(dev, attr, buf, len, dev_change_tx_queue_len);
+}
+NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec);
+
+static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
+{
+ dev->gro_flush_timeout = val;
+ return 0;
+}
+
+static ssize_t gro_flush_timeout_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ return netdev_store(dev, attr, buf, len, change_gro_flush_timeout);
+}
+NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);
+
+static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ struct net *net = dev_net(netdev);
+ size_t count = len;
+ ssize_t ret = 0;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ /* ignore trailing newline */
+ if (len > 0 && buf[len - 1] == '\n')
+ --count;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev_isalive(netdev)) {
+ ret = dev_set_alias(netdev, buf, count);
+ if (ret < 0)
+ goto err;
+ ret = len;
+ netdev_state_change(netdev);
+ }
+err:
+ rtnl_unlock();
+
+ return ret;
+}
+
+static ssize_t ifalias_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ const struct net_device *netdev = to_net_dev(dev);
+ char tmp[IFALIASZ];
+ ssize_t ret = 0;
+
+ ret = dev_get_alias(netdev, tmp, sizeof(tmp));
+ if (ret > 0)
+ ret = sprintf(buf, "%s\n", tmp);
+ return ret;
+}
+static DEVICE_ATTR_RW(ifalias);
+
+static int change_group(struct net_device *dev, unsigned long new_group)
+{
+ dev_set_group(dev, (int)new_group);
+ return 0;
+}
+
+static ssize_t group_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_group);
+}
+NETDEVICE_SHOW(group, fmt_dec);
+static DEVICE_ATTR(netdev_group, 0644, group_show, group_store);
+
+static int change_proto_down(struct net_device *dev, unsigned long proto_down)
+{
+ return dev_change_proto_down(dev, (bool)proto_down);
+}
+
+static ssize_t proto_down_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_proto_down);
+}
+NETDEVICE_SHOW_RW(proto_down, fmt_dec);
+
+static ssize_t phys_port_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev_isalive(netdev)) {
+ struct netdev_phys_item_id ppid;
+
+ ret = dev_get_phys_port_id(netdev, &ppid);
+ if (!ret)
+ ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id);
+ }
+ rtnl_unlock();
+
+ return ret;
+}
+static DEVICE_ATTR_RO(phys_port_id);
+
+static ssize_t phys_port_name_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev_isalive(netdev)) {
+ char name[IFNAMSIZ];
+
+ ret = dev_get_phys_port_name(netdev, name, sizeof(name));
+ if (!ret)
+ ret = sprintf(buf, "%s\n", name);
+ }
+ rtnl_unlock();
+
+ return ret;
+}
+static DEVICE_ATTR_RO(phys_port_name);
+
+static ssize_t phys_switch_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev_isalive(netdev)) {
+ struct switchdev_attr attr = {
+ .orig_dev = netdev,
+ .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
+ .flags = SWITCHDEV_F_NO_RECURSE,
+ };
+
+ ret = switchdev_port_attr_get(netdev, &attr);
+ if (!ret)
+ ret = sprintf(buf, "%*phN\n", attr.u.ppid.id_len,
+ attr.u.ppid.id);
+ }
+ rtnl_unlock();
+
+ return ret;
+}
+static DEVICE_ATTR_RO(phys_switch_id);
+
+static struct attribute *net_class_attrs[] __ro_after_init = {
+ &dev_attr_netdev_group.attr,
+ &dev_attr_type.attr,
+ &dev_attr_dev_id.attr,
+ &dev_attr_dev_port.attr,
+ &dev_attr_iflink.attr,
+ &dev_attr_ifindex.attr,
+ &dev_attr_name_assign_type.attr,
+ &dev_attr_addr_assign_type.attr,
+ &dev_attr_addr_len.attr,
+ &dev_attr_link_mode.attr,
+ &dev_attr_address.attr,
+ &dev_attr_broadcast.attr,
+ &dev_attr_speed.attr,
+ &dev_attr_duplex.attr,
+ &dev_attr_dormant.attr,
+ &dev_attr_operstate.attr,
+ &dev_attr_carrier_changes.attr,
+ &dev_attr_ifalias.attr,
+ &dev_attr_carrier.attr,
+ &dev_attr_mtu.attr,
+ &dev_attr_flags.attr,
+ &dev_attr_tx_queue_len.attr,
+ &dev_attr_gro_flush_timeout.attr,
+ &dev_attr_phys_port_id.attr,
+ &dev_attr_phys_port_name.attr,
+ &dev_attr_phys_switch_id.attr,
+ &dev_attr_proto_down.attr,
+ &dev_attr_carrier_up_count.attr,
+ &dev_attr_carrier_down_count.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(net_class);
+
+/* Show a given attribute in the statistics group */
+static ssize_t netstat_show(const struct device *d,
+ struct device_attribute *attr, char *buf,
+ unsigned long offset)
+{
+ struct net_device *dev = to_net_dev(d);
+ ssize_t ret = -EINVAL;
+
+ WARN_ON(offset > sizeof(struct rtnl_link_stats64) ||
+ offset % sizeof(u64) != 0);
+
+ read_lock(&dev_base_lock);
+ if (dev_isalive(dev)) {
+ struct rtnl_link_stats64 temp;
+ const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
+
+ ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset));
+ }
+ read_unlock(&dev_base_lock);
+ return ret;
+}
+
+/* generate a read-only statistics attribute */
+#define NETSTAT_ENTRY(name) \
+static ssize_t name##_show(struct device *d, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ return netstat_show(d, attr, buf, \
+ offsetof(struct rtnl_link_stats64, name)); \
+} \
+static DEVICE_ATTR_RO(name)
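+
+/* For example, NETSTAT_ENTRY(rx_packets) expands to rx_packets_show(), which
+ * hands netstat_show() the offset of the rx_packets field inside
+ * struct rtnl_link_stats64, plus a matching read-only dev_attr_rx_packets.
+ */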
+
+NETSTAT_ENTRY(rx_packets);
+NETSTAT_ENTRY(tx_packets);
+NETSTAT_ENTRY(rx_bytes);
+NETSTAT_ENTRY(tx_bytes);
+NETSTAT_ENTRY(rx_errors);
+NETSTAT_ENTRY(tx_errors);
+NETSTAT_ENTRY(rx_dropped);
+NETSTAT_ENTRY(tx_dropped);
+NETSTAT_ENTRY(multicast);
+NETSTAT_ENTRY(collisions);
+NETSTAT_ENTRY(rx_length_errors);
+NETSTAT_ENTRY(rx_over_errors);
+NETSTAT_ENTRY(rx_crc_errors);
+NETSTAT_ENTRY(rx_frame_errors);
+NETSTAT_ENTRY(rx_fifo_errors);
+NETSTAT_ENTRY(rx_missed_errors);
+NETSTAT_ENTRY(tx_aborted_errors);
+NETSTAT_ENTRY(tx_carrier_errors);
+NETSTAT_ENTRY(tx_fifo_errors);
+NETSTAT_ENTRY(tx_heartbeat_errors);
+NETSTAT_ENTRY(tx_window_errors);
+NETSTAT_ENTRY(rx_compressed);
+NETSTAT_ENTRY(tx_compressed);
+NETSTAT_ENTRY(rx_nohandler);
+
+static struct attribute *netstat_attrs[] __ro_after_init = {
+ &dev_attr_rx_packets.attr,
+ &dev_attr_tx_packets.attr,
+ &dev_attr_rx_bytes.attr,
+ &dev_attr_tx_bytes.attr,
+ &dev_attr_rx_errors.attr,
+ &dev_attr_tx_errors.attr,
+ &dev_attr_rx_dropped.attr,
+ &dev_attr_tx_dropped.attr,
+ &dev_attr_multicast.attr,
+ &dev_attr_collisions.attr,
+ &dev_attr_rx_length_errors.attr,
+ &dev_attr_rx_over_errors.attr,
+ &dev_attr_rx_crc_errors.attr,
+ &dev_attr_rx_frame_errors.attr,
+ &dev_attr_rx_fifo_errors.attr,
+ &dev_attr_rx_missed_errors.attr,
+ &dev_attr_tx_aborted_errors.attr,
+ &dev_attr_tx_carrier_errors.attr,
+ &dev_attr_tx_fifo_errors.attr,
+ &dev_attr_tx_heartbeat_errors.attr,
+ &dev_attr_tx_window_errors.attr,
+ &dev_attr_rx_compressed.attr,
+ &dev_attr_tx_compressed.attr,
+ &dev_attr_rx_nohandler.attr,
+ NULL
+};
+
+static const struct attribute_group netstat_group = {
+ .name = "statistics",
+ .attrs = netstat_attrs,
+};
+
+#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)
+static struct attribute *wireless_attrs[] = {
+ NULL
+};
+
+static const struct attribute_group wireless_group = {
+ .name = "wireless",
+ .attrs = wireless_attrs,
+};
+#endif
+
+#else /* CONFIG_SYSFS */
+#define net_class_groups NULL
+#endif /* CONFIG_SYSFS */
+
+#ifdef CONFIG_SYSFS
+#define to_rx_queue_attr(_attr) \
+ container_of(_attr, struct rx_queue_attribute, attr)
+
+#define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj)
+
+static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ const struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
+ struct netdev_rx_queue *queue = to_rx_queue(kobj);
+
+ if (!attribute->show)
+ return -EIO;
+
+ return attribute->show(queue, buf);
+}
+
+static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ const struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
+ struct netdev_rx_queue *queue = to_rx_queue(kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ return attribute->store(queue, buf, count);
+}
+
+static const struct sysfs_ops rx_queue_sysfs_ops = {
+ .show = rx_queue_attr_show,
+ .store = rx_queue_attr_store,
+};
+
+#ifdef CONFIG_RPS
+static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf)
+{
+ struct rps_map *map;
+ cpumask_var_t mask;
+ int i, len;
+
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ rcu_read_lock();
+ map = rcu_dereference(queue->rps_map);
+ if (map)
+ for (i = 0; i < map->len; i++)
+ cpumask_set_cpu(map->cpus[i], mask);
+
+ len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
+ rcu_read_unlock();
+ free_cpumask_var(mask);
+
+ return len < PAGE_SIZE ? len : -EINVAL;
+}
+
+static ssize_t store_rps_map(struct netdev_rx_queue *queue,
+ const char *buf, size_t len)
+{
+ struct rps_map *old_map, *map;
+ cpumask_var_t mask;
+ int err, cpu, i;
+ static DEFINE_MUTEX(rps_map_mutex);
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+ if (err) {
+ free_cpumask_var(mask);
+ return err;
+ }
+
+ map = kzalloc(max_t(unsigned int,
+ RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
+ GFP_KERNEL);
+ if (!map) {
+ free_cpumask_var(mask);
+ return -ENOMEM;
+ }
+
+ i = 0;
+ for_each_cpu_and(cpu, mask, cpu_online_mask)
+ map->cpus[i++] = cpu;
+
+ if (i) {
+ map->len = i;
+ } else {
+ kfree(map);
+ map = NULL;
+ }
+
+ mutex_lock(&rps_map_mutex);
+ old_map = rcu_dereference_protected(queue->rps_map,
+ mutex_is_locked(&rps_map_mutex));
+ rcu_assign_pointer(queue->rps_map, map);
+
+ if (map)
+ static_key_slow_inc(&rps_needed);
+ if (old_map)
+ static_key_slow_dec(&rps_needed);
+
+ mutex_unlock(&rps_map_mutex);
+
+ if (old_map)
+ kfree_rcu(old_map, rcu);
+
+ free_cpumask_var(mask);
+ return len;
+}
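+
+/* Example (interface and queue names illustrative): writing "e" to
+ * /sys/class/net/eth0/queues/rx-0/rps_cpus steers packets from that queue to
+ * CPUs 1-3, while writing "0" clears the map and, if one was installed,
+ * drops this queue's reference on the rps_needed static key.
+ */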
+
+static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+ char *buf)
+{
+ struct rps_dev_flow_table *flow_table;
+ unsigned long val = 0;
+
+ rcu_read_lock();
+ flow_table = rcu_dereference(queue->rps_flow_table);
+ if (flow_table)
+ val = (unsigned long)flow_table->mask + 1;
+ rcu_read_unlock();
+
+ return sprintf(buf, "%lu\n", val);
+}
+
+static void rps_dev_flow_table_release(struct rcu_head *rcu)
+{
+ struct rps_dev_flow_table *table = container_of(rcu,
+ struct rps_dev_flow_table, rcu);
+ vfree(table);
+}
+
+static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+ const char *buf, size_t len)
+{
+ unsigned long mask, count;
+ struct rps_dev_flow_table *table, *old_table;
+ static DEFINE_SPINLOCK(rps_dev_flow_lock);
+ int rc;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ rc = kstrtoul(buf, 0, &count);
+ if (rc < 0)
+ return rc;
+
+ if (count) {
+ mask = count - 1;
+ /* mask = roundup_pow_of_two(count) - 1;
+ * without overflows...
+ */
+ while ((mask | (mask >> 1)) != mask)
+ mask |= (mask >> 1);
+ /* On 64 bit arches, must check mask fits in table->mask (u32),
+ * and on 32bit arches, must check
+ * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow.
+ */
+#if BITS_PER_LONG > 32
+ if (mask > (unsigned long)(u32)mask)
+ return -EINVAL;
+#else
+ if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1))
+ / sizeof(struct rps_dev_flow)) {
+ /* Enforce a limit to prevent overflow */
+ return -EINVAL;
+ }
+#endif
+ table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1));
+ if (!table)
+ return -ENOMEM;
+
+ table->mask = mask;
+ for (count = 0; count <= mask; count++)
+ table->flows[count].cpu = RPS_NO_CPU;
+ } else {
+ table = NULL;
+ }
+
+ spin_lock(&rps_dev_flow_lock);
+ old_table = rcu_dereference_protected(queue->rps_flow_table,
+ lockdep_is_held(&rps_dev_flow_lock));
+ rcu_assign_pointer(queue->rps_flow_table, table);
+ spin_unlock(&rps_dev_flow_lock);
+
+ if (old_table)
+ call_rcu(&old_table->rcu, rps_dev_flow_table_release);
+
+ return len;
+}
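+
+/* rps_flow_cnt takes the desired number of flow entries and rounds it up to a
+ * power of two: e.g. writing 5000 allocates an 8192-entry table, while
+ * writing 0 tears the table down.
+ */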
+
+static struct rx_queue_attribute rps_cpus_attribute __ro_after_init
+ = __ATTR(rps_cpus, 0644, show_rps_map, store_rps_map);
+
+static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute __ro_after_init
+ = __ATTR(rps_flow_cnt, 0644,
+ show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
+#endif /* CONFIG_RPS */
+
+static struct attribute *rx_queue_default_attrs[] __ro_after_init = {
+#ifdef CONFIG_RPS
+ &rps_cpus_attribute.attr,
+ &rps_dev_flow_table_cnt_attribute.attr,
+#endif
+ NULL
+};
+
+static void rx_queue_release(struct kobject *kobj)
+{
+ struct netdev_rx_queue *queue = to_rx_queue(kobj);
+#ifdef CONFIG_RPS
+ struct rps_map *map;
+ struct rps_dev_flow_table *flow_table;
+
+ map = rcu_dereference_protected(queue->rps_map, 1);
+ if (map) {
+ RCU_INIT_POINTER(queue->rps_map, NULL);
+ kfree_rcu(map, rcu);
+ }
+
+ flow_table = rcu_dereference_protected(queue->rps_flow_table, 1);
+ if (flow_table) {
+ RCU_INIT_POINTER(queue->rps_flow_table, NULL);
+ call_rcu(&flow_table->rcu, rps_dev_flow_table_release);
+ }
+#endif
+
+ memset(kobj, 0, sizeof(*kobj));
+ dev_put(queue->dev);
+}
+
+static const void *rx_queue_namespace(struct kobject *kobj)
+{
+ struct netdev_rx_queue *queue = to_rx_queue(kobj);
+ struct device *dev = &queue->dev->dev;
+ const void *ns = NULL;
+
+ if (dev->class && dev->class->ns_type)
+ ns = dev->class->namespace(dev);
+
+ return ns;
+}
+
+static void rx_queue_get_ownership(struct kobject *kobj,
+ kuid_t *uid, kgid_t *gid)
+{
+ const struct net *net = rx_queue_namespace(kobj);
+
+ net_ns_get_ownership(net, uid, gid);
+}
+
+static struct kobj_type rx_queue_ktype __ro_after_init = {
+ .sysfs_ops = &rx_queue_sysfs_ops,
+ .release = rx_queue_release,
+ .default_attrs = rx_queue_default_attrs,
+ .namespace = rx_queue_namespace,
+ .get_ownership = rx_queue_get_ownership,
+};
+
+static int rx_queue_add_kobject(struct net_device *dev, int index)
+{
+ struct netdev_rx_queue *queue = dev->_rx + index;
+ struct kobject *kobj = &queue->kobj;
+ int error = 0;
+
+ /* Kobject_put later will trigger rx_queue_release call which
+ * decreases dev refcount: Take that reference here
+ */
+ dev_hold(queue->dev);
+
+ kobj->kset = dev->queues_kset;
+ error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
+ "rx-%u", index);
+ if (error)
+ goto err;
+
+ if (dev->sysfs_rx_queue_group) {
+ error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group);
+ if (error)
+ goto err;
+ }
+
+ kobject_uevent(kobj, KOBJ_ADD);
+
+ return error;
+
+err:
+ kobject_put(kobj);
+ return error;
+}
+#endif /* CONFIG_SYSFS */
+
+int
+net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
+{
+#ifdef CONFIG_SYSFS
+ int i;
+ int error = 0;
+
+#ifndef CONFIG_RPS
+ if (!dev->sysfs_rx_queue_group)
+ return 0;
+#endif
+ for (i = old_num; i < new_num; i++) {
+ error = rx_queue_add_kobject(dev, i);
+ if (error) {
+ new_num = old_num;
+ break;
+ }
+ }
+
+ while (--i >= new_num) {
+ struct kobject *kobj = &dev->_rx[i].kobj;
+
+ if (!refcount_read(&dev_net(dev)->count))
+ kobj->uevent_suppress = 1;
+ if (dev->sysfs_rx_queue_group)
+ sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
+ kobject_put(kobj);
+ }
+
+ return error;
+#else
+ return 0;
+#endif
+}
+
+#ifdef CONFIG_SYSFS
+/*
+ * netdev_queue sysfs structures and functions.
+ */
+struct netdev_queue_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct netdev_queue *queue, char *buf);
+ ssize_t (*store)(struct netdev_queue *queue,
+ const char *buf, size_t len);
+};
+#define to_netdev_queue_attr(_attr) \
+ container_of(_attr, struct netdev_queue_attribute, attr)
+
+#define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj)
+
+static ssize_t netdev_queue_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ const struct netdev_queue_attribute *attribute
+ = to_netdev_queue_attr(attr);
+ struct netdev_queue *queue = to_netdev_queue(kobj);
+
+ if (!attribute->show)
+ return -EIO;
+
+ return attribute->show(queue, buf);
+}
+
+static ssize_t netdev_queue_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t count)
+{
+ const struct netdev_queue_attribute *attribute
+ = to_netdev_queue_attr(attr);
+ struct netdev_queue *queue = to_netdev_queue(kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ return attribute->store(queue, buf, count);
+}
+
+static const struct sysfs_ops netdev_queue_sysfs_ops = {
+ .show = netdev_queue_attr_show,
+ .store = netdev_queue_attr_store,
+};
+
+static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf)
+{
+ unsigned long trans_timeout;
+
+ spin_lock_irq(&queue->_xmit_lock);
+ trans_timeout = queue->trans_timeout;
+ spin_unlock_irq(&queue->_xmit_lock);
+
+ return sprintf(buf, fmt_ulong, trans_timeout);
+}
+
+static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
+{
+ struct net_device *dev = queue->dev;
+ unsigned int i;
+
+ i = queue - dev->_tx;
+ BUG_ON(i >= dev->num_tx_queues);
+
+ return i;
+}
+
+static ssize_t traffic_class_show(struct netdev_queue *queue,
+ char *buf)
+{
+ struct net_device *dev = queue->dev;
+ int index;
+ int tc;
+
+ if (!netif_is_multiqueue(dev))
+ return -ENOENT;
+
+ index = get_netdev_queue_index(queue);
+
+ /* If queue belongs to subordinate dev use its TC mapping */
+ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
+ tc = netdev_txq_to_tc(dev, index);
+ if (tc < 0)
+ return -EINVAL;
+
+ /* We can report the traffic class one of two ways:
+ * Subordinate device traffic classes are reported with the traffic
+ * class first, and then the subordinate class so for example TC0 on
+ * subordinate device 2 will be reported as "0-2". If the queue
+ * belongs to the root device it will be reported with just the
+ * traffic class, so just "0" for TC 0 for example.
+ */
+ return dev->num_tc < 0 ? sprintf(buf, "%u%d\n", tc, dev->num_tc) :
+ sprintf(buf, "%u\n", tc);
+}
+
+#ifdef CONFIG_XPS
+static ssize_t tx_maxrate_show(struct netdev_queue *queue,
+ char *buf)
+{
+ return sprintf(buf, "%lu\n", queue->tx_maxrate);
+}
+
+static ssize_t tx_maxrate_store(struct netdev_queue *queue,
+ const char *buf, size_t len)
+{
+ struct net_device *dev = queue->dev;
+ int err, index = get_netdev_queue_index(queue);
+ u32 rate = 0;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ err = kstrtou32(buf, 10, &rate);
+ if (err < 0)
+ return err;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ err = -EOPNOTSUPP;
+ if (dev->netdev_ops->ndo_set_tx_maxrate)
+ err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate);
+
+ rtnl_unlock();
+ if (!err) {
+ queue->tx_maxrate = rate;
+ return len;
+ }
+ return err;
+}
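+
+/* The value is handed straight to ndo_set_tx_maxrate(); drivers conventionally
+ * interpret it in Mb/s, and 0 is typically treated as "no limit".
+ */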
+
+static struct netdev_queue_attribute queue_tx_maxrate __ro_after_init
+ = __ATTR_RW(tx_maxrate);
+#endif
+
+static struct netdev_queue_attribute queue_trans_timeout __ro_after_init
+ = __ATTR_RO(tx_timeout);
+
+static struct netdev_queue_attribute queue_traffic_class __ro_after_init
+ = __ATTR_RO(traffic_class);
+
+#ifdef CONFIG_BQL
+/*
+ * Byte queue limits sysfs structures and functions.
+ */
+static ssize_t bql_show(char *buf, unsigned int value)
+{
+ return sprintf(buf, "%u\n", value);
+}
+
+static ssize_t bql_set(const char *buf, const size_t count,
+ unsigned int *pvalue)
+{
+ unsigned int value;
+ int err;
+
+ if (!strcmp(buf, "max") || !strcmp(buf, "max\n")) {
+ value = DQL_MAX_LIMIT;
+ } else {
+ err = kstrtouint(buf, 10, &value);
+ if (err < 0)
+ return err;
+ if (value > DQL_MAX_LIMIT)
+ return -EINVAL;
+ }
+
+ *pvalue = value;
+
+ return count;
+}
+
+static ssize_t bql_show_hold_time(struct netdev_queue *queue,
+ char *buf)
+{
+ struct dql *dql = &queue->dql;
+
+ return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time));
+}
+
+static ssize_t bql_set_hold_time(struct netdev_queue *queue,
+ const char *buf, size_t len)
+{
+ struct dql *dql = &queue->dql;
+ unsigned int value;
+ int err;
+
+ err = kstrtouint(buf, 10, &value);
+ if (err < 0)
+ return err;
+
+ dql->slack_hold_time = msecs_to_jiffies(value);
+
+ return len;
+}
+
+static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init
+ = __ATTR(hold_time, 0644,
+ bql_show_hold_time, bql_set_hold_time);
+
+static ssize_t bql_show_inflight(struct netdev_queue *queue,
+ char *buf)
+{
+ struct dql *dql = &queue->dql;
+
+ return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed);
+}
+
+static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init =
+ __ATTR(inflight, 0444, bql_show_inflight, NULL);
+
+#define BQL_ATTR(NAME, FIELD) \
+static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \
+ char *buf) \
+{ \
+ return bql_show(buf, queue->dql.FIELD); \
+} \
+ \
+static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \
+ const char *buf, size_t len) \
+{ \
+ return bql_set(buf, len, &queue->dql.FIELD); \
+} \
+ \
+static struct netdev_queue_attribute bql_ ## NAME ## _attribute __ro_after_init \
+ = __ATTR(NAME, 0644, \
+ bql_show_ ## NAME, bql_set_ ## NAME)
+
+BQL_ATTR(limit, limit);
+BQL_ATTR(limit_max, max_limit);
+BQL_ATTR(limit_min, min_limit);
+
+static struct attribute *dql_attrs[] __ro_after_init = {
+ &bql_limit_attribute.attr,
+ &bql_limit_max_attribute.attr,
+ &bql_limit_min_attribute.attr,
+ &bql_hold_time_attribute.attr,
+ &bql_inflight_attribute.attr,
+ NULL
+};
+
+static const struct attribute_group dql_group = {
+ .name = "byte_queue_limits",
+ .attrs = dql_attrs,
+};
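+
+/* These files appear under queues/tx-<n>/byte_queue_limits/ in the device's
+ * sysfs directory. Writing "max" to limit, limit_max or limit_min stores
+ * DQL_MAX_LIMIT, and inflight reports num_queued - num_completed bytes.
+ */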
+#endif /* CONFIG_BQL */
+
+#ifdef CONFIG_XPS
+static ssize_t xps_cpus_show(struct netdev_queue *queue,
+ char *buf)
+{
+ int cpu, len, ret, num_tc = 1, tc = 0;
+ struct net_device *dev = queue->dev;
+ struct xps_dev_maps *dev_maps;
+ cpumask_var_t mask;
+ unsigned long index;
+
+ if (!netif_is_multiqueue(dev))
+ return -ENOENT;
+
+ index = get_netdev_queue_index(queue);
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev->num_tc) {
+ /* Do not allow XPS on subordinate device directly */
+ num_tc = dev->num_tc;
+ if (num_tc < 0) {
+ ret = -EINVAL;
+ goto err_rtnl_unlock;
+ }
+
+ /* If queue belongs to subordinate dev use its map */
+ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
+ tc = netdev_txq_to_tc(dev, index);
+ if (tc < 0) {
+ ret = -EINVAL;
+ goto err_rtnl_unlock;
+ }
+ }
+
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto err_rtnl_unlock;
+ }
+
+ rcu_read_lock();
+ dev_maps = rcu_dereference(dev->xps_cpus_map);
+ if (dev_maps) {
+ for_each_possible_cpu(cpu) {
+ int i, tci = cpu * num_tc + tc;
+ struct xps_map *map;
+
+ map = rcu_dereference(dev_maps->attr_map[tci]);
+ if (!map)
+ continue;
+
+ for (i = map->len; i--;) {
+ if (map->queues[i] == index) {
+ cpumask_set_cpu(cpu, mask);
+ break;
+ }
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ rtnl_unlock();
+
+ len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
+ free_cpumask_var(mask);
+ return len < PAGE_SIZE ? len : -EINVAL;
+
+err_rtnl_unlock:
+ rtnl_unlock();
+ return ret;
+}
+
+static ssize_t xps_cpus_store(struct netdev_queue *queue,
+ const char *buf, size_t len)
+{
+ struct net_device *dev = queue->dev;
+ unsigned long index;
+ cpumask_var_t mask;
+ int err;
+
+ if (!netif_is_multiqueue(dev))
+ return -ENOENT;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ index = get_netdev_queue_index(queue);
+
+ err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+ if (err) {
+ free_cpumask_var(mask);
+ return err;
+ }
+
+ if (!rtnl_trylock()) {
+ free_cpumask_var(mask);
+ return restart_syscall();
+ }
+
+ err = netif_set_xps_queue(dev, mask, index);
+ rtnl_unlock();
+
+ free_cpumask_var(mask);
+
+ return err ? : len;
+}
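+
+/* Example (interface name illustrative): `echo 3 > /sys/class/net/eth0/queues/tx-0/xps_cpus`
+ * tells the stack to prefer tx-0 for packets generated on CPUs 0 and 1, via
+ * netif_set_xps_queue(). xps_rxqs below works the same way but takes a bitmap
+ * of receive queues instead of CPUs.
+ */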
+
+static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
+ = __ATTR_RW(xps_cpus);
+
+static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
+{
+ int j, len, ret, num_tc = 1, tc = 0;
+ struct net_device *dev = queue->dev;
+ struct xps_dev_maps *dev_maps;
+ unsigned long *mask, index;
+
+ index = get_netdev_queue_index(queue);
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev->num_tc) {
+ num_tc = dev->num_tc;
+ tc = netdev_txq_to_tc(dev, index);
+ if (tc < 0) {
+ ret = -EINVAL;
+ goto err_rtnl_unlock;
+ }
+ }
+ mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
+ GFP_KERNEL);
+ if (!mask) {
+ ret = -ENOMEM;
+ goto err_rtnl_unlock;
+ }
+
+ rcu_read_lock();
+ dev_maps = rcu_dereference(dev->xps_rxqs_map);
+ if (!dev_maps)
+ goto out_no_maps;
+
+ for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues),
+ j < dev->num_rx_queues;) {
+ int i, tci = j * num_tc + tc;
+ struct xps_map *map;
+
+ map = rcu_dereference(dev_maps->attr_map[tci]);
+ if (!map)
+ continue;
+
+ for (i = map->len; i--;) {
+ if (map->queues[i] == index) {
+ set_bit(j, mask);
+ break;
+ }
+ }
+ }
+out_no_maps:
+ rcu_read_unlock();
+
+ rtnl_unlock();
+
+ len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
+ kfree(mask);
+
+ return len < PAGE_SIZE ? len : -EINVAL;
+
+err_rtnl_unlock:
+ rtnl_unlock();
+ return ret;
+}
+
+static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
+ size_t len)
+{
+ struct net_device *dev = queue->dev;
+ struct net *net = dev_net(dev);
+ unsigned long *mask, index;
+ int err;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
+ GFP_KERNEL);
+ if (!mask)
+ return -ENOMEM;
+
+ index = get_netdev_queue_index(queue);
+
+ err = bitmap_parse(buf, len, mask, dev->num_rx_queues);
+ if (err) {
+ kfree(mask);
+ return err;
+ }
+
+ if (!rtnl_trylock()) {
+ bitmap_free(mask);
+ return restart_syscall();
+ }
+
+ cpus_read_lock();
+ err = __netif_set_xps_queue(dev, mask, index, true);
+ cpus_read_unlock();
+
+ rtnl_unlock();
+
+ kfree(mask);
+ return err ? : len;
+}
+
+static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init
+ = __ATTR_RW(xps_rxqs);
+#endif /* CONFIG_XPS */
+
+static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
+ &queue_trans_timeout.attr,
+ &queue_traffic_class.attr,
+#ifdef CONFIG_XPS
+ &xps_cpus_attribute.attr,
+ &xps_rxqs_attribute.attr,
+ &queue_tx_maxrate.attr,
+#endif
+ NULL
+};
+
+static void netdev_queue_release(struct kobject *kobj)
+{
+ struct netdev_queue *queue = to_netdev_queue(kobj);
+
+ memset(kobj, 0, sizeof(*kobj));
+ dev_put(queue->dev);
+}
+
+static const void *netdev_queue_namespace(struct kobject *kobj)
+{
+ struct netdev_queue *queue = to_netdev_queue(kobj);
+ struct device *dev = &queue->dev->dev;
+ const void *ns = NULL;
+
+ if (dev->class && dev->class->ns_type)
+ ns = dev->class->namespace(dev);
+
+ return ns;
+}
+
+static void netdev_queue_get_ownership(struct kobject *kobj,
+ kuid_t *uid, kgid_t *gid)
+{
+ const struct net *net = netdev_queue_namespace(kobj);
+
+ net_ns_get_ownership(net, uid, gid);
+}
+
+static struct kobj_type netdev_queue_ktype __ro_after_init = {
+ .sysfs_ops = &netdev_queue_sysfs_ops,
+ .release = netdev_queue_release,
+ .default_attrs = netdev_queue_default_attrs,
+ .namespace = netdev_queue_namespace,
+ .get_ownership = netdev_queue_get_ownership,
+};
+
+static int netdev_queue_add_kobject(struct net_device *dev, int index)
+{
+ struct netdev_queue *queue = dev->_tx + index;
+ struct kobject *kobj = &queue->kobj;
+ int error = 0;
+
+ /* Kobject_put later will trigger netdev_queue_release call
+ * which decreases dev refcount: Take that reference here
+ */
+ dev_hold(queue->dev);
+
+ kobj->kset = dev->queues_kset;
+ error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
+ "tx-%u", index);
+ if (error)
+ goto err;
+
+#ifdef CONFIG_BQL
+ error = sysfs_create_group(kobj, &dql_group);
+ if (error)
+ goto err;
+#endif
+
+ kobject_uevent(kobj, KOBJ_ADD);
+ return 0;
+
+err:
+ kobject_put(kobj);
+ return error;
+}
+#endif /* CONFIG_SYSFS */
+
+int
+netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
+{
+#ifdef CONFIG_SYSFS
+ int i;
+ int error = 0;
+
+ for (i = old_num; i < new_num; i++) {
+ error = netdev_queue_add_kobject(dev, i);
+ if (error) {
+ new_num = old_num;
+ break;
+ }
+ }
+
+ while (--i >= new_num) {
+ struct netdev_queue *queue = dev->_tx + i;
+
+ if (!refcount_read(&dev_net(dev)->count))
+ queue->kobj.uevent_suppress = 1;
+#ifdef CONFIG_BQL
+ sysfs_remove_group(&queue->kobj, &dql_group);
+#endif
+ kobject_put(&queue->kobj);
+ }
+
+ return error;
+#else
+ return 0;
+#endif /* CONFIG_SYSFS */
+}
+
+static int register_queue_kobjects(struct net_device *dev)
+{
+ int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0;
+
+#ifdef CONFIG_SYSFS
+ dev->queues_kset = kset_create_and_add("queues",
+ NULL, &dev->dev.kobj);
+ if (!dev->queues_kset)
+ return -ENOMEM;
+ real_rx = dev->real_num_rx_queues;
+#endif
+ real_tx = dev->real_num_tx_queues;
+
+ error = net_rx_queue_update_kobjects(dev, 0, real_rx);
+ if (error)
+ goto error;
+ rxq = real_rx;
+
+ error = netdev_queue_update_kobjects(dev, 0, real_tx);
+ if (error)
+ goto error;
+ txq = real_tx;
+
+ return 0;
+
+error:
+ netdev_queue_update_kobjects(dev, txq, 0);
+ net_rx_queue_update_kobjects(dev, rxq, 0);
+#ifdef CONFIG_SYSFS
+ kset_unregister(dev->queues_kset);
+#endif
+ return error;
+}
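+
+/* Resulting sysfs layout for a device with one queue pair (paths relative to
+ * the netdev's /sys/class/net/<iface>/ directory, names illustrative):
+ *
+ *   queues/rx-0/{rps_cpus,rps_flow_cnt}                       (CONFIG_RPS)
+ *   queues/tx-0/{tx_timeout,traffic_class}
+ *   queues/tx-0/{xps_cpus,xps_rxqs,tx_maxrate}                (CONFIG_XPS)
+ *   queues/tx-0/byte_queue_limits/{limit,limit_max,limit_min,
+ *                                  hold_time,inflight}        (CONFIG_BQL)
+ */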
+
+static void remove_queue_kobjects(struct net_device *dev)
+{
+ int real_rx = 0, real_tx = 0;
+
+#ifdef CONFIG_SYSFS
+ real_rx = dev->real_num_rx_queues;
+#endif
+ real_tx = dev->real_num_tx_queues;
+
+ net_rx_queue_update_kobjects(dev, real_rx, 0);
+ netdev_queue_update_kobjects(dev, real_tx, 0);
+
+ dev->real_num_rx_queues = 0;
+ dev->real_num_tx_queues = 0;
+#ifdef CONFIG_SYSFS
+ kset_unregister(dev->queues_kset);
+#endif
+}
+
+static bool net_current_may_mount(void)
+{
+ struct net *net = current->nsproxy->net_ns;
+
+ return ns_capable(net->user_ns, CAP_SYS_ADMIN);
+}
+
+static void *net_grab_current_ns(void)
+{
+ struct net *ns = current->nsproxy->net_ns;
+#ifdef CONFIG_NET_NS
+ if (ns)
+ refcount_inc(&ns->passive);
+#endif
+ return ns;
+}
+
+static const void *net_initial_ns(void)
+{
+ return &init_net;
+}
+
+static const void *net_netlink_ns(struct sock *sk)
+{
+ return sock_net(sk);
+}
+
+const struct kobj_ns_type_operations net_ns_type_operations = {
+ .type = KOBJ_NS_TYPE_NET,
+ .current_may_mount = net_current_may_mount,
+ .grab_current_ns = net_grab_current_ns,
+ .netlink_ns = net_netlink_ns,
+ .initial_ns = net_initial_ns,
+ .drop_ns = net_drop_ns,
+};
+EXPORT_SYMBOL_GPL(net_ns_type_operations);
+
+static int netdev_uevent(struct device *d, struct kobj_uevent_env *env)
+{
+ struct net_device *dev = to_net_dev(d);
+ int retval;
+
+ /* pass interface to uevent. */
+ retval = add_uevent_var(env, "INTERFACE=%s", dev->name);
+ if (retval)
+ goto exit;
+
+ /* pass ifindex to uevent.
+ * ifindex is useful as it won't change (interface name may change)
+ * and is what RtNetlink uses natively.
+ */
+ retval = add_uevent_var(env, "IFINDEX=%d", dev->ifindex);
+
+exit:
+ return retval;
+}
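+
+/* e.g. a uevent for a device named eth0 with ifindex 2 carries
+ * INTERFACE=eth0 and IFINDEX=2 in its environment (name illustrative).
+ */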
+
+/*
+ * netdev_release -- destroy and free a dead device.
+ * Called when last reference to device kobject is gone.
+ */
+static void netdev_release(struct device *d)
+{
+ struct net_device *dev = to_net_dev(d);
+
+ BUG_ON(dev->reg_state != NETREG_RELEASED);
+
+ /* no need to wait for rcu grace period:
+ * device is dead and about to be freed.
+ */
+ kfree(rcu_access_pointer(dev->ifalias));
+ netdev_freemem(dev);
+}
+
+static const void *net_namespace(struct device *d)
+{
+ struct net_device *dev = to_net_dev(d);
+
+ return dev_net(dev);
+}
+
+static void net_get_ownership(struct device *d, kuid_t *uid, kgid_t *gid)
+{
+ struct net_device *dev = to_net_dev(d);
+ const struct net *net = dev_net(dev);
+
+ net_ns_get_ownership(net, uid, gid);
+}
+
+static struct class net_class __ro_after_init = {
+ .name = "net",
+ .dev_release = netdev_release,
+ .dev_groups = net_class_groups,
+ .dev_uevent = netdev_uevent,
+ .ns_type = &net_ns_type_operations,
+ .namespace = net_namespace,
+ .get_ownership = net_get_ownership,
+};
+
+#ifdef CONFIG_OF_NET
+static int of_dev_node_match(struct device *dev, const void *data)
+{
+ int ret = 0;
+
+ if (dev->parent)
+ ret = dev->parent->of_node == data;
+
+ return ret == 0 ? dev->of_node == data : ret;
+}
+
+/*
+ * of_find_net_device_by_node - lookup the net device for the device node
+ * @np: OF device node
+ *
+ * Looks up the net_device structure corresponding with the device node.
+ * If successful, returns a pointer to the net_device with the embedded
+ * struct device refcount incremented by one, or NULL on failure. The
+ * refcount must be dropped when done with the net_device.
+ */
+struct net_device *of_find_net_device_by_node(struct device_node *np)
+{
+ struct device *dev;
+
+ dev = class_find_device(&net_class, NULL, np, of_dev_node_match);
+ if (!dev)
+ return NULL;
+
+ return to_net_dev(dev);
+}
+EXPORT_SYMBOL(of_find_net_device_by_node);
+#endif
+
+/* Delete sysfs entries but hold kobject reference until after all
+ * netdev references are gone.
+ */
+void netdev_unregister_kobject(struct net_device *ndev)
+{
+ struct device *dev = &ndev->dev;
+
+ if (!refcount_read(&dev_net(ndev)->count))
+ dev_set_uevent_suppress(dev, 1);
+
+ kobject_get(&dev->kobj);
+
+ remove_queue_kobjects(ndev);
+
+ pm_runtime_set_memalloc_noio(dev, false);
+
+ device_del(dev);
+}
+
+/* Create sysfs entries for network device. */
+int netdev_register_kobject(struct net_device *ndev)
+{
+ struct device *dev = &ndev->dev;
+ const struct attribute_group **groups = ndev->sysfs_groups;
+ int error = 0;
+
+ device_initialize(dev);
+ dev->class = &net_class;
+ dev->platform_data = ndev;
+ dev->groups = groups;
+
+ dev_set_name(dev, "%s", ndev->name);
+
+#ifdef CONFIG_SYSFS
+ /* Allow for a device specific group */
+ if (*groups)
+ groups++;
+
+ *groups++ = &netstat_group;
+
+#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)
+ if (ndev->ieee80211_ptr)
+ *groups++ = &wireless_group;
+#if IS_ENABLED(CONFIG_WIRELESS_EXT)
+ else if (ndev->wireless_handlers)
+ *groups++ = &wireless_group;
+#endif
+#endif
+#endif /* CONFIG_SYSFS */
+
+ error = device_add(dev);
+ if (error)
+ return error;
+
+ error = register_queue_kobjects(ndev);
+ if (error) {
+ device_del(dev);
+ return error;
+ }
+
+ pm_runtime_set_memalloc_noio(dev, true);
+
+ return error;
+}
+
+int netdev_class_create_file_ns(const struct class_attribute *class_attr,
+ const void *ns)
+{
+ return class_create_file_ns(&net_class, class_attr, ns);
+}
+EXPORT_SYMBOL(netdev_class_create_file_ns);
+
+void netdev_class_remove_file_ns(const struct class_attribute *class_attr,
+ const void *ns)
+{
+ class_remove_file_ns(&net_class, class_attr, ns);
+}
+EXPORT_SYMBOL(netdev_class_remove_file_ns);
+
+int __init netdev_kobject_init(void)
+{
+ kobj_ns_type_register(&net_ns_type_operations);
+ return class_register(&net_class);
+}
diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h
new file mode 100644
index 000000000..006876c7b
--- /dev/null
+++ b/net/core/net-sysfs.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NET_SYSFS_H__
+#define __NET_SYSFS_H__
+
+int __init netdev_kobject_init(void);
+int netdev_register_kobject(struct net_device *);
+void netdev_unregister_kobject(struct net_device *);
+int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num);
+int netdev_queue_update_kobjects(struct net_device *net,
+ int old_num, int new_num);
+
+#endif
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
new file mode 100644
index 000000000..419af6dfe
--- /dev/null
+++ b/net/core/net-traces.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * consolidates trace point definitions
+ *
+ * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/string.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <linux/inet.h>
+#include <linux/interrupt.h>
+#include <linux/export.h>
+#include <linux/netpoll.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/rcupdate.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/netlink.h>
+#include <linux/net_dropmon.h>
+#include <linux/slab.h>
+
+#include <asm/unaligned.h>
+#include <asm/bitops.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/skb.h>
+#include <trace/events/net.h>
+#include <trace/events/napi.h>
+#include <trace/events/sock.h>
+#include <trace/events/udp.h>
+#include <trace/events/tcp.h>
+#include <trace/events/fib.h>
+#include <trace/events/qdisc.h>
+#if IS_ENABLED(CONFIG_BRIDGE)
+#include <trace/events/bridge.h>
+EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add);
+EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_external_learn_add);
+EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete);
+EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update);
+#endif
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
new file mode 100644
index 000000000..3368624be
--- /dev/null
+++ b/net/core/net_namespace.c
@@ -0,0 +1,1195 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/workqueue.h>
+#include <linux/rtnetlink.h>
+#include <linux/cache.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/idr.h>
+#include <linux/rculist.h>
+#include <linux/nsproxy.h>
+#include <linux/fs.h>
+#include <linux/proc_ns.h>
+#include <linux/file.h>
+#include <linux/export.h>
+#include <linux/user_namespace.h>
+#include <linux/net_namespace.h>
+#include <linux/sched/task.h>
+#include <linux/uidgid.h>
+
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+/*
+ * Our network namespace constructor/destructor lists
+ */
+
+static LIST_HEAD(pernet_list);
+static struct list_head *first_device = &pernet_list;
+
+LIST_HEAD(net_namespace_list);
+EXPORT_SYMBOL_GPL(net_namespace_list);
+
+/* Protects net_namespace_list. Nests inside rtnl_lock() */
+DECLARE_RWSEM(net_rwsem);
+EXPORT_SYMBOL_GPL(net_rwsem);
+
+struct net init_net = {
+ .count = REFCOUNT_INIT(1),
+ .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
+};
+EXPORT_SYMBOL(init_net);
+
+static bool init_net_initialized;
+/*
+ * pernet_ops_rwsem: protects: pernet_list, net_generic_ids,
+ * init_net_initialized and first_device pointer.
+ * This is an internal net namespace object. Please don't use it
+ * outside.
+ */
+DECLARE_RWSEM(pernet_ops_rwsem);
+EXPORT_SYMBOL_GPL(pernet_ops_rwsem);
+
+#define MIN_PERNET_OPS_ID \
+ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
+
+#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */
+
+static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
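+
+/* MIN_PERNET_OPS_ID is the number of pointer-sized slots occupied by the
+ * struct net_generic header (s.len and s.rcu); net_assign_generic() rejects
+ * any id below it, so ptr[] indexing never lands in the header.
+ */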
+
+static struct net_generic *net_alloc_generic(void)
+{
+ struct net_generic *ng;
+ unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
+
+ ng = kzalloc(generic_size, GFP_KERNEL);
+ if (ng)
+ ng->s.len = max_gen_ptrs;
+
+ return ng;
+}
+
+static int net_assign_generic(struct net *net, unsigned int id, void *data)
+{
+ struct net_generic *ng, *old_ng;
+
+ BUG_ON(id < MIN_PERNET_OPS_ID);
+
+ old_ng = rcu_dereference_protected(net->gen,
+ lockdep_is_held(&pernet_ops_rwsem));
+ if (old_ng->s.len > id) {
+ old_ng->ptr[id] = data;
+ return 0;
+ }
+
+ ng = net_alloc_generic();
+ if (ng == NULL)
+ return -ENOMEM;
+
+ /*
+ * Some synchronisation notes:
+ *
+	 * net_generic() reads the net->gen array inside an RCU
+	 * read-side section. Besides, once set, the net->gen->ptr[x]
+	 * pointer never changes (see the rules in netns/generic.h).
+ *
+ * That said, we simply duplicate this array and schedule
+ * the old copy for kfree after a grace period.
+ */
+
+ memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID],
+ (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *));
+ ng->ptr[id] = data;
+
+ rcu_assign_pointer(net->gen, ng);
+ kfree_rcu(old_ng, s.rcu);
+ return 0;
+}
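+
+/* For example, growing from 13 to 16 slots copies the live pointers starting
+ * at MIN_PERNET_OPS_ID into the new array, publishes it with
+ * rcu_assign_pointer() and frees the old array only after a grace period, so
+ * concurrent net_generic() readers never observe a partially filled array.
+ */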
+
+static int ops_init(const struct pernet_operations *ops, struct net *net)
+{
+ int err = -ENOMEM;
+ void *data = NULL;
+
+ if (ops->id && ops->size) {
+ data = kzalloc(ops->size, GFP_KERNEL);
+ if (!data)
+ goto out;
+
+ err = net_assign_generic(net, *ops->id, data);
+ if (err)
+ goto cleanup;
+ }
+ err = 0;
+ if (ops->init)
+ err = ops->init(net);
+ if (!err)
+ return 0;
+
+cleanup:
+ kfree(data);
+
+out:
+ return err;
+}
+
+static void ops_free(const struct pernet_operations *ops, struct net *net)
+{
+ if (ops->id && ops->size) {
+ kfree(net_generic(net, *ops->id));
+ }
+}
+
+static void ops_exit_list(const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ struct net *net;
+ if (ops->exit) {
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ ops->exit(net);
+ cond_resched();
+ }
+ }
+ if (ops->exit_batch)
+ ops->exit_batch(net_exit_list);
+}
+
+static void ops_free_list(const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ struct net *net;
+ if (ops->size && ops->id) {
+ list_for_each_entry(net, net_exit_list, exit_list)
+ ops_free(ops, net);
+ }
+}
+
+/* should be called with nsid_lock held */
+static int alloc_netid(struct net *net, struct net *peer, int reqid)
+{
+ int min = 0, max = 0;
+
+ if (reqid >= 0) {
+ min = reqid;
+ max = reqid + 1;
+ }
+
+ return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC);
+}
+
+/* This function is used by idr_for_each(). If net is equal to peer, the
+ * function returns the id so that idr_for_each() stops. Because we cannot
+ * return the id 0 (idr_for_each() will not stop), we return the magic value
+ * NET_ID_ZERO (-1) for it.
+ */
+#define NET_ID_ZERO -1
+static int net_eq_idr(int id, void *net, void *peer)
+{
+ if (net_eq(net, peer))
+ return id ? : NET_ID_ZERO;
+ return 0;
+}
+
+/* Must be called from RCU-critical section or with nsid_lock held. If
+ * a new id is assigned, the bool alloc is set to true, so the
+ * caller knows that the new id must be notified via rtnl.
+ */
+static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc)
+{
+ int id = idr_for_each(&net->netns_ids, net_eq_idr, peer);
+ bool alloc_it = *alloc;
+
+ *alloc = false;
+
+ /* Magic value for id 0. */
+ if (id == NET_ID_ZERO)
+ return 0;
+ if (id > 0)
+ return id;
+
+ if (alloc_it) {
+ id = alloc_netid(net, peer, -1);
+ *alloc = true;
+ return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED;
+ }
+
+ return NETNSA_NSID_NOT_ASSIGNED;
+}
+
+/* Must be called from RCU-critical section or with nsid_lock held */
+static int __peernet2id(struct net *net, struct net *peer)
+{
+ bool no = false;
+
+ return __peernet2id_alloc(net, peer, &no);
+}
+
+static void rtnl_net_notifyid(struct net *net, int cmd, int id, gfp_t gfp);
+/* This function returns the id of a peer netns. If no id is assigned, one will
+ * be allocated and returned.
+ */
+int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp)
+{
+ bool alloc = false, alive = false;
+ int id;
+
+ if (refcount_read(&net->count) == 0)
+ return NETNSA_NSID_NOT_ASSIGNED;
+ spin_lock_bh(&net->nsid_lock);
+ /*
+ * When peer is obtained from RCU lists, we may race with
+ * its cleanup. Check whether it's alive, and this guarantees
+ * we never hash a peer back to net->netns_ids, after it has
+ * just been idr_remove()'d from there in cleanup_net().
+ */
+ if (maybe_get_net(peer))
+ alive = alloc = true;
+ id = __peernet2id_alloc(net, peer, &alloc);
+ spin_unlock_bh(&net->nsid_lock);
+ if (alloc && id >= 0)
+ rtnl_net_notifyid(net, RTM_NEWNSID, id, gfp);
+ if (alive)
+ put_net(peer);
+ return id;
+}
+EXPORT_SYMBOL_GPL(peernet2id_alloc);
+
+/* This function returns, if assigned, the id of a peer netns. */
+int peernet2id(struct net *net, struct net *peer)
+{
+ int id;
+
+ rcu_read_lock();
+ id = __peernet2id(net, peer);
+ rcu_read_unlock();
+
+ return id;
+}
+EXPORT_SYMBOL(peernet2id);
+
+/* This function returns true if the peer netns has an id assigned in the
+ * current netns.
+ */
+bool peernet_has_id(struct net *net, struct net *peer)
+{
+ return peernet2id(net, peer) >= 0;
+}
+
+struct net *get_net_ns_by_id(struct net *net, int id)
+{
+ struct net *peer;
+
+ if (id < 0)
+ return NULL;
+
+ rcu_read_lock();
+ peer = idr_find(&net->netns_ids, id);
+ if (peer)
+ peer = maybe_get_net(peer);
+ rcu_read_unlock();
+
+ return peer;
+}
+
+/*
+ * setup_net runs the initializers for the network namespace object.
+ */
+static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
+{
+ /* Must be called with pernet_ops_rwsem held */
+ const struct pernet_operations *ops, *saved_ops;
+ int error = 0;
+ LIST_HEAD(net_exit_list);
+
+ refcount_set(&net->count, 1);
+ refcount_set(&net->passive, 1);
+ get_random_bytes(&net->hash_mix, sizeof(u32));
+ net->dev_base_seq = 1;
+ net->user_ns = user_ns;
+ idr_init(&net->netns_ids);
+ spin_lock_init(&net->nsid_lock);
+ mutex_init(&net->ipv4.ra_mutex);
+
+ list_for_each_entry(ops, &pernet_list, list) {
+ error = ops_init(ops, net);
+ if (error < 0)
+ goto out_undo;
+ }
+ down_write(&net_rwsem);
+ list_add_tail_rcu(&net->list, &net_namespace_list);
+ up_write(&net_rwsem);
+out:
+ return error;
+
+out_undo:
+ /* Walk through the list backwards calling the exit functions
+ * for the pernet modules whose init functions did not fail.
+ */
+ list_add(&net->exit_list, &net_exit_list);
+ saved_ops = ops;
+ list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+ ops_exit_list(ops, &net_exit_list);
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+ ops_free_list(ops, &net_exit_list);
+
+ rcu_barrier();
+ goto out;
+}
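+
+/* On failure, setup_net() unwinds in reverse registration order: ops_exit_list()
+ * runs for every pernet ops whose init succeeded, then ops_free_list() releases
+ * their per-net data, and rcu_barrier() flushes pending RCU callbacks before
+ * the half-initialized namespace is handed back to the caller for freeing.
+ */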
+
+static int __net_init net_defaults_init_net(struct net *net)
+{
+ net->core.sysctl_somaxconn = SOMAXCONN;
+ return 0;
+}
+
+static struct pernet_operations net_defaults_ops = {
+ .init = net_defaults_init_net,
+};
+
+static __init int net_defaults_init(void)
+{
+ if (register_pernet_subsys(&net_defaults_ops))
+ panic("Cannot initialize net default settings");
+
+ return 0;
+}
+
+core_initcall(net_defaults_init);
+
+#ifdef CONFIG_NET_NS
+static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES);
+}
+
+static void dec_net_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
+}
+
+static struct kmem_cache *net_cachep __ro_after_init;
+static struct workqueue_struct *netns_wq;
+
+static struct net *net_alloc(void)
+{
+ struct net *net = NULL;
+ struct net_generic *ng;
+
+ ng = net_alloc_generic();
+ if (!ng)
+ goto out;
+
+ net = kmem_cache_zalloc(net_cachep, GFP_KERNEL);
+ if (!net)
+ goto out_free;
+
+ rcu_assign_pointer(net->gen, ng);
+out:
+ return net;
+
+out_free:
+ kfree(ng);
+ goto out;
+}
+
+static void net_free(struct net *net)
+{
+ kfree(rcu_access_pointer(net->gen));
+ kmem_cache_free(net_cachep, net);
+}
+
+void net_drop_ns(void *p)
+{
+ struct net *ns = p;
+ if (ns && refcount_dec_and_test(&ns->passive))
+ net_free(ns);
+}
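+
+/* net->count tracks active users of the namespace (get_net()/put_net()), while
+ * net->passive keeps the structure itself allocated for lockless lookups such
+ * as maybe_get_net(); only when the last passive reference is dropped here is
+ * the memory actually freed.
+ */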
+
+struct net *copy_net_ns(unsigned long flags,
+ struct user_namespace *user_ns, struct net *old_net)
+{
+ struct ucounts *ucounts;
+ struct net *net;
+ int rv;
+
+ if (!(flags & CLONE_NEWNET))
+ return get_net(old_net);
+
+ ucounts = inc_net_namespaces(user_ns);
+ if (!ucounts)
+ return ERR_PTR(-ENOSPC);
+
+ net = net_alloc();
+ if (!net) {
+ rv = -ENOMEM;
+ goto dec_ucounts;
+ }
+ refcount_set(&net->passive, 1);
+ net->ucounts = ucounts;
+ get_user_ns(user_ns);
+
+ rv = down_read_killable(&pernet_ops_rwsem);
+ if (rv < 0)
+ goto put_userns;
+
+ rv = setup_net(net, user_ns);
+
+ up_read(&pernet_ops_rwsem);
+
+ if (rv < 0) {
+put_userns:
+ put_user_ns(user_ns);
+ net_drop_ns(net);
+dec_ucounts:
+ dec_net_namespaces(ucounts);
+ return ERR_PTR(rv);
+ }
+ return net;
+}
+
+/**
+ * net_ns_get_ownership - get sysfs ownership data for @net
+ * @net: network namespace in question (can be NULL)
+ * @uid: kernel user ID for sysfs objects
+ * @gid: kernel group ID for sysfs objects
+ *
+ * Returns the uid/gid pair of root in the user namespace associated with the
+ * given network namespace.
+ */
+void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid)
+{
+ if (net) {
+ kuid_t ns_root_uid = make_kuid(net->user_ns, 0);
+ kgid_t ns_root_gid = make_kgid(net->user_ns, 0);
+
+ if (uid_valid(ns_root_uid))
+ *uid = ns_root_uid;
+
+ if (gid_valid(ns_root_gid))
+ *gid = ns_root_gid;
+ } else {
+ *uid = GLOBAL_ROOT_UID;
+ *gid = GLOBAL_ROOT_GID;
+ }
+}
+EXPORT_SYMBOL_GPL(net_ns_get_ownership);
+
+static void unhash_nsid(struct net *net, struct net *last)
+{
+ struct net *tmp;
+ /* This function is only called from cleanup_net() work,
+	 * and this work is the only process that may delete
+	 * a net from net_namespace_list. So, while the code below
+	 * is executing, the list may only grow. Thus, we do not
+ * use for_each_net_rcu() or net_rwsem.
+ */
+ for_each_net(tmp) {
+ int id;
+
+ spin_lock_bh(&tmp->nsid_lock);
+ id = __peernet2id(tmp, net);
+ if (id >= 0)
+ idr_remove(&tmp->netns_ids, id);
+ spin_unlock_bh(&tmp->nsid_lock);
+ if (id >= 0)
+ rtnl_net_notifyid(tmp, RTM_DELNSID, id,
+ GFP_KERNEL);
+ if (tmp == last)
+ break;
+ }
+ spin_lock_bh(&net->nsid_lock);
+ idr_destroy(&net->netns_ids);
+ spin_unlock_bh(&net->nsid_lock);
+}
+
+static LLIST_HEAD(cleanup_list);
+
+static void cleanup_net(struct work_struct *work)
+{
+ const struct pernet_operations *ops;
+ struct net *net, *tmp, *last;
+ struct llist_node *net_kill_list;
+ LIST_HEAD(net_exit_list);
+
+ /* Atomically snapshot the list of namespaces to cleanup */
+ net_kill_list = llist_del_all(&cleanup_list);
+
+ down_read(&pernet_ops_rwsem);
+
+ /* Don't let anyone else find us. */
+ down_write(&net_rwsem);
+ llist_for_each_entry(net, net_kill_list, cleanup_list)
+ list_del_rcu(&net->list);
+	/* Cache last net. After we unlock rtnl, no new net
+	 * added to net_namespace_list can assign an nsid pointer
+ * to a net from net_kill_list (see peernet2id_alloc()).
+ * So, we skip them in unhash_nsid().
+ *
+	 * Note that unhash_nsid() does not delete nsid links
+	 * between net_kill_list's nets, as they've already been
+	 * deleted from net_namespace_list. But this would be
+ * useless anyway, as netns_ids are destroyed there.
+ */
+ last = list_last_entry(&net_namespace_list, struct net, list);
+ up_write(&net_rwsem);
+
+ llist_for_each_entry(net, net_kill_list, cleanup_list) {
+ unhash_nsid(net, last);
+ list_add_tail(&net->exit_list, &net_exit_list);
+ }
+
+ /*
+ * Another CPU might be rcu-iterating the list, wait for it.
+ * This needs to be before calling the exit() notifiers, so
+ * the rcu_barrier() below isn't sufficient alone.
+ */
+ synchronize_rcu();
+
+ /* Run all of the network namespace exit methods */
+ list_for_each_entry_reverse(ops, &pernet_list, list)
+ ops_exit_list(ops, &net_exit_list);
+
+ /* Free the net generic variables */
+ list_for_each_entry_reverse(ops, &pernet_list, list)
+ ops_free_list(ops, &net_exit_list);
+
+ up_read(&pernet_ops_rwsem);
+
+ /* Ensure there are no outstanding rcu callbacks using this
+ * network namespace.
+ */
+ rcu_barrier();
+
+ /* Finally it is safe to free my network namespace structure */
+ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
+ list_del_init(&net->exit_list);
+ dec_net_namespaces(net->ucounts);
+ put_user_ns(net->user_ns);
+ net_drop_ns(net);
+ }
+}
+
+/**
+ * net_ns_barrier - wait until concurrent net_cleanup_work is done
+ *
+ * cleanup_net runs from work queue and will first remove namespaces
+ * from the global list, then run net exit functions.
+ *
+ * Call this in module exit path to make sure that all netns
+ * ->exit ops have been invoked before the module is removed.
+ */
+void net_ns_barrier(void)
+{
+ down_write(&pernet_ops_rwsem);
+ up_write(&pernet_ops_rwsem);
+}
+EXPORT_SYMBOL(net_ns_barrier);
+
+static DECLARE_WORK(net_cleanup_work, cleanup_net);
+
+void __put_net(struct net *net)
+{
+ /* Cleanup the network namespace in process context */
+ if (llist_add(&net->cleanup_list, &cleanup_list))
+ queue_work(netns_wq, &net_cleanup_work);
+}
+EXPORT_SYMBOL_GPL(__put_net);
+
+/**
+ * get_net_ns - increment the refcount of the network namespace
+ * @ns: common namespace (net)
+ *
+ * Returns the net's common namespace.
+ */
+struct ns_common *get_net_ns(struct ns_common *ns)
+{
+ return &get_net(container_of(ns, struct net, ns))->ns;
+}
+EXPORT_SYMBOL_GPL(get_net_ns);
+
+struct net *get_net_ns_by_fd(int fd)
+{
+ struct file *file;
+ struct ns_common *ns;
+ struct net *net;
+
+ file = proc_ns_fget(fd);
+ if (IS_ERR(file))
+ return ERR_CAST(file);
+
+ ns = get_proc_ns(file_inode(file));
+ if (ns->ops == &netns_operations)
+ net = get_net(container_of(ns, struct net, ns));
+ else
+ net = ERR_PTR(-EINVAL);
+
+ fput(file);
+ return net;
+}
+
+#else
+struct net *get_net_ns_by_fd(int fd)
+{
+ return ERR_PTR(-EINVAL);
+}
+#endif
+EXPORT_SYMBOL_GPL(get_net_ns_by_fd);
+
+struct net *get_net_ns_by_pid(pid_t pid)
+{
+ struct task_struct *tsk;
+ struct net *net;
+
+ /* Lookup the network namespace */
+ net = ERR_PTR(-ESRCH);
+ rcu_read_lock();
+ tsk = find_task_by_vpid(pid);
+ if (tsk) {
+ struct nsproxy *nsproxy;
+ task_lock(tsk);
+ nsproxy = tsk->nsproxy;
+ if (nsproxy)
+ net = get_net(nsproxy->net_ns);
+ task_unlock(tsk);
+ }
+ rcu_read_unlock();
+ return net;
+}
+EXPORT_SYMBOL_GPL(get_net_ns_by_pid);
+
+static __net_init int net_ns_net_init(struct net *net)
+{
+#ifdef CONFIG_NET_NS
+ net->ns.ops = &netns_operations;
+#endif
+ return ns_alloc_inum(&net->ns);
+}
+
+static __net_exit void net_ns_net_exit(struct net *net)
+{
+ ns_free_inum(&net->ns);
+}
+
+static struct pernet_operations __net_initdata net_ns_ops = {
+ .init = net_ns_net_init,
+ .exit = net_ns_net_exit,
+};
+
+static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
+ [NETNSA_NONE] = { .type = NLA_UNSPEC },
+ [NETNSA_NSID] = { .type = NLA_S32 },
+ [NETNSA_PID] = { .type = NLA_U32 },
+ [NETNSA_FD] = { .type = NLA_U32 },
+};
+
+static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tb[NETNSA_MAX + 1];
+ struct nlattr *nla;
+ struct net *peer;
+ int nsid, err;
+
+ err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
+ rtnl_net_policy, extack);
+ if (err < 0)
+ return err;
+ if (!tb[NETNSA_NSID]) {
+ NL_SET_ERR_MSG(extack, "nsid is missing");
+ return -EINVAL;
+ }
+ nsid = nla_get_s32(tb[NETNSA_NSID]);
+
+ if (tb[NETNSA_PID]) {
+ peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
+ nla = tb[NETNSA_PID];
+ } else if (tb[NETNSA_FD]) {
+ peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
+ nla = tb[NETNSA_FD];
+ } else {
+ NL_SET_ERR_MSG(extack, "Peer netns reference is missing");
+ return -EINVAL;
+ }
+ if (IS_ERR(peer)) {
+ NL_SET_BAD_ATTR(extack, nla);
+ NL_SET_ERR_MSG(extack, "Peer netns reference is invalid");
+ return PTR_ERR(peer);
+ }
+
+ spin_lock_bh(&net->nsid_lock);
+ if (__peernet2id(net, peer) >= 0) {
+ spin_unlock_bh(&net->nsid_lock);
+ err = -EEXIST;
+ NL_SET_BAD_ATTR(extack, nla);
+ NL_SET_ERR_MSG(extack,
+ "Peer netns already has a nsid assigned");
+ goto out;
+ }
+
+ err = alloc_netid(net, peer, nsid);
+ spin_unlock_bh(&net->nsid_lock);
+ if (err >= 0) {
+ rtnl_net_notifyid(net, RTM_NEWNSID, err, GFP_KERNEL);
+ err = 0;
+ } else if (err == -ENOSPC && nsid >= 0) {
+ err = -EEXIST;
+ NL_SET_BAD_ATTR(extack, tb[NETNSA_NSID]);
+ NL_SET_ERR_MSG(extack, "The specified nsid is already used");
+ }
+out:
+ put_net(peer);
+ return err;
+}
+
+static int rtnl_net_get_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct rtgenmsg))
+ + nla_total_size(sizeof(s32)) /* NETNSA_NSID */
+ ;
+}
+
+static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags,
+ int cmd, struct net *net, int nsid)
+{
+ struct nlmsghdr *nlh;
+ struct rtgenmsg *rth;
+
+ nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ rth = nlmsg_data(nlh);
+ rth->rtgen_family = AF_UNSPEC;
+
+ if (nla_put_s32(skb, NETNSA_NSID, nsid))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tb[NETNSA_MAX + 1];
+ struct nlattr *nla;
+ struct sk_buff *msg;
+ struct net *peer;
+ int err, id;
+
+ err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
+ rtnl_net_policy, extack);
+ if (err < 0)
+ return err;
+ if (tb[NETNSA_PID]) {
+ peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
+ nla = tb[NETNSA_PID];
+ } else if (tb[NETNSA_FD]) {
+ peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
+ nla = tb[NETNSA_FD];
+ } else {
+ NL_SET_ERR_MSG(extack, "Peer netns reference is missing");
+ return -EINVAL;
+ }
+
+ if (IS_ERR(peer)) {
+ NL_SET_BAD_ATTR(extack, nla);
+ NL_SET_ERR_MSG(extack, "Peer netns reference is invalid");
+ return PTR_ERR(peer);
+ }
+
+ msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
+ if (!msg) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ id = peernet2id(net, peer);
+ err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ RTM_NEWNSID, net, id);
+ if (err < 0)
+ goto err_out;
+
+ err = rtnl_unicast(msg, net, NETLINK_CB(skb).portid);
+ goto out;
+
+err_out:
+ nlmsg_free(msg);
+out:
+ put_net(peer);
+ return err;
+}
+
+struct rtnl_net_dump_cb {
+ struct net *net;
+ struct sk_buff *skb;
+ struct netlink_callback *cb;
+ int idx;
+ int s_idx;
+};
+
+/* Runs in RCU-critical section. */
+static int rtnl_net_dumpid_one(int id, void *peer, void *data)
+{
+ struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data;
+ int ret;
+
+ if (net_cb->idx < net_cb->s_idx)
+ goto cont;
+
+ ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid,
+ net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWNSID, net_cb->net, id);
+ if (ret < 0)
+ return ret;
+
+cont:
+ net_cb->idx++;
+ return 0;
+}
+
+static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct rtnl_net_dump_cb net_cb = {
+ .net = net,
+ .skb = skb,
+ .cb = cb,
+ .idx = 0,
+ .s_idx = cb->args[0],
+ };
+
+ rcu_read_lock();
+ idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb);
+ rcu_read_unlock();
+
+ cb->args[0] = net_cb.idx;
+ return skb->len;
+}
+
+static void rtnl_net_notifyid(struct net *net, int cmd, int id, gfp_t gfp)
+{
+ struct sk_buff *msg;
+ int err = -ENOMEM;
+
+ msg = nlmsg_new(rtnl_net_get_size(), gfp);
+ if (!msg)
+ goto out;
+
+ err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id);
+ if (err < 0)
+ goto err_out;
+
+ rtnl_notify(msg, net, 0, RTNLGRP_NSID, NULL, gfp);
+ return;
+
+err_out:
+ nlmsg_free(msg);
+out:
+ rtnl_set_sk_err(net, RTNLGRP_NSID, err);
+}
+
+static int __init net_ns_init(void)
+{
+ struct net_generic *ng;
+
+#ifdef CONFIG_NET_NS
+ net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
+ SMP_CACHE_BYTES,
+ SLAB_PANIC|SLAB_ACCOUNT, NULL);
+
+ /* Create workqueue for cleanup */
+ netns_wq = create_singlethread_workqueue("netns");
+ if (!netns_wq)
+ panic("Could not create netns workq");
+#endif
+
+ ng = net_alloc_generic();
+ if (!ng)
+ panic("Could not allocate generic netns");
+
+ rcu_assign_pointer(init_net.gen, ng);
+
+ down_write(&pernet_ops_rwsem);
+ if (setup_net(&init_net, &init_user_ns))
+ panic("Could not setup the initial network namespace");
+
+ init_net_initialized = true;
+ up_write(&pernet_ops_rwsem);
+
+ if (register_pernet_subsys(&net_ns_ops))
+ panic("Could not register network namespace subsystems");
+
+ rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL,
+ RTNL_FLAG_DOIT_UNLOCKED);
+ rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
+ RTNL_FLAG_DOIT_UNLOCKED);
+
+ return 0;
+}
+
+pure_initcall(net_ns_init);
+
+#ifdef CONFIG_NET_NS
+static int __register_pernet_operations(struct list_head *list,
+ struct pernet_operations *ops)
+{
+ struct net *net;
+ int error;
+ LIST_HEAD(net_exit_list);
+
+ list_add_tail(&ops->list, list);
+ if (ops->init || (ops->id && ops->size)) {
+ /* We hold pernet_ops_rwsem for writing, so parallel
+ * setup_net() and cleanup_net() are not possible.
+ */
+ for_each_net(net) {
+ error = ops_init(ops, net);
+ if (error)
+ goto out_undo;
+ list_add_tail(&net->exit_list, &net_exit_list);
+ }
+ }
+ return 0;
+
+out_undo:
+ /* If we hit an error, clean up all namespaces we initialized */
+ list_del(&ops->list);
+ ops_exit_list(ops, &net_exit_list);
+ ops_free_list(ops, &net_exit_list);
+ return error;
+}
+
+static void __unregister_pernet_operations(struct pernet_operations *ops)
+{
+ struct net *net;
+ LIST_HEAD(net_exit_list);
+
+ list_del(&ops->list);
+ /* See comment in __register_pernet_operations() */
+ for_each_net(net)
+ list_add_tail(&net->exit_list, &net_exit_list);
+ ops_exit_list(ops, &net_exit_list);
+ ops_free_list(ops, &net_exit_list);
+}
+
+#else
+
+static int __register_pernet_operations(struct list_head *list,
+ struct pernet_operations *ops)
+{
+ if (!init_net_initialized) {
+ list_add_tail(&ops->list, list);
+ return 0;
+ }
+
+ return ops_init(ops, &init_net);
+}
+
+static void __unregister_pernet_operations(struct pernet_operations *ops)
+{
+ if (!init_net_initialized) {
+ list_del(&ops->list);
+ } else {
+ LIST_HEAD(net_exit_list);
+ list_add(&init_net.exit_list, &net_exit_list);
+ ops_exit_list(ops, &net_exit_list);
+ ops_free_list(ops, &net_exit_list);
+ }
+}
+
+#endif /* CONFIG_NET_NS */
+
+static DEFINE_IDA(net_generic_ids);
+
+static int register_pernet_operations(struct list_head *list,
+ struct pernet_operations *ops)
+{
+ int error;
+
+ if (ops->id) {
+ error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID,
+ GFP_KERNEL);
+ if (error < 0)
+ return error;
+ *ops->id = error;
+ max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1);
+ }
+ error = __register_pernet_operations(list, ops);
+ if (error) {
+ rcu_barrier();
+ if (ops->id)
+ ida_free(&net_generic_ids, *ops->id);
+ }
+
+ return error;
+}
+
+static void unregister_pernet_operations(struct pernet_operations *ops)
+{
+ __unregister_pernet_operations(ops);
+ rcu_barrier();
+ if (ops->id)
+ ida_free(&net_generic_ids, *ops->id);
+}
+
+/**
+ * register_pernet_subsys - register a network namespace subsystem
+ * @ops: pernet operations structure for the subsystem
+ *
+ * Register a subsystem which has init and exit functions
+ * that are called when network namespaces are created and
+ * destroyed respectively.
+ *
+ * When registered, all network namespace init functions are
+ * called for every existing network namespace, allowing kernel
+ * modules to have a race-free view of the set of network namespaces.
+ *
+ * When a new network namespace is created all of the init
+ * methods are called in the order in which they were registered.
+ *
+ * When a network namespace is destroyed all of the exit methods
+ * are called in the reverse of the order in which they were
+ * registered.
+ */
+int register_pernet_subsys(struct pernet_operations *ops)
+{
+ int error;
+ down_write(&pernet_ops_rwsem);
+ error = register_pernet_operations(first_device, ops);
+ up_write(&pernet_ops_rwsem);
+ return error;
+}
+EXPORT_SYMBOL_GPL(register_pernet_subsys);
+
+/**
+ * unregister_pernet_subsys - unregister a network namespace subsystem
+ * @ops: pernet operations structure to manipulate
+ *
+ * Remove the pernet operations structure from the list to be
+ * used when network namespaces are created or destroyed. In
+ * addition run the exit method for all existing network
+ * namespaces.
+ */
+void unregister_pernet_subsys(struct pernet_operations *ops)
+{
+ down_write(&pernet_ops_rwsem);
+ unregister_pernet_operations(ops);
+ up_write(&pernet_ops_rwsem);
+}
+EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
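For illustration only (not from this patch): a subsystem typically pairs register_pernet_subsys()/unregister_pernet_subsys() with per-namespace init/exit hooks roughly as sketched below; all example_* names are hypothetical.

static int __net_init example_net_init(struct net *net)
{
	/* allocate and initialize per-namespace state for @net */
	return 0;
}

static void __net_exit example_net_exit(struct net *net)
{
	/* release per-namespace state for @net */
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
};

static int __init example_module_init(void)
{
	return register_pernet_subsys(&example_net_ops);
}

static void __exit example_module_exit(void)
{
	unregister_pernet_subsys(&example_net_ops);
}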
+
+/**
+ * register_pernet_device - register a network namespace device
+ * @ops: pernet operations structure for the subsystem
+ *
+ * Register a device which has init and exit functions
+ * that are called when network namespaces are created and
+ * destroyed respectively.
+ *
+ * When registered, all network namespace init functions are
+ * called for every existing network namespace, allowing kernel
+ * modules to have a race-free view of the set of network namespaces.
+ *
+ * When a new network namespace is created all of the init
+ * methods are called in the order in which they were registered.
+ *
+ * When a network namespace is destroyed all of the exit methods
+ * are called in the reverse of the order in which they were
+ * registered.
+ */
+int register_pernet_device(struct pernet_operations *ops)
+{
+ int error;
+ down_write(&pernet_ops_rwsem);
+ error = register_pernet_operations(&pernet_list, ops);
+ if (!error && (first_device == &pernet_list))
+ first_device = &ops->list;
+ up_write(&pernet_ops_rwsem);
+ return error;
+}
+EXPORT_SYMBOL_GPL(register_pernet_device);
+
+/**
+ * unregister_pernet_device - unregister a network namespace device
+ * @ops: pernet operations structure to manipulate
+ *
+ * Remove the pernet operations structure from the list to be
+ * used when network namespaces are created or destroyed. In
+ * addition run the exit method for all existing network
+ * namespaces.
+ */
+void unregister_pernet_device(struct pernet_operations *ops)
+{
+ down_write(&pernet_ops_rwsem);
+ if (&ops->list == first_device)
+ first_device = first_device->next;
+ unregister_pernet_operations(ops);
+ up_write(&pernet_ops_rwsem);
+}
+EXPORT_SYMBOL_GPL(unregister_pernet_device);
+
+#ifdef CONFIG_NET_NS
+static struct ns_common *netns_get(struct task_struct *task)
+{
+ struct net *net = NULL;
+ struct nsproxy *nsproxy;
+
+ task_lock(task);
+ nsproxy = task->nsproxy;
+ if (nsproxy)
+ net = get_net(nsproxy->net_ns);
+ task_unlock(task);
+
+ return net ? &net->ns : NULL;
+}
+
+static inline struct net *to_net_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct net, ns);
+}
+
+static void netns_put(struct ns_common *ns)
+{
+ put_net(to_net_ns(ns));
+}
+
+static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+{
+ struct net *net = to_net_ns(ns);
+
+ if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return -EPERM;
+
+ put_net(nsproxy->net_ns);
+ nsproxy->net_ns = get_net(net);
+ return 0;
+}
+
+static struct user_namespace *netns_owner(struct ns_common *ns)
+{
+ return to_net_ns(ns)->user_ns;
+}
+
+const struct proc_ns_operations netns_operations = {
+ .name = "net",
+ .type = CLONE_NEWNET,
+ .get = netns_get,
+ .put = netns_put,
+ .install = netns_install,
+ .owner = netns_owner,
+};
+#endif
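For context only (not part of this patch): the netns_operations above service a userspace setns(2) on a /proc/<pid>/ns/net file descriptor, so joining another process's network namespace can be sketched as below; the PID, path and error handling are illustrative.

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Illustrative: join the network namespace of PID 1234. */
	int fd = open("/proc/1234/ns/net", O_RDONLY);

	if (fd < 0 || setns(fd, CLONE_NEWNET) < 0) {
		perror("join netns");	/* netns_install() handles this in-kernel */
		return 1;
	}
	close(fd);
	return 0;
}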
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
new file mode 100644
index 000000000..668330ace
--- /dev/null
+++ b/net/core/netclassid_cgroup.c
@@ -0,0 +1,156 @@
+/*
+ * net/core/netclassid_cgroup.c Classid Cgroupfs Handling
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/slab.h>
+#include <linux/cgroup.h>
+#include <linux/fdtable.h>
+#include <linux/sched/task.h>
+
+#include <net/cls_cgroup.h>
+#include <net/sock.h>
+
+static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct cgroup_cls_state, css) : NULL;
+}
+
+struct cgroup_cls_state *task_cls_state(struct task_struct *p)
+{
+ return css_cls_state(task_css_check(p, net_cls_cgrp_id,
+ rcu_read_lock_bh_held()));
+}
+EXPORT_SYMBOL_GPL(task_cls_state);
+
+static struct cgroup_subsys_state *
+cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cgroup_cls_state *cs;
+
+ cs = kzalloc(sizeof(*cs), GFP_KERNEL);
+ if (!cs)
+ return ERR_PTR(-ENOMEM);
+
+ return &cs->css;
+}
+
+static int cgrp_css_online(struct cgroup_subsys_state *css)
+{
+ struct cgroup_cls_state *cs = css_cls_state(css);
+ struct cgroup_cls_state *parent = css_cls_state(css->parent);
+
+ if (parent)
+ cs->classid = parent->classid;
+
+ return 0;
+}
+
+static void cgrp_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_cls_state(css));
+}
+
+/*
+ * To avoid stalling socket creation for tasks with a large number of threads
+ * and open sockets, release file_lock every 1000 iterated descriptors.
+ * New sockets will already have been created with the new classid.
+ */
+
+struct update_classid_context {
+ u32 classid;
+ unsigned int batch;
+};
+
+#define UPDATE_CLASSID_BATCH 1000
+
+static int update_classid_sock(const void *v, struct file *file, unsigned n)
+{
+ int err;
+ struct update_classid_context *ctx = (void *)v;
+ struct socket *sock = sock_from_file(file, &err);
+
+ if (sock) {
+ spin_lock(&cgroup_sk_update_lock);
+ sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
+ spin_unlock(&cgroup_sk_update_lock);
+ }
+ if (--ctx->batch == 0) {
+ ctx->batch = UPDATE_CLASSID_BATCH;
+ return n + 1;
+ }
+ return 0;
+}
+
+static void update_classid_task(struct task_struct *p, u32 classid)
+{
+ struct update_classid_context ctx = {
+ .classid = classid,
+ .batch = UPDATE_CLASSID_BATCH
+ };
+ unsigned int fd = 0;
+
+ do {
+ task_lock(p);
+ fd = iterate_fd(p->files, fd, update_classid_sock, &ctx);
+ task_unlock(p);
+ cond_resched();
+ } while (fd);
+}
+
+static void cgrp_attach(struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys_state *css;
+ struct task_struct *p;
+
+ cgroup_taskset_for_each(p, css, tset) {
+ update_classid_task(p, css_cls_state(css)->classid);
+ }
+}
+
+static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return css_cls_state(css)->classid;
+}
+
+static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 value)
+{
+ struct cgroup_cls_state *cs = css_cls_state(css);
+ struct css_task_iter it;
+ struct task_struct *p;
+
+ cgroup_sk_alloc_disable();
+
+ cs->classid = (u32)value;
+
+ css_task_iter_start(css, 0, &it);
+ while ((p = css_task_iter_next(&it)))
+ update_classid_task(p, cs->classid);
+ css_task_iter_end(&it);
+
+ return 0;
+}
+
+static struct cftype ss_files[] = {
+ {
+ .name = "classid",
+ .read_u64 = read_classid,
+ .write_u64 = write_classid,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys net_cls_cgrp_subsys = {
+ .css_alloc = cgrp_css_alloc,
+ .css_online = cgrp_css_online,
+ .css_free = cgrp_css_free,
+ .attach = cgrp_attach,
+ .legacy_cftypes = ss_files,
+};
diff --git a/net/core/netevent.c b/net/core/netevent.c
new file mode 100644
index 000000000..8b3bc4fac
--- /dev/null
+++ b/net/core/netevent.c
@@ -0,0 +1,67 @@
+/*
+ * Network event notifiers
+ *
+ * Authors:
+ * Tom Tucker <tom@opengridcomputing.com>
+ * Steve Wise <swise@opengridcomputing.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ */
+
+#include <linux/rtnetlink.h>
+#include <linux/notifier.h>
+#include <linux/export.h>
+#include <net/netevent.h>
+
+static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain);
+
+/**
+ * register_netevent_notifier - register a netevent notifier block
+ * @nb: notifier
+ *
+ * Register a notifier to be called when a netevent occurs.
+ * The notifier passed is linked into the kernel structures and must
+ * not be reused until it has been unregistered. A negative errno code
+ * is returned on a failure.
+ */
+int register_netevent_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&netevent_notif_chain, nb);
+}
+EXPORT_SYMBOL_GPL(register_netevent_notifier);
+
+/**
+ * unregister_netevent_notifier - unregister a netevent notifier block
+ * @nb: notifier
+ *
+ * Unregister a notifier previously registered by
+ * register_netevent_notifier(). The notifier is unlinked from the
+ * kernel structures and may then be reused. A negative errno code
+ * is returned on a failure.
+ */
+
+int unregister_netevent_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&netevent_notif_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_netevent_notifier);
+
+/**
+ * call_netevent_notifiers - call all netevent notifier blocks
+ * @val: value passed unmodified to notifier function
+ * @v: pointer passed unmodified to notifier function
+ *
+ * Call all netevent notifier blocks. Parameters and return value
+ * are as for notifier_call_chain().
+ */
+
+int call_netevent_notifiers(unsigned long val, void *v)
+{
+ return atomic_notifier_call_chain(&netevent_notif_chain, val, v);
+}
+EXPORT_SYMBOL_GPL(call_netevent_notifiers);
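Purely as an illustration (not part of this patch), a consumer would hook this chain roughly as follows; the callback name is hypothetical and NETEVENT_NEIGH_UPDATE comes from <net/netevent.h>.

static int example_netevent_cb(struct notifier_block *nb,
			       unsigned long event, void *ptr)
{
	if (event == NETEVENT_NEIGH_UPDATE) {
		/* @ptr is the affected neighbour entry */
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netevent_nb = {
	.notifier_call = example_netevent_cb,
};

/* Call register_netevent_notifier(&example_netevent_nb) at init time and
 * unregister_netevent_notifier(&example_netevent_nb) on teardown.
 */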
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
new file mode 100644
index 000000000..41e32a958
--- /dev/null
+++ b/net/core/netpoll.c
@@ -0,0 +1,856 @@
+/*
+ * Common framework for low-level network console, dump, and debugger code
+ *
+ * Sep 8 2003 Matt Mackall <mpm@selenic.com>
+ *
+ * based on the netconsole code from:
+ *
+ * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2002 Red Hat, Inc.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/string.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <linux/inet.h>
+#include <linux/interrupt.h>
+#include <linux/netpoll.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/if_vlan.h>
+#include <net/dsa.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/addrconf.h>
+#include <net/ndisc.h>
+#include <net/ip6_checksum.h>
+#include <asm/unaligned.h>
+#include <trace/events/napi.h>
+
+/*
+ * We maintain a small pool of fully-sized skbs, to make sure the
+ * message gets out even in extreme OOM situations.
+ */
+
+#define MAX_UDP_CHUNK 1460
+#define MAX_SKBS 32
+
+static struct sk_buff_head skb_pool;
+
+DEFINE_STATIC_SRCU(netpoll_srcu);
+
+#define USEC_PER_POLL 50
+
+#define MAX_SKB_SIZE \
+ (sizeof(struct ethhdr) + \
+ sizeof(struct iphdr) + \
+ sizeof(struct udphdr) + \
+ MAX_UDP_CHUNK)
+
+static void zap_completion_queue(void);
+static void netpoll_async_cleanup(struct work_struct *work);
+
+static unsigned int carrier_timeout = 4;
+module_param(carrier_timeout, uint, 0644);
+
+#define np_info(np, fmt, ...) \
+ pr_info("%s: " fmt, np->name, ##__VA_ARGS__)
+#define np_err(np, fmt, ...) \
+ pr_err("%s: " fmt, np->name, ##__VA_ARGS__)
+#define np_notice(np, fmt, ...) \
+ pr_notice("%s: " fmt, np->name, ##__VA_ARGS__)
+
+static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
+ struct netdev_queue *txq)
+{
+ int status = NETDEV_TX_OK;
+ netdev_features_t features;
+
+ features = netif_skb_features(skb);
+
+ if (skb_vlan_tag_present(skb) &&
+ !vlan_hw_offload_capable(features, skb->vlan_proto)) {
+ skb = __vlan_hwaccel_push_inside(skb);
+ if (unlikely(!skb)) {
+ /* This is actually a packet drop, but we
+ * don't want the code that calls this
+ * function to try and operate on a NULL skb.
+ */
+ goto out;
+ }
+ }
+
+ status = netdev_start_xmit(skb, dev, txq, false);
+
+out:
+ return status;
+}
+
+static void queue_process(struct work_struct *work)
+{
+ struct netpoll_info *npinfo =
+ container_of(work, struct netpoll_info, tx_work.work);
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ while ((skb = skb_dequeue(&npinfo->txq))) {
+ struct net_device *dev = skb->dev;
+ struct netdev_queue *txq;
+ unsigned int q_index;
+
+ if (!netif_device_present(dev) || !netif_running(dev)) {
+ kfree_skb(skb);
+ continue;
+ }
+
+ local_irq_save(flags);
+ /* check if skb->queue_mapping is still valid */
+ q_index = skb_get_queue_mapping(skb);
+ if (unlikely(q_index >= dev->real_num_tx_queues)) {
+ q_index = q_index % dev->real_num_tx_queues;
+ skb_set_queue_mapping(skb, q_index);
+ }
+ txq = netdev_get_tx_queue(dev, q_index);
+ HARD_TX_LOCK(dev, txq, smp_processor_id());
+ if (netif_xmit_frozen_or_stopped(txq) ||
+ !dev_xmit_complete(netpoll_start_xmit(skb, dev, txq))) {
+ skb_queue_head(&npinfo->txq, skb);
+ HARD_TX_UNLOCK(dev, txq);
+ local_irq_restore(flags);
+
+ schedule_delayed_work(&npinfo->tx_work, HZ/10);
+ return;
+ }
+ HARD_TX_UNLOCK(dev, txq);
+ local_irq_restore(flags);
+ }
+}
+
+static void poll_one_napi(struct napi_struct *napi)
+{
+ int work;
+
+ /* If we set this bit but see that it has already been set,
+ * that indicates that napi has been disabled and we need
+ * to abort this operation
+ */
+ if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state))
+ return;
+
+ /* We explicitly pass the polling call a budget of 0 to
+ * indicate that we are clearing the Tx path only.
+ */
+ work = napi->poll(napi, 0);
+ WARN_ONCE(work, "%pF exceeded budget in poll\n", napi->poll);
+ trace_napi_poll(napi, work, 0);
+
+ clear_bit(NAPI_STATE_NPSVC, &napi->state);
+}
+
+static void poll_napi(struct net_device *dev)
+{
+ struct napi_struct *napi;
+ int cpu = smp_processor_id();
+
+ list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) {
+ if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) {
+ poll_one_napi(napi);
+ smp_store_release(&napi->poll_owner, -1);
+ }
+ }
+}
+
+void netpoll_poll_dev(struct net_device *dev)
+{
+ struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo);
+ const struct net_device_ops *ops;
+
+ /* Don't do any rx activity if the dev_lock mutex is held;
+ * the dev_open/close paths use this to block netpoll activity
+ * while changing device state.
+ */
+ if (!ni || down_trylock(&ni->dev_lock))
+ return;
+
+ if (!netif_running(dev)) {
+ up(&ni->dev_lock);
+ return;
+ }
+
+ ops = dev->netdev_ops;
+ if (ops->ndo_poll_controller)
+ ops->ndo_poll_controller(dev);
+
+ poll_napi(dev);
+
+ up(&ni->dev_lock);
+
+ zap_completion_queue();
+}
+EXPORT_SYMBOL(netpoll_poll_dev);
+
+void netpoll_poll_disable(struct net_device *dev)
+{
+ struct netpoll_info *ni;
+ int idx;
+ might_sleep();
+ idx = srcu_read_lock(&netpoll_srcu);
+ ni = srcu_dereference(dev->npinfo, &netpoll_srcu);
+ if (ni)
+ down(&ni->dev_lock);
+ srcu_read_unlock(&netpoll_srcu, idx);
+}
+EXPORT_SYMBOL(netpoll_poll_disable);
+
+void netpoll_poll_enable(struct net_device *dev)
+{
+ struct netpoll_info *ni;
+ rcu_read_lock();
+ ni = rcu_dereference(dev->npinfo);
+ if (ni)
+ up(&ni->dev_lock);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(netpoll_poll_enable);
+
+static void refill_skbs(void)
+{
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ spin_lock_irqsave(&skb_pool.lock, flags);
+ while (skb_pool.qlen < MAX_SKBS) {
+ skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC);
+ if (!skb)
+ break;
+
+ __skb_queue_tail(&skb_pool, skb);
+ }
+ spin_unlock_irqrestore(&skb_pool.lock, flags);
+}
+
+static void zap_completion_queue(void)
+{
+ unsigned long flags;
+ struct softnet_data *sd = &get_cpu_var(softnet_data);
+
+ if (sd->completion_queue) {
+ struct sk_buff *clist;
+
+ local_irq_save(flags);
+ clist = sd->completion_queue;
+ sd->completion_queue = NULL;
+ local_irq_restore(flags);
+
+ while (clist != NULL) {
+ struct sk_buff *skb = clist;
+ clist = clist->next;
+ if (!skb_irq_freeable(skb)) {
+ refcount_set(&skb->users, 1);
+ dev_kfree_skb_any(skb); /* put this one back */
+ } else {
+ __kfree_skb(skb);
+ }
+ }
+ }
+
+ put_cpu_var(softnet_data);
+}
+
+static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
+{
+ int count = 0;
+ struct sk_buff *skb;
+
+ zap_completion_queue();
+ refill_skbs();
+repeat:
+
+ skb = alloc_skb(len, GFP_ATOMIC);
+ if (!skb)
+ skb = skb_dequeue(&skb_pool);
+
+ if (!skb) {
+ if (++count < 10) {
+ netpoll_poll_dev(np->dev);
+ goto repeat;
+ }
+ return NULL;
+ }
+
+ refcount_set(&skb->users, 1);
+ skb_reserve(skb, reserve);
+ return skb;
+}
+
+static int netpoll_owner_active(struct net_device *dev)
+{
+ struct napi_struct *napi;
+
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ if (napi->poll_owner == smp_processor_id())
+ return 1;
+ }
+ return 0;
+}
+
+/* call with IRQ disabled */
+void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
+ struct net_device *dev)
+{
+ int status = NETDEV_TX_BUSY;
+ unsigned long tries;
+ /* It is up to the caller to keep npinfo alive. */
+ struct netpoll_info *npinfo;
+
+ lockdep_assert_irqs_disabled();
+
+ npinfo = rcu_dereference_bh(np->dev->npinfo);
+ if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
+ dev_kfree_skb_irq(skb);
+ return;
+ }
+
+ /* don't get messages out of order, and no recursion */
+ if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) {
+ struct netdev_queue *txq;
+
+ txq = netdev_pick_tx(dev, skb, NULL);
+
+ /* try until next clock tick */
+ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
+ tries > 0; --tries) {
+ if (HARD_TX_TRYLOCK(dev, txq)) {
+ if (!netif_xmit_stopped(txq))
+ status = netpoll_start_xmit(skb, dev, txq);
+
+ HARD_TX_UNLOCK(dev, txq);
+
+ if (dev_xmit_complete(status))
+ break;
+
+ }
+
+ /* tickle the device in case there is some cleanup to do */
+ netpoll_poll_dev(np->dev);
+
+ udelay(USEC_PER_POLL);
+ }
+
+ WARN_ONCE(!irqs_disabled(),
+ "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n",
+ dev->name, dev->netdev_ops->ndo_start_xmit);
+
+ }
+
+ if (!dev_xmit_complete(status)) {
+ skb_queue_tail(&npinfo->txq, skb);
+ schedule_delayed_work(&npinfo->tx_work, 0);
+ }
+}
+EXPORT_SYMBOL(netpoll_send_skb_on_dev);
+
+void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
+{
+ int total_len, ip_len, udp_len;
+ struct sk_buff *skb;
+ struct udphdr *udph;
+ struct iphdr *iph;
+ struct ethhdr *eth;
+ static atomic_t ip_ident;
+ struct ipv6hdr *ip6h;
+
+ WARN_ON_ONCE(!irqs_disabled());
+
+ udp_len = len + sizeof(*udph);
+ if (np->ipv6)
+ ip_len = udp_len + sizeof(*ip6h);
+ else
+ ip_len = udp_len + sizeof(*iph);
+
+ total_len = ip_len + LL_RESERVED_SPACE(np->dev);
+
+ skb = find_skb(np, total_len + np->dev->needed_tailroom,
+ total_len - len);
+ if (!skb)
+ return;
+
+ skb_copy_to_linear_data(skb, msg, len);
+ skb_put(skb, len);
+
+ skb_push(skb, sizeof(*udph));
+ skb_reset_transport_header(skb);
+ udph = udp_hdr(skb);
+ udph->source = htons(np->local_port);
+ udph->dest = htons(np->remote_port);
+ udph->len = htons(udp_len);
+
+ if (np->ipv6) {
+ udph->check = 0;
+ udph->check = csum_ipv6_magic(&np->local_ip.in6,
+ &np->remote_ip.in6,
+ udp_len, IPPROTO_UDP,
+ csum_partial(udph, udp_len, 0));
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
+
+ skb_push(skb, sizeof(*ip6h));
+ skb_reset_network_header(skb);
+ ip6h = ipv6_hdr(skb);
+
+ /* ip6h->version = 6; ip6h->priority = 0; */
+ put_unaligned(0x60, (unsigned char *)ip6h);
+ ip6h->flow_lbl[0] = 0;
+ ip6h->flow_lbl[1] = 0;
+ ip6h->flow_lbl[2] = 0;
+
+ ip6h->payload_len = htons(sizeof(struct udphdr) + len);
+ ip6h->nexthdr = IPPROTO_UDP;
+ ip6h->hop_limit = 32;
+ ip6h->saddr = np->local_ip.in6;
+ ip6h->daddr = np->remote_ip.in6;
+
+ eth = skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb->protocol = eth->h_proto = htons(ETH_P_IPV6);
+ } else {
+ udph->check = 0;
+ udph->check = csum_tcpudp_magic(np->local_ip.ip,
+ np->remote_ip.ip,
+ udp_len, IPPROTO_UDP,
+ csum_partial(udph, udp_len, 0));
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
+
+ skb_push(skb, sizeof(*iph));
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+
+ /* iph->version = 4; iph->ihl = 5; */
+ put_unaligned(0x45, (unsigned char *)iph);
+ iph->tos = 0;
+ put_unaligned(htons(ip_len), &(iph->tot_len));
+ iph->id = htons(atomic_inc_return(&ip_ident));
+ iph->frag_off = 0;
+ iph->ttl = 64;
+ iph->protocol = IPPROTO_UDP;
+ iph->check = 0;
+ put_unaligned(np->local_ip.ip, &(iph->saddr));
+ put_unaligned(np->remote_ip.ip, &(iph->daddr));
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+
+ eth = skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb->protocol = eth->h_proto = htons(ETH_P_IP);
+ }
+
+ ether_addr_copy(eth->h_source, np->dev->dev_addr);
+ ether_addr_copy(eth->h_dest, np->remote_mac);
+
+ skb->dev = np->dev;
+
+ netpoll_send_skb(np, skb);
+}
+EXPORT_SYMBOL(netpoll_send_udp);
+
+void netpoll_print_options(struct netpoll *np)
+{
+ np_info(np, "local port %d\n", np->local_port);
+ if (np->ipv6)
+ np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6);
+ else
+ np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip);
+ np_info(np, "interface '%s'\n", np->dev_name);
+ np_info(np, "remote port %d\n", np->remote_port);
+ if (np->ipv6)
+ np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6);
+ else
+ np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip);
+ np_info(np, "remote ethernet address %pM\n", np->remote_mac);
+}
+EXPORT_SYMBOL(netpoll_print_options);
+
+static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr)
+{
+ const char *end;
+
+ if (!strchr(str, ':') &&
+ in4_pton(str, -1, (void *)addr, -1, &end) > 0) {
+ if (!*end)
+ return 0;
+ }
+ if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) {
+#if IS_ENABLED(CONFIG_IPV6)
+ if (!*end)
+ return 1;
+#else
+ return -1;
+#endif
+ }
+ return -1;
+}
+
+int netpoll_parse_options(struct netpoll *np, char *opt)
+{
+ char *cur = opt, *delim;
+ int ipv6;
+ bool ipversion_set = false;
+
+ if (*cur != '@') {
+ if ((delim = strchr(cur, '@')) == NULL)
+ goto parse_failed;
+ *delim = 0;
+ if (kstrtou16(cur, 10, &np->local_port))
+ goto parse_failed;
+ cur = delim;
+ }
+ cur++;
+
+ if (*cur != '/') {
+ ipversion_set = true;
+ if ((delim = strchr(cur, '/')) == NULL)
+ goto parse_failed;
+ *delim = 0;
+ ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip);
+ if (ipv6 < 0)
+ goto parse_failed;
+ else
+ np->ipv6 = (bool)ipv6;
+ cur = delim;
+ }
+ cur++;
+
+ if (*cur != ',') {
+ /* parse out dev name */
+ if ((delim = strchr(cur, ',')) == NULL)
+ goto parse_failed;
+ *delim = 0;
+ strlcpy(np->dev_name, cur, sizeof(np->dev_name));
+ cur = delim;
+ }
+ cur++;
+
+ if (*cur != '@') {
+ /* dst port */
+ if ((delim = strchr(cur, '@')) == NULL)
+ goto parse_failed;
+ *delim = 0;
+ if (*cur == ' ' || *cur == '\t')
+ np_info(np, "warning: whitespace is not allowed\n");
+ if (kstrtou16(cur, 10, &np->remote_port))
+ goto parse_failed;
+ cur = delim;
+ }
+ cur++;
+
+ /* dst ip */
+ if ((delim = strchr(cur, '/')) == NULL)
+ goto parse_failed;
+ *delim = 0;
+ ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip);
+ if (ipv6 < 0)
+ goto parse_failed;
+ else if (ipversion_set && np->ipv6 != (bool)ipv6)
+ goto parse_failed;
+ else
+ np->ipv6 = (bool)ipv6;
+ cur = delim + 1;
+
+ if (*cur != 0) {
+ /* MAC address */
+ if (!mac_pton(cur, np->remote_mac))
+ goto parse_failed;
+ }
+
+ netpoll_print_options(np);
+
+ return 0;
+
+ parse_failed:
+ np_info(np, "couldn't parse config at '%s'!\n", cur);
+ return -1;
+}
+EXPORT_SYMBOL(netpoll_parse_options);
+
+int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
+{
+ struct netpoll_info *npinfo;
+ const struct net_device_ops *ops;
+ int err;
+
+ np->dev = ndev;
+ strlcpy(np->dev_name, ndev->name, IFNAMSIZ);
+ INIT_WORK(&np->cleanup_work, netpoll_async_cleanup);
+
+ if (ndev->priv_flags & IFF_DISABLE_NETPOLL) {
+ np_err(np, "%s doesn't support polling, aborting\n",
+ np->dev_name);
+ err = -ENOTSUPP;
+ goto out;
+ }
+
+ if (!ndev->npinfo) {
+ npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
+ if (!npinfo) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ sema_init(&npinfo->dev_lock, 1);
+ skb_queue_head_init(&npinfo->txq);
+ INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
+
+ refcount_set(&npinfo->refcnt, 1);
+
+ ops = np->dev->netdev_ops;
+ if (ops->ndo_netpoll_setup) {
+ err = ops->ndo_netpoll_setup(ndev, npinfo);
+ if (err)
+ goto free_npinfo;
+ }
+ } else {
+ npinfo = rtnl_dereference(ndev->npinfo);
+ refcount_inc(&npinfo->refcnt);
+ }
+
+ npinfo->netpoll = np;
+
+ /* last thing to do is link it to the net device structure */
+ rcu_assign_pointer(ndev->npinfo, npinfo);
+
+ return 0;
+
+free_npinfo:
+ kfree(npinfo);
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(__netpoll_setup);
+
+int netpoll_setup(struct netpoll *np)
+{
+ struct net_device *ndev = NULL, *dev = NULL;
+ struct net *net = current->nsproxy->net_ns;
+ struct in_device *in_dev;
+ int err;
+
+ rtnl_lock();
+ if (np->dev_name[0])
+ ndev = __dev_get_by_name(net, np->dev_name);
+
+ if (!ndev) {
+ np_err(np, "%s doesn't exist, aborting\n", np->dev_name);
+ err = -ENODEV;
+ goto unlock;
+ }
+ dev_hold(ndev);
+
+ /* bring DSA management network devices up first */
+ for_each_netdev(net, dev) {
+ if (!netdev_uses_dsa(dev))
+ continue;
+
+ err = dev_change_flags(dev, dev->flags | IFF_UP);
+ if (err < 0) {
+ np_err(np, "%s failed to open %s\n",
+ np->dev_name, dev->name);
+ goto put;
+ }
+ }
+
+ if (netdev_master_upper_dev_get(ndev)) {
+ np_err(np, "%s is a slave device, aborting\n", np->dev_name);
+ err = -EBUSY;
+ goto put;
+ }
+
+ if (!netif_running(ndev)) {
+ unsigned long atmost, atleast;
+
+ np_info(np, "device %s not up yet, forcing it\n", np->dev_name);
+
+ err = dev_open(ndev);
+
+ if (err) {
+ np_err(np, "failed to open %s\n", ndev->name);
+ goto put;
+ }
+
+ rtnl_unlock();
+ atleast = jiffies + HZ/10;
+ atmost = jiffies + carrier_timeout * HZ;
+ while (!netif_carrier_ok(ndev)) {
+ if (time_after(jiffies, atmost)) {
+ np_notice(np, "timeout waiting for carrier\n");
+ break;
+ }
+ msleep(1);
+ }
+
+ /* If carrier appears to come up instantly, we don't
+ * trust it and pause so that we don't pump all our
+ * queued console messages into the bitbucket.
+ */
+
+ if (time_before(jiffies, atleast)) {
+ np_notice(np, "carrier detect appears untrustworthy, waiting 4 seconds\n");
+ msleep(4000);
+ }
+ rtnl_lock();
+ }
+
+ if (!np->local_ip.ip) {
+ if (!np->ipv6) {
+ in_dev = __in_dev_get_rtnl(ndev);
+
+ if (!in_dev || !in_dev->ifa_list) {
+ np_err(np, "no IP address for %s, aborting\n",
+ np->dev_name);
+ err = -EDESTADDRREQ;
+ goto put;
+ }
+
+ np->local_ip.ip = in_dev->ifa_list->ifa_local;
+ np_info(np, "local IP %pI4\n", &np->local_ip.ip);
+ } else {
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet6_dev *idev;
+
+ err = -EDESTADDRREQ;
+ idev = __in6_dev_get(ndev);
+ if (idev) {
+ struct inet6_ifaddr *ifp;
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(ifp, &idev->addr_list, if_list) {
+ if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)
+ continue;
+ np->local_ip.in6 = ifp->addr;
+ err = 0;
+ break;
+ }
+ read_unlock_bh(&idev->lock);
+ }
+ if (err) {
+ np_err(np, "no IPv6 address for %s, aborting\n",
+ np->dev_name);
+ goto put;
+ } else
+ np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6);
+#else
+ np_err(np, "IPv6 is not supported %s, aborting\n",
+ np->dev_name);
+ err = -EINVAL;
+ goto put;
+#endif
+ }
+ }
+
+ /* fill up the skb queue */
+ refill_skbs();
+
+ err = __netpoll_setup(np, ndev);
+ if (err)
+ goto put;
+
+ rtnl_unlock();
+ return 0;
+
+put:
+ dev_put(ndev);
+unlock:
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL(netpoll_setup);
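As a rough sketch of how a client such as netconsole drives this API (not part of this patch): populate a struct netpoll, call netpoll_setup(), then transmit with netpoll_send_udp(). All values below are placeholders; alternatively the fields can be filled from a config string via netpoll_parse_options(), whose format is local_port@local_ip/dev,remote_port@remote_ip/remote_mac.

static struct netpoll example_np = {
	.name		= "example",
	.dev_name	= "eth0",
	.local_port	= 6665,
	.remote_port	= 6666,
	.remote_mac	= { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
};

static int example_netpoll_start(void)
{
	int err;

	example_np.local_ip.ip = htonl(0x0a000001);	/* 10.0.0.1 */
	example_np.remote_ip.ip = htonl(0x0a000002);	/* 10.0.0.2 */

	err = netpoll_setup(&example_np);
	if (err)
		return err;

	netpoll_send_udp(&example_np, "hello from netpoll\n", 19);
	return 0;
}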
+
+static int __init netpoll_init(void)
+{
+ skb_queue_head_init(&skb_pool);
+ return 0;
+}
+core_initcall(netpoll_init);
+
+static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
+{
+ struct netpoll_info *npinfo =
+ container_of(rcu_head, struct netpoll_info, rcu);
+
+ skb_queue_purge(&npinfo->txq);
+
+ /* we can't call cancel_delayed_work_sync here, as we are in softirq */
+ cancel_delayed_work(&npinfo->tx_work);
+
+ /* clean after last, unfinished work */
+ __skb_queue_purge(&npinfo->txq);
+ /* now cancel it again */
+ cancel_delayed_work(&npinfo->tx_work);
+ kfree(npinfo);
+}
+
+void __netpoll_cleanup(struct netpoll *np)
+{
+ struct netpoll_info *npinfo;
+
+ /* rtnl_dereference would be preferable here but
+ * rcu_cleanup_netpoll path can put us in here safely without
+ * holding the rtnl, so plain rcu_dereference it is
+ */
+ npinfo = rtnl_dereference(np->dev->npinfo);
+ if (!npinfo)
+ return;
+
+ synchronize_srcu(&netpoll_srcu);
+
+ if (refcount_dec_and_test(&npinfo->refcnt)) {
+ const struct net_device_ops *ops;
+
+ ops = np->dev->netdev_ops;
+ if (ops->ndo_netpoll_cleanup)
+ ops->ndo_netpoll_cleanup(np->dev);
+
+ RCU_INIT_POINTER(np->dev->npinfo, NULL);
+ call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info);
+ } else
+ RCU_INIT_POINTER(np->dev->npinfo, NULL);
+}
+EXPORT_SYMBOL_GPL(__netpoll_cleanup);
+
+static void netpoll_async_cleanup(struct work_struct *work)
+{
+ struct netpoll *np = container_of(work, struct netpoll, cleanup_work);
+
+ rtnl_lock();
+ __netpoll_cleanup(np);
+ rtnl_unlock();
+ kfree(np);
+}
+
+void __netpoll_free_async(struct netpoll *np)
+{
+ schedule_work(&np->cleanup_work);
+}
+EXPORT_SYMBOL_GPL(__netpoll_free_async);
+
+void netpoll_cleanup(struct netpoll *np)
+{
+ rtnl_lock();
+ if (!np->dev)
+ goto out;
+ __netpoll_cleanup(np);
+ dev_put(np->dev);
+ np->dev = NULL;
+out:
+ rtnl_unlock();
+}
+EXPORT_SYMBOL(netpoll_cleanup);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
new file mode 100644
index 000000000..239786608
--- /dev/null
+++ b/net/core/netprio_cgroup.c
@@ -0,0 +1,308 @@
+/*
+ * net/core/netprio_cgroup.c Priority Control Group
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/cgroup.h>
+#include <linux/rcupdate.h>
+#include <linux/atomic.h>
+#include <linux/sched/task.h>
+
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/sock.h>
+#include <net/netprio_cgroup.h>
+
+#include <linux/fdtable.h>
+
+/*
+ * netprio allocates a per-net_device priomap array which is indexed by
+ * css->id. Limiting css IDs to 16 bits doesn't lose anything.
+ */
+#define NETPRIO_ID_MAX USHRT_MAX
+
+#define PRIOMAP_MIN_SZ 128
+
+/*
+ * Extend @dev->priomap so that it's large enough to accommodate
+ * @target_idx. @dev->priomap.priomap_len > @target_idx after successful
+ * return. Must be called under rtnl lock.
+ */
+static int extend_netdev_table(struct net_device *dev, u32 target_idx)
+{
+ struct netprio_map *old, *new;
+ size_t new_sz, new_len;
+
+ /* is the existing priomap large enough? */
+ old = rtnl_dereference(dev->priomap);
+ if (old && old->priomap_len > target_idx)
+ return 0;
+
+ /*
+ * Determine the new size. Let's keep it power-of-two. We start
+ * from PRIOMAP_MIN_SZ and double it until it's large enough to
+ * accommodate @target_idx.
+ */
+ new_sz = PRIOMAP_MIN_SZ;
+ while (true) {
+ new_len = (new_sz - offsetof(struct netprio_map, priomap)) /
+ sizeof(new->priomap[0]);
+ if (new_len > target_idx)
+ break;
+ new_sz *= 2;
+ /* overflowed? */
+ if (WARN_ON(new_sz < PRIOMAP_MIN_SZ))
+ return -ENOSPC;
+ }
+
+ /* allocate & copy */
+ new = kzalloc(new_sz, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ if (old)
+ memcpy(new->priomap, old->priomap,
+ old->priomap_len * sizeof(old->priomap[0]));
+
+ new->priomap_len = new_len;
+
+ /* install the new priomap */
+ rcu_assign_pointer(dev->priomap, new);
+ if (old)
+ kfree_rcu(old, rcu);
+ return 0;
+}
+
+/**
+ * netprio_prio - return the effective netprio of a cgroup-net_device pair
+ * @css: css part of the target pair
+ * @dev: net_device part of the target pair
+ *
+ * Should be called under RCU read or rtnl lock.
+ */
+static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev)
+{
+ struct netprio_map *map = rcu_dereference_rtnl(dev->priomap);
+ int id = css->cgroup->id;
+
+ if (map && id < map->priomap_len)
+ return map->priomap[id];
+ return 0;
+}
+
+/**
+ * netprio_set_prio - set netprio on a cgroup-net_device pair
+ * @css: css part of the target pair
+ * @dev: net_device part of the target pair
+ * @prio: prio to set
+ *
+ * Set netprio to @prio on @css-@dev pair. Should be called under rtnl
+ * lock and may fail under memory pressure for non-zero @prio.
+ */
+static int netprio_set_prio(struct cgroup_subsys_state *css,
+ struct net_device *dev, u32 prio)
+{
+ struct netprio_map *map;
+ int id = css->cgroup->id;
+ int ret;
+
+ /* avoid extending priomap for zero writes */
+ map = rtnl_dereference(dev->priomap);
+ if (!prio && (!map || map->priomap_len <= id))
+ return 0;
+
+ ret = extend_netdev_table(dev, id);
+ if (ret)
+ return ret;
+
+ map = rtnl_dereference(dev->priomap);
+ map->priomap[id] = prio;
+ return 0;
+}
+
+static struct cgroup_subsys_state *
+cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cgroup_subsys_state *css;
+
+ css = kzalloc(sizeof(*css), GFP_KERNEL);
+ if (!css)
+ return ERR_PTR(-ENOMEM);
+
+ return css;
+}
+
+static int cgrp_css_online(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys_state *parent_css = css->parent;
+ struct net_device *dev;
+ int ret = 0;
+
+ if (css->id > NETPRIO_ID_MAX)
+ return -ENOSPC;
+
+ if (!parent_css)
+ return 0;
+
+ rtnl_lock();
+ /*
+ * Inherit prios from the parent. As all prios are set during
+ * onlining, there is no need to clear them on offline.
+ */
+ for_each_netdev(&init_net, dev) {
+ u32 prio = netprio_prio(parent_css, dev);
+
+ ret = netprio_set_prio(css, dev, prio);
+ if (ret)
+ break;
+ }
+ rtnl_unlock();
+ return ret;
+}
+
+static void cgrp_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css);
+}
+
+static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return css->cgroup->id;
+}
+
+static int read_priomap(struct seq_file *sf, void *v)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, dev)
+ seq_printf(sf, "%s %u\n", dev->name,
+ netprio_prio(seq_css(sf), dev));
+ rcu_read_unlock();
+ return 0;
+}
+
+static ssize_t write_priomap(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ char devname[IFNAMSIZ + 1];
+ struct net_device *dev;
+ u32 prio;
+ int ret;
+
+ if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
+ return -EINVAL;
+
+ dev = dev_get_by_name(&init_net, devname);
+ if (!dev)
+ return -ENODEV;
+
+ cgroup_sk_alloc_disable();
+
+ rtnl_lock();
+
+ ret = netprio_set_prio(of_css(of), dev, prio);
+
+ rtnl_unlock();
+ dev_put(dev);
+ return ret ?: nbytes;
+}
+
+static int update_netprio(const void *v, struct file *file, unsigned n)
+{
+ int err;
+ struct socket *sock = sock_from_file(file, &err);
+ if (sock) {
+ spin_lock(&cgroup_sk_update_lock);
+ sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
+ (unsigned long)v);
+ spin_unlock(&cgroup_sk_update_lock);
+ }
+ return 0;
+}
+
+static void net_prio_attach(struct cgroup_taskset *tset)
+{
+ struct task_struct *p;
+ struct cgroup_subsys_state *css;
+
+ cgroup_sk_alloc_disable();
+
+ cgroup_taskset_for_each(p, css, tset) {
+ void *v = (void *)(unsigned long)css->cgroup->id;
+
+ task_lock(p);
+ iterate_fd(p->files, 0, update_netprio, v);
+ task_unlock(p);
+ }
+}
+
+static struct cftype ss_files[] = {
+ {
+ .name = "prioidx",
+ .read_u64 = read_prioidx,
+ },
+ {
+ .name = "ifpriomap",
+ .seq_show = read_priomap,
+ .write = write_priomap,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys net_prio_cgrp_subsys = {
+ .css_alloc = cgrp_css_alloc,
+ .css_online = cgrp_css_online,
+ .css_free = cgrp_css_free,
+ .attach = net_prio_attach,
+ .legacy_cftypes = ss_files,
+};
+
+static int netprio_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netprio_map *old;
+
+ /*
+ * Note this is called with rtnl_lock held, so we have update-side
+ * protection on our rcu assignments.
+ */
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ old = rtnl_dereference(dev->priomap);
+ RCU_INIT_POINTER(dev->priomap, NULL);
+ if (old)
+ kfree_rcu(old, rcu);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block netprio_device_notifier = {
+ .notifier_call = netprio_device_event
+};
+
+static int __init init_cgroup_netprio(void)
+{
+ register_netdevice_notifier(&netprio_device_notifier);
+ return 0;
+}
+
+subsys_initcall(init_cgroup_netprio);
+MODULE_LICENSE("GPL v2");
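For illustration only (not part of this patch): with the legacy (v1) net_prio hierarchy mounted at the conventional location, a per-device priority for a cgroup named "example" could be configured from userspace as sketched below; the mount point and group name are assumptions.

#include <stdio.h>

int main(void)
{
	/* Assign priority 4 to traffic sent on eth0 by tasks in "example". */
	FILE *f = fopen("/sys/fs/cgroup/net_prio/example/net_prio.ifpriomap", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "eth0 4\n");
	return fclose(f) ? 1 : 0;
}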
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
new file mode 100644
index 000000000..43a932cb6
--- /dev/null
+++ b/net/core/page_pool.c
@@ -0,0 +1,317 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * page_pool.c
+ * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
+ * Copyright (C) 2016 Red Hat, Inc.
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include <net/page_pool.h>
+#include <linux/dma-direction.h>
+#include <linux/dma-mapping.h>
+#include <linux/page-flags.h>
+#include <linux/mm.h> /* for __put_page() */
+
+static int page_pool_init(struct page_pool *pool,
+ const struct page_pool_params *params)
+{
+ unsigned int ring_qsize = 1024; /* Default */
+
+ memcpy(&pool->p, params, sizeof(pool->p));
+
+ /* Validate only known flags were used */
+ if (pool->p.flags & ~(PP_FLAG_ALL))
+ return -EINVAL;
+
+ if (pool->p.pool_size)
+ ring_qsize = pool->p.pool_size;
+
+ /* Sanity limit mem that can be pinned down */
+ if (ring_qsize > 32768)
+ return -E2BIG;
+
+ /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
+ * DMA_BIDIRECTIONAL allows the page to also be used for DMA transmit,
+ * which is the XDP_TX use-case.
+ */
+ if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
+ (pool->p.dma_dir != DMA_BIDIRECTIONAL))
+ return -EINVAL;
+
+ if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
+ return -ENOMEM;
+
+ return 0;
+}
+
+struct page_pool *page_pool_create(const struct page_pool_params *params)
+{
+ struct page_pool *pool;
+ int err = 0;
+
+ pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
+ if (!pool)
+ return ERR_PTR(-ENOMEM);
+
+ err = page_pool_init(pool, params);
+ if (err < 0) {
+ pr_warn("%s() gave up with errno %d\n", __func__, err);
+ kfree(pool);
+ return ERR_PTR(err);
+ }
+ return pool;
+}
+EXPORT_SYMBOL(page_pool_create);
+
+/* fast path */
+static struct page *__page_pool_get_cached(struct page_pool *pool)
+{
+ struct ptr_ring *r = &pool->ring;
+ struct page *page;
+
+ /* Quicker fallback, avoid locks when ring is empty */
+ if (__ptr_ring_empty(r))
+ return NULL;
+
+ /* Test for safe-context, caller should provide this guarantee */
+ if (likely(in_serving_softirq())) {
+ if (likely(pool->alloc.count)) {
+ /* Fast-path */
+ page = pool->alloc.cache[--pool->alloc.count];
+ return page;
+ }
+ /* Slower-path: Alloc array empty, time to refill
+ *
+ * Open-coded bulk ptr_ring consumer.
+ *
+ * Discussion: the ring consumer lock is not really
+ * needed due to the softirq/NAPI protection, but we
+ * will later need the ability to reclaim pages from
+ * the ring. Thus, keep the locks.
+ */
+ spin_lock(&r->consumer_lock);
+ while ((page = __ptr_ring_consume(r))) {
+ if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
+ break;
+ pool->alloc.cache[pool->alloc.count++] = page;
+ }
+ spin_unlock(&r->consumer_lock);
+ return page;
+ }
+
+ /* Slow-path: Get page from locked ring queue */
+ page = ptr_ring_consume(&pool->ring);
+ return page;
+}
+
+/* slow path */
+noinline
+static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
+ gfp_t _gfp)
+{
+ struct page *page;
+ gfp_t gfp = _gfp;
+ dma_addr_t dma;
+
+ /* We could always set __GFP_COMP, and avoid this branch, as
+ * prep_new_page() can handle order-0 with __GFP_COMP.
+ */
+ if (pool->p.order)
+ gfp |= __GFP_COMP;
+
+ /* FUTURE development:
+ *
+ * The current slow-path essentially falls back to single page
+ * allocations, which doesn't improve performance. This code
+ * needs bulk allocation support from the page allocator.
+ */
+
+ /* Cache was empty, do real allocation */
+ page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
+ if (!page)
+ return NULL;
+
+ if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+ goto skip_dma_map;
+
+ /* Setup DMA mapping: use page->private for DMA-addr
+ * This mapping is kept for lifetime of page, until leaving pool.
+ */
+ dma = dma_map_page(pool->p.dev, page, 0,
+ (PAGE_SIZE << pool->p.order),
+ pool->p.dma_dir);
+ if (dma_mapping_error(pool->p.dev, dma)) {
+ put_page(page);
+ return NULL;
+ }
+ set_page_private(page, dma); /* page->private = dma; */
+
+skip_dma_map:
+ /* A freshly allocated page should/must have refcnt 1. */
+ return page;
+}
+
+/* Used in place of alloc_pages() API calls, but provides a
+ * synchronization guarantee on the allocation side.
+ */
+struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
+{
+ struct page *page;
+
+ /* Fast-path: Get a page from cache */
+ page = __page_pool_get_cached(pool);
+ if (page)
+ return page;
+
+ /* Slow-path: cache empty, do real allocation */
+ page = __page_pool_alloc_pages_slow(pool, gfp);
+ return page;
+}
+EXPORT_SYMBOL(page_pool_alloc_pages);
+
+/* Cleanup page_pool state from page */
+static void __page_pool_clean_page(struct page_pool *pool,
+ struct page *page)
+{
+ if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+ return;
+
+ /* DMA unmap */
+ dma_unmap_page(pool->p.dev, page_private(page),
+ PAGE_SIZE << pool->p.order, pool->p.dma_dir);
+ set_page_private(page, 0);
+}
+
+/* Return a page to the page allocator, cleaning up our state */
+static void __page_pool_return_page(struct page_pool *pool, struct page *page)
+{
+ __page_pool_clean_page(pool, page);
+ put_page(page);
+ /* An optimization would be to call __free_pages(page, pool->p.order)
+ * knowing page is not part of page-cache (thus avoiding a
+ * __page_cache_release() call).
+ */
+}
+
+static bool __page_pool_recycle_into_ring(struct page_pool *pool,
+ struct page *page)
+{
+ int ret;
+ /* BH protection not needed if current is serving softirq */
+ if (in_serving_softirq())
+ ret = ptr_ring_produce(&pool->ring, page);
+ else
+ ret = ptr_ring_produce_bh(&pool->ring, page);
+
+ return (ret == 0) ? true : false;
+}
+
+/* Only allow direct recycling in special circumstances, into the
+ * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
+ *
+ * Caller must provide appropriate safe context.
+ */
+static bool __page_pool_recycle_direct(struct page *page,
+ struct page_pool *pool)
+{
+ if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
+ return false;
+
+ /* Caller MUST have verified/know (page_ref_count(page) == 1) */
+ pool->alloc.cache[pool->alloc.count++] = page;
+ return true;
+}
+
+void __page_pool_put_page(struct page_pool *pool,
+ struct page *page, bool allow_direct)
+{
+ /* This allocator is optimized for the XDP mode that uses
+ * one-frame-per-page, but has fallbacks that act like the
+ * regular page allocator APIs.
+ *
+ * refcnt == 1 means page_pool owns page, and can recycle it.
+ */
+ if (likely(page_ref_count(page) == 1)) {
+ /* Read barrier done in page_ref_count / READ_ONCE */
+
+ if (allow_direct && in_serving_softirq())
+ if (__page_pool_recycle_direct(page, pool))
+ return;
+
+ if (!__page_pool_recycle_into_ring(pool, page)) {
+ /* Cache full, fallback to free pages */
+ __page_pool_return_page(pool, page);
+ }
+ return;
+ }
+ /* Fallback/non-XDP mode: the API user has an elevated refcnt.
+ *
+ * Many drivers split up the page into fragments, and some
+ * want to keep doing this to save memory and do refcnt based
+ * recycling. Support this use case too, to ease drivers
+ * switching between XDP/non-XDP.
+ *
+ * In case page_pool maintains the DMA mapping, the API user must
+ * call page_pool_put_page() once. In this elevated-refcnt
+ * case, the DMA mapping is unmapped/released, as the driver is
+ * likely doing refcnt-based recycle tricks, meaning another party
+ * will be invoking put_page().
+ */
+ __page_pool_clean_page(pool, page);
+ put_page(page);
+}
+EXPORT_SYMBOL(__page_pool_put_page);
+
+static void __page_pool_empty_ring(struct page_pool *pool)
+{
+ struct page *page;
+
+ /* Empty recycle ring */
+ while ((page = ptr_ring_consume_bh(&pool->ring))) {
+ /* Verify the refcnt invariant of cached pages */
+ if (!(page_ref_count(page) == 1))
+ pr_crit("%s() page_pool refcnt %d violation\n",
+ __func__, page_ref_count(page));
+
+ __page_pool_return_page(pool, page);
+ }
+}
+
+static void __page_pool_destroy_rcu(struct rcu_head *rcu)
+{
+ struct page_pool *pool;
+
+ pool = container_of(rcu, struct page_pool, rcu);
+
+ WARN(pool->alloc.count, "API usage violation");
+
+ __page_pool_empty_ring(pool);
+ ptr_ring_cleanup(&pool->ring, NULL);
+ kfree(pool);
+}
+
+/* Cleanup and release resources */
+void page_pool_destroy(struct page_pool *pool)
+{
+ struct page *page;
+
+ /* Empty the alloc cache; assume the caller made sure the pool is
+ * no longer in use, and page_pool_alloc_pages() cannot be
+ * called concurrently.
+ */
+ while (pool->alloc.count) {
+ page = pool->alloc.cache[--pool->alloc.count];
+ __page_pool_return_page(pool, page);
+ }
+
+ /* No more consumers should exist, but producers could still
+ * be in-flight.
+ */
+ __page_pool_empty_ring(pool);
+
+ /* An xdp_mem_allocator can still ref page_pool pointer */
+ call_rcu(&pool->rcu, __page_pool_destroy_rcu);
+}
+EXPORT_SYMBOL(page_pool_destroy);
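To make the intended driver-side flow concrete (not part of this patch): a driver creates a pool per RX queue, allocates pages from it, and returns them through __page_pool_put_page(), calling page_pool_destroy() once nothing references the pool. The parameter values below are placeholders.

static struct page_pool *example_create_pool(struct device *dma_dev)
{
	struct page_pool_params pp_params = {
		.flags		= PP_FLAG_DMA_MAP,
		.order		= 0,
		.pool_size	= 256,
		.nid		= NUMA_NO_NODE,
		.dev		= dma_dev,
		.dma_dir	= DMA_FROM_DEVICE,
	};

	return page_pool_create(&pp_params);	/* ERR_PTR() on failure */
}

static void example_rx_refill_one(struct page_pool *pool)
{
	struct page *page = page_pool_alloc_pages(pool, GFP_ATOMIC);

	if (!page)
		return;

	/* Hand the page (DMA address kept in page->private) to the HW RX
	 * ring here; once the frame is consumed or dropped, recycle it:
	 */
	__page_pool_put_page(pool, page, false);
}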
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
new file mode 100644
index 000000000..3714cd9e3
--- /dev/null
+++ b/net/core/pktgen.c
@@ -0,0 +1,3903 @@
+/*
+ * Authors:
+ * Copyright 2001, 2002 by Robert Olsson <robert.olsson@its.uu.se>
+ * Uppsala University and
+ * Swedish University of Agricultural Sciences
+ *
+ * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ * Ben Greear <greearb@candelatech.com>
+ * Jens Låås <jens.laas@data.slu.se>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ *
+ * A tool for loading the network with preconfigured packets.
+ * The tool is implemented as a Linux module. Parameters are output
+ * device, delay (to hard_xmit), number of packets, and whether
+ * to use multiple SKBs or just the same one.
+ * pktgen uses the installed interface's output routine.
+ *
+ * Additional hacking by:
+ *
+ * Jens.Laas@data.slu.se
+ * Improved by ANK. 010120.
+ * Improved by ANK even more. 010212.
+ * MAC address typo fixed. 010417 --ro
+ * Integrated. 020301 --DaveM
+ * Added multiskb option 020301 --DaveM
+ * Scaling of results. 020417--sigurdur@linpro.no
+ * Significant re-work of the module:
+ * * Convert to threaded model to more efficiently be able to transmit
+ * and receive on multiple interfaces at once.
+ * * Converted many counters to __u64 to allow longer runs.
+ * * Allow configuration of ranges, like min/max IP address, MACs,
+ * and UDP-ports, for both source and destination, and can
+ * set to use a random distribution or sequentially walk the range.
+ * * Can now change most values after starting.
+ * * Place 12-byte packet in UDP payload with magic number,
+ * sequence number, and timestamp.
+ * * Add receiver code that detects dropped pkts, re-ordered pkts, and
+ * latencies (with micro-second) precision.
+ * * Add IOCTL interface to easily get counters & configuration.
+ * --Ben Greear <greearb@candelatech.com>
+ *
+ * Renamed multiskb to clone_skb and cleaned up sending core for two distinct
+ * skb modes. A clone_skb=0 mode for Ben "ranges" work and a clone_skb != 0
+ * as a "fastpath" with a configurable number of clones after alloc's.
+ * clone_skb=0 means all packets are allocated; this also means ranges, time
+ * stamps, etc. can be used. clone_skb=100 means 1 malloc is followed by 100
+ * clones.
+ *
+ * Also moved to /proc/net/pktgen/
+ * --ro
+ *
+ * Sept 10: Fixed threading/locking. Lots of bone-headed and more clever
+ * mistakes. Also merged in DaveM's patch in the -pre6 patch.
+ * --Ben Greear <greearb@candelatech.com>
+ *
+ * Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br)
+ *
+ *
+ * 021124 Finished major redesign and rewrite for new functionality.
+ * See Documentation/networking/pktgen.txt for how to use this.
+ *
+ * The new operation:
+ * For each CPU one thread/process is created at start. This thread checks
+ * for running devices in the if_list and sends packets until count is 0;
+ * the thread also checks thread->control, which is used for inter-process
+ * communication. The controlling process "posts" operations to the threads
+ * this way (see the usage sketch after this header comment).
+ * The if_list is RCU protected, and the if_lock remains to protect updating
+ * of if_list from "add_device", as that is invoked from userspace (via proc write).
+ *
+ * By design there should only be *one* "controlling" process. In practice
+ * multiple write accesses give unpredictable results. Note that a "write"
+ * to /proc returns a result code that should be read back by the "writer".
+ * For practical use this should be no problem.
+ *
+ * Note: when adding devices to a specific CPU it is a good idea to also set
+ * /proc/irq/XX/smp_affinity so that TX interrupts get bound to the same CPU.
+ * --ro
+ *
+ * Fix refcount off by one if first packet fails, potential null deref,
+ * memleak 030710- KJP
+ *
+ * First "ranges" functionality for ipv6 030726 --ro
+ *
+ * Included flow support. 030802 ANK.
+ *
+ * Fixed unaligned access on IA-64 Grant Grundler <grundler@parisc-linux.org>
+ *
+ * Remove if fix from added Harald Welte <laforge@netfilter.org> 040419
+ * ia64 compilation fix from Aron Griffis <aron@hp.com> 040604
+ *
+ * New xmit() return, do_div and misc clean up by Stephen Hemminger
+ * <shemminger@osdl.org> 040923
+ *
+ * Randy Dunlap fixed u64 printk compiler warning
+ *
+ * Remove FCS from BW calculation. Lennert Buytenhek <buytenh@wantstofly.org>
+ * New time handling. Lennert Buytenhek <buytenh@wantstofly.org> 041213
+ *
+ * Corrections from Nikolai Malykh (nmalykh@bilim.com)
+ * Removed unused flags F_SET_SRCMAC & F_SET_SRCIP 041230
+ *
+ * interruptible_sleep_on_timeout() replaced Nishanth Aravamudan <nacc@us.ibm.com>
+ * 050103
+ *
+ * MPLS support by Steven Whitehouse <steve@chygwyn.com>
+ *
+ * 802.1Q/Q-in-Q support by Francesco Fondelli (FF) <francesco.fondelli@gmail.com>
+ *
+ * Fixed src_mac command to set source mac of packet to value specified in
+ * command by Adit Ranadive <adit.262@gmail.com>
+ *
+ */
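+
+/*
+ * Illustrative usage sketch: a controlling process drives pktgen purely by
+ * writing command strings to the /proc files created below.  With a small,
+ * hypothetical userspace helper (#includes and error handling omitted):
+ *
+ *	static void pg_cmd(const char *path, const char *cmd)
+ *	{
+ *		int fd = open(path, O_WRONLY);
+ *
+ *		if (fd >= 0) {
+ *			write(fd, cmd, strlen(cmd));
+ *			close(fd);
+ *		}
+ *	}
+ *
+ * a typical run then looks like (assuming the per-CPU thread file kpktgend_0):
+ *
+ *	pg_cmd("/proc/net/pktgen/kpktgend_0", "add_device eth0");
+ *	pg_cmd("/proc/net/pktgen/eth0", "count 100000");
+ *	pg_cmd("/proc/net/pktgen/eth0", "dst 10.0.0.1");
+ *	pg_cmd("/proc/net/pktgen/pgctrl", "start");
+ *
+ * Results are read back from the same per-device /proc file.
+ */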
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/sys.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/unistd.h>
+#include <linux/string.h>
+#include <linux/ptrace.h>
+#include <linux/errno.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/capability.h>
+#include <linux/hrtimer.h>
+#include <linux/freezer.h>
+#include <linux/delay.h>
+#include <linux/timer.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/udp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/wait.h>
+#include <linux/etherdevice.h>
+#include <linux/kthread.h>
+#include <linux/prefetch.h>
+#include <net/net_namespace.h>
+#include <net/checksum.h>
+#include <net/ipv6.h>
+#include <net/udp.h>
+#include <net/ip6_checksum.h>
+#include <net/addrconf.h>
+#ifdef CONFIG_XFRM
+#include <net/xfrm.h>
+#endif
+#include <net/netns/generic.h>
+#include <asm/byteorder.h>
+#include <linux/rcupdate.h>
+#include <linux/bitops.h>
+#include <linux/io.h>
+#include <linux/timex.h>
+#include <linux/uaccess.h>
+#include <asm/dma.h>
+#include <asm/div64.h> /* do_div */
+
+#define VERSION "2.75"
+#define IP_NAME_SZ 32
+#define MAX_MPLS_LABELS 16 /* This is the max label stack depth */
+#define MPLS_STACK_BOTTOM htonl(0x00000100)
+
+#define func_enter() pr_debug("entering %s\n", __func__);
+
+#define PKT_FLAGS \
+ pf(IPV6) /* Interface in IPV6 Mode */ \
+ pf(IPSRC_RND) /* IP-Src Random */ \
+ pf(IPDST_RND) /* IP-Dst Random */ \
+ pf(TXSIZE_RND) /* Transmit size is random */ \
+ pf(UDPSRC_RND) /* UDP-Src Random */ \
+ pf(UDPDST_RND) /* UDP-Dst Random */ \
+ pf(UDPCSUM) /* Include UDP checksum */ \
+ pf(NO_TIMESTAMP) /* Don't timestamp packets (default TS) */ \
+ pf(MPLS_RND) /* Random MPLS labels */ \
+ pf(QUEUE_MAP_RND) /* queue map Random */ \
+ pf(QUEUE_MAP_CPU) /* queue map mirrors smp_processor_id() */ \
+ pf(FLOW_SEQ) /* Sequential flows */ \
+ pf(IPSEC) /* ipsec on for flows */ \
+ pf(MACSRC_RND) /* MAC-Src Random */ \
+ pf(MACDST_RND) /* MAC-Dst Random */ \
+ pf(VID_RND) /* Random VLAN ID */ \
+ pf(SVID_RND) /* Random SVLAN ID */ \
+ pf(NODE) /* Node memory alloc*/ \
+
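+/* PKT_FLAGS is an "X-macro" list: pf() is redefined three times below to
+ * expand the same list into (1) the *_SHIFT enum of bit positions, (2) the
+ * F_* bit masks used in pkt_dev->flags and (3) the pkt_flag_names[] strings
+ * shown in /proc.  For example, pf(IPV6) yields IPV6_SHIFT,
+ * F_IPV6 == (1 << IPV6_SHIFT) and the name string "IPV6".
+ */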
+#define pf(flag) flag##_SHIFT,
+enum pkt_flags {
+ PKT_FLAGS
+};
+#undef pf
+
+/* Device flag bits */
+#define pf(flag) static const __u32 F_##flag = (1<<flag##_SHIFT);
+PKT_FLAGS
+#undef pf
+
+#define pf(flag) __stringify(flag),
+static char *pkt_flag_names[] = {
+ PKT_FLAGS
+};
+#undef pf
+
+#define NR_PKT_FLAGS ARRAY_SIZE(pkt_flag_names)
+
+/* Thread control flag bits */
+#define T_STOP (1<<0) /* Stop run */
+#define T_RUN (1<<1) /* Start run */
+#define T_REMDEVALL (1<<2) /* Remove all devs */
+#define T_REMDEV (1<<3) /* Remove one dev */
+
+/* Xmit modes */
+#define M_START_XMIT 0 /* Default normal TX */
+#define M_NETIF_RECEIVE 1 /* Inject packets into stack */
+#define M_QUEUE_XMIT 2 /* Inject packet into qdisc */
+
+/* If lock -- protects updating of if_list */
+#define if_lock(t) mutex_lock(&(t->if_lock));
+#define if_unlock(t) mutex_unlock(&(t->if_lock));
+
+/* Used to help with determining the pkts on receive */
+#define PKTGEN_MAGIC 0xbe9be955
+#define PG_PROC_DIR "pktgen"
+#define PGCTRL "pgctrl"
+
+#define MAX_CFLOWS 65536
+
+#define VLAN_TAG_SIZE(x) ((x)->vlan_id == 0xffff ? 0 : 4)
+#define SVLAN_TAG_SIZE(x) ((x)->svlan_id == 0xffff ? 0 : 4)
+
+struct flow_state {
+ __be32 cur_daddr;
+ int count;
+#ifdef CONFIG_XFRM
+ struct xfrm_state *x;
+#endif
+ __u32 flags;
+};
+
+/* flow flag bits */
+#define F_INIT (1<<0) /* flow has been initialized */
+
+struct pktgen_dev {
+ /*
+ * Try to keep frequently and infrequently used vars separated.
+ */
+ struct proc_dir_entry *entry; /* proc file */
+ struct pktgen_thread *pg_thread;/* the owner */
+ struct list_head list; /* chaining in the thread's run-queue */
+ struct rcu_head rcu; /* freed by RCU */
+
+ int running; /* if false, the test will stop */
+
+ /* If min != max, then we will either do a linear iteration, or
+ * we will do a random selection from within the range.
+ */
+ __u32 flags;
+ int xmit_mode;
+ int min_pkt_size;
+ int max_pkt_size;
+ int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */
+ int nfrags;
+ int removal_mark; /* non-zero => the device is marked for
+ * removal by worker thread */
+
+ struct page *page;
+ u64 delay; /* nano-seconds */
+
+ __u64 count; /* Number of packets to send */
+ __u64 sofar; /* How many pkts we've sent so far */
+ __u64 tx_bytes; /* How many bytes we've transmitted */
+ __u64 errors; /* Errors when trying to transmit */
+
+ /* runtime counters relating to clone_skb */
+
+ __u32 clone_count;
+ int last_ok; /* Was the last skb sent successfully,
+ * or did the transmit fail somehow?
+ * Used to keep sequence numbers in order.
+ */
+ ktime_t next_tx;
+ ktime_t started_at;
+ ktime_t stopped_at;
+ u64 idle_acc; /* nano-seconds */
+
+ __u32 seq_num;
+
+ int clone_skb; /*
+ * Use multiple SKBs during packet gen.
+ * If this number is greater than 1, then
+ * that many copies of the same packet will be
+ * sent before a new packet is allocated.
+ * If you want to send 1024 identical packets
+ * before creating a new packet,
+ * set clone_skb to 1024.
+ */
+
+ char dst_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
+ char dst_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
+ char src_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
+ char src_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
+
+ struct in6_addr in6_saddr;
+ struct in6_addr in6_daddr;
+ struct in6_addr cur_in6_daddr;
+ struct in6_addr cur_in6_saddr;
+ /* For ranges */
+ struct in6_addr min_in6_daddr;
+ struct in6_addr max_in6_daddr;
+ struct in6_addr min_in6_saddr;
+ struct in6_addr max_in6_saddr;
+
+ /* If we're doing ranges, random or incremental, then this
+ * defines the min/max for those ranges.
+ */
+ __be32 saddr_min; /* inclusive, source IP address */
+ __be32 saddr_max; /* exclusive, source IP address */
+ __be32 daddr_min; /* inclusive, dest IP address */
+ __be32 daddr_max; /* exclusive, dest IP address */
+
+ __u16 udp_src_min; /* inclusive, source UDP port */
+ __u16 udp_src_max; /* exclusive, source UDP port */
+ __u16 udp_dst_min; /* inclusive, dest UDP port */
+ __u16 udp_dst_max; /* exclusive, dest UDP port */
+
+ /* DSCP + ECN */
+ __u8 tos; /* six MSB of (former) IPv4 TOS
+ are for dscp codepoint */
+ __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6
+ (see RFC 3260, sec. 4) */
+
+ /* MPLS */
+ unsigned int nr_labels; /* Depth of stack, 0 = no MPLS */
+ __be32 labels[MAX_MPLS_LABELS];
+
+ /* VLAN/SVLAN (802.1Q/Q-in-Q) */
+ __u8 vlan_p;
+ __u8 vlan_cfi;
+ __u16 vlan_id; /* 0xffff means no vlan tag */
+
+ __u8 svlan_p;
+ __u8 svlan_cfi;
+ __u16 svlan_id; /* 0xffff means no svlan tag */
+
+ __u32 src_mac_count; /* How many MACs to iterate through */
+ __u32 dst_mac_count; /* How many MACs to iterate through */
+
+ unsigned char dst_mac[ETH_ALEN];
+ unsigned char src_mac[ETH_ALEN];
+
+ __u32 cur_dst_mac_offset;
+ __u32 cur_src_mac_offset;
+ __be32 cur_saddr;
+ __be32 cur_daddr;
+ __u16 ip_id;
+ __u16 cur_udp_dst;
+ __u16 cur_udp_src;
+ __u16 cur_queue_map;
+ __u32 cur_pkt_size;
+ __u32 last_pkt_size;
+
+ __u8 hh[14];
+ /* = {
+ 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB,
+
+ We fill in SRC address later
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x08, 0x00
+ };
+ */
+ __u16 pad; /* pad out the hh struct to an even 16 bytes */
+
+ struct sk_buff *skb; /* skb we are to transmit next, used for when we
+ * are transmitting the same one multiple times
+ */
+ struct net_device *odev; /* The out-going device.
+ * Note that the device should have its
+ * pg_info pointer pointing back to this
+ * device.
+ * Set when the user specifies the out-going
+ * device name (not when the injection is
+ * started, as it used to be).
+ */
+ char odevname[32];
+ struct flow_state *flows;
+ unsigned int cflows; /* Concurrent flows (config) */
+ unsigned int lflow; /* Flow length (config) */
+ unsigned int nflows; /* accumulated flows (stats) */
+ unsigned int curfl; /* current sequenced flow (state)*/
+
+ u16 queue_map_min;
+ u16 queue_map_max;
+ __u32 skb_priority; /* skb priority field */
+ unsigned int burst; /* number of duplicated packets to burst */
+ int node; /* Memory node */
+
+#ifdef CONFIG_XFRM
+ __u8 ipsmode; /* IPSEC mode (config) */
+ __u8 ipsproto; /* IPSEC type (config) */
+ __u32 spi;
+ struct xfrm_dst xdst;
+ struct dst_ops dstops;
+#endif
+ char result[512];
+};
+
+struct pktgen_hdr {
+ __be32 pgh_magic;
+ __be32 seq_num;
+ __be32 tv_sec;
+ __be32 tv_usec;
+};
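+
+/* This header is written at the start of the UDP payload of each generated
+ * packet: pgh_magic carries htonl(PKTGEN_MAGIC) and seq_num a per-device
+ * sequence number; the timestamp fields are only filled in unless the
+ * NO_TIMESTAMP flag is set.  A receiver can recognize pktgen traffic with a
+ * check along these lines (illustrative sketch, pgh pointing at the UDP
+ * payload, record_seq() standing in for whatever the receiver does with it):
+ *
+ *	const struct pktgen_hdr *pgh = udp_payload;
+ *
+ *	if (pgh->pgh_magic == htonl(PKTGEN_MAGIC))
+ *		record_seq(ntohl(pgh->seq_num));
+ */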
+
+
+static unsigned int pg_net_id __read_mostly;
+
+struct pktgen_net {
+ struct net *net;
+ struct proc_dir_entry *proc_dir;
+ struct list_head pktgen_threads;
+ bool pktgen_exiting;
+};
+
+struct pktgen_thread {
+ struct mutex if_lock; /* for list of devices */
+ struct list_head if_list; /* All devices here */
+ struct list_head th_list;
+ struct task_struct *tsk;
+ char result[512];
+
+ /* Field for the thread to receive "posted" events: terminate,
+ stop ifs, etc. */
+
+ u32 control;
+ int cpu;
+
+ wait_queue_head_t queue;
+ struct completion start_done;
+ struct pktgen_net *net;
+};
+
+#define REMOVE 1
+#define FIND 0
+
+static const char version[] =
+ "Packet Generator for packet performance testing. "
+ "Version: " VERSION "\n";
+
+static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i);
+static int pktgen_add_device(struct pktgen_thread *t, const char *ifname);
+static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
+ const char *ifname, bool exact);
+static int pktgen_device_event(struct notifier_block *, unsigned long, void *);
+static void pktgen_run_all_threads(struct pktgen_net *pn);
+static void pktgen_reset_all_threads(struct pktgen_net *pn);
+static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn);
+
+static void pktgen_stop(struct pktgen_thread *t);
+static void pktgen_clear_counters(struct pktgen_dev *pkt_dev);
+
+/* Module parameters, defaults. */
+static int pg_count_d __read_mostly = 1000;
+static int pg_delay_d __read_mostly;
+static int pg_clone_skb_d __read_mostly;
+static int debug __read_mostly;
+
+static DEFINE_MUTEX(pktgen_thread_lock);
+
+static struct notifier_block pktgen_notifier_block = {
+ .notifier_call = pktgen_device_event,
+};
+
+/*
+ * /proc handling functions
+ *
+ */
+
+static int pgctrl_show(struct seq_file *seq, void *v)
+{
+ seq_puts(seq, version);
+ return 0;
+}
+
+static ssize_t pgctrl_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char data[128];
+ struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id);
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (count == 0)
+ return -EINVAL;
+
+ if (count > sizeof(data))
+ count = sizeof(data);
+
+ if (copy_from_user(data, buf, count))
+ return -EFAULT;
+
+ data[count - 1] = 0; /* Strip trailing '\n' and terminate string */
+
+ if (!strcmp(data, "stop"))
+ pktgen_stop_all_threads_ifs(pn);
+
+ else if (!strcmp(data, "start"))
+ pktgen_run_all_threads(pn);
+
+ else if (!strcmp(data, "reset"))
+ pktgen_reset_all_threads(pn);
+
+ else
+ return -EINVAL;
+
+ return count;
+}
+
+static int pgctrl_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pgctrl_show, PDE_DATA(inode));
+}
+
+static const struct file_operations pktgen_fops = {
+ .open = pgctrl_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = pgctrl_write,
+ .release = single_release,
+};
+
+static int pktgen_if_show(struct seq_file *seq, void *v)
+{
+ const struct pktgen_dev *pkt_dev = seq->private;
+ ktime_t stopped;
+ unsigned int i;
+ u64 idle;
+
+ seq_printf(seq,
+ "Params: count %llu min_pkt_size: %u max_pkt_size: %u\n",
+ (unsigned long long)pkt_dev->count, pkt_dev->min_pkt_size,
+ pkt_dev->max_pkt_size);
+
+ seq_printf(seq,
+ " frags: %d delay: %llu clone_skb: %d ifname: %s\n",
+ pkt_dev->nfrags, (unsigned long long) pkt_dev->delay,
+ pkt_dev->clone_skb, pkt_dev->odevname);
+
+ seq_printf(seq, " flows: %u flowlen: %u\n", pkt_dev->cflows,
+ pkt_dev->lflow);
+
+ seq_printf(seq,
+ " queue_map_min: %u queue_map_max: %u\n",
+ pkt_dev->queue_map_min,
+ pkt_dev->queue_map_max);
+
+ if (pkt_dev->skb_priority)
+ seq_printf(seq, " skb_priority: %u\n",
+ pkt_dev->skb_priority);
+
+ if (pkt_dev->flags & F_IPV6) {
+ seq_printf(seq,
+ " saddr: %pI6c min_saddr: %pI6c max_saddr: %pI6c\n"
+ " daddr: %pI6c min_daddr: %pI6c max_daddr: %pI6c\n",
+ &pkt_dev->in6_saddr,
+ &pkt_dev->min_in6_saddr, &pkt_dev->max_in6_saddr,
+ &pkt_dev->in6_daddr,
+ &pkt_dev->min_in6_daddr, &pkt_dev->max_in6_daddr);
+ } else {
+ seq_printf(seq,
+ " dst_min: %s dst_max: %s\n",
+ pkt_dev->dst_min, pkt_dev->dst_max);
+ seq_printf(seq,
+ " src_min: %s src_max: %s\n",
+ pkt_dev->src_min, pkt_dev->src_max);
+ }
+
+ seq_puts(seq, " src_mac: ");
+
+ seq_printf(seq, "%pM ",
+ is_zero_ether_addr(pkt_dev->src_mac) ?
+ pkt_dev->odev->dev_addr : pkt_dev->src_mac);
+
+ seq_puts(seq, "dst_mac: ");
+ seq_printf(seq, "%pM\n", pkt_dev->dst_mac);
+
+ seq_printf(seq,
+ " udp_src_min: %d udp_src_max: %d"
+ " udp_dst_min: %d udp_dst_max: %d\n",
+ pkt_dev->udp_src_min, pkt_dev->udp_src_max,
+ pkt_dev->udp_dst_min, pkt_dev->udp_dst_max);
+
+ seq_printf(seq,
+ " src_mac_count: %d dst_mac_count: %d\n",
+ pkt_dev->src_mac_count, pkt_dev->dst_mac_count);
+
+ if (pkt_dev->nr_labels) {
+ seq_puts(seq, " mpls: ");
+ for (i = 0; i < pkt_dev->nr_labels; i++)
+ seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]),
+ i == pkt_dev->nr_labels-1 ? "\n" : ", ");
+ }
+
+ if (pkt_dev->vlan_id != 0xffff)
+ seq_printf(seq, " vlan_id: %u vlan_p: %u vlan_cfi: %u\n",
+ pkt_dev->vlan_id, pkt_dev->vlan_p,
+ pkt_dev->vlan_cfi);
+
+ if (pkt_dev->svlan_id != 0xffff)
+ seq_printf(seq, " svlan_id: %u vlan_p: %u vlan_cfi: %u\n",
+ pkt_dev->svlan_id, pkt_dev->svlan_p,
+ pkt_dev->svlan_cfi);
+
+ if (pkt_dev->tos)
+ seq_printf(seq, " tos: 0x%02x\n", pkt_dev->tos);
+
+ if (pkt_dev->traffic_class)
+ seq_printf(seq, " traffic_class: 0x%02x\n", pkt_dev->traffic_class);
+
+ if (pkt_dev->burst > 1)
+ seq_printf(seq, " burst: %d\n", pkt_dev->burst);
+
+ if (pkt_dev->node >= 0)
+ seq_printf(seq, " node: %d\n", pkt_dev->node);
+
+ if (pkt_dev->xmit_mode == M_NETIF_RECEIVE)
+ seq_puts(seq, " xmit_mode: netif_receive\n");
+ else if (pkt_dev->xmit_mode == M_QUEUE_XMIT)
+ seq_puts(seq, " xmit_mode: xmit_queue\n");
+
+ seq_puts(seq, " Flags: ");
+
+ for (i = 0; i < NR_PKT_FLAGS; i++) {
+ if (i == F_FLOW_SEQ)
+ if (!pkt_dev->cflows)
+ continue;
+
+ if (pkt_dev->flags & (1 << i))
+ seq_printf(seq, "%s ", pkt_flag_names[i]);
+ else if (i == F_FLOW_SEQ)
+ seq_puts(seq, "FLOW_RND ");
+
+#ifdef CONFIG_XFRM
+ if (i == F_IPSEC && pkt_dev->spi)
+ seq_printf(seq, "spi:%u", pkt_dev->spi);
+#endif
+ }
+
+ seq_puts(seq, "\n");
+
+ /* not really stopped, more like last-running-at */
+ stopped = pkt_dev->running ? ktime_get() : pkt_dev->stopped_at;
+ idle = pkt_dev->idle_acc;
+ do_div(idle, NSEC_PER_USEC);
+
+ seq_printf(seq,
+ "Current:\n pkts-sofar: %llu errors: %llu\n",
+ (unsigned long long)pkt_dev->sofar,
+ (unsigned long long)pkt_dev->errors);
+
+ seq_printf(seq,
+ " started: %lluus stopped: %lluus idle: %lluus\n",
+ (unsigned long long) ktime_to_us(pkt_dev->started_at),
+ (unsigned long long) ktime_to_us(stopped),
+ (unsigned long long) idle);
+
+ seq_printf(seq,
+ " seq_num: %d cur_dst_mac_offset: %d cur_src_mac_offset: %d\n",
+ pkt_dev->seq_num, pkt_dev->cur_dst_mac_offset,
+ pkt_dev->cur_src_mac_offset);
+
+ if (pkt_dev->flags & F_IPV6) {
+ seq_printf(seq, " cur_saddr: %pI6c cur_daddr: %pI6c\n",
+ &pkt_dev->cur_in6_saddr,
+ &pkt_dev->cur_in6_daddr);
+ } else
+ seq_printf(seq, " cur_saddr: %pI4 cur_daddr: %pI4\n",
+ &pkt_dev->cur_saddr, &pkt_dev->cur_daddr);
+
+ seq_printf(seq, " cur_udp_dst: %d cur_udp_src: %d\n",
+ pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src);
+
+ seq_printf(seq, " cur_queue_map: %u\n", pkt_dev->cur_queue_map);
+
+ seq_printf(seq, " flows: %u\n", pkt_dev->nflows);
+
+ if (pkt_dev->result[0])
+ seq_printf(seq, "Result: %s\n", pkt_dev->result);
+ else
+ seq_puts(seq, "Result: Idle\n");
+
+ return 0;
+}
+
+
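+/* Small helpers for parsing the /proc command strings: each one scans at
+ * most maxlen characters of the user buffer and returns the number of
+ * characters it handled (or -EFAULT if the user buffer could not be read).
+ */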
+static int hex32_arg(const char __user *user_buffer, unsigned long maxlen,
+ __u32 *num)
+{
+ int i = 0;
+ *num = 0;
+
+ for (; i < maxlen; i++) {
+ int value;
+ char c;
+ *num <<= 4;
+ if (get_user(c, &user_buffer[i]))
+ return -EFAULT;
+ value = hex_to_bin(c);
+ if (value >= 0)
+ *num |= value;
+ else
+ break;
+ }
+ return i;
+}
+
+static int count_trail_chars(const char __user * user_buffer,
+ unsigned int maxlen)
+{
+ int i;
+
+ for (i = 0; i < maxlen; i++) {
+ char c;
+ if (get_user(c, &user_buffer[i]))
+ return -EFAULT;
+ switch (c) {
+ case '\"':
+ case '\n':
+ case '\r':
+ case '\t':
+ case ' ':
+ case '=':
+ break;
+ default:
+ goto done;
+ }
+ }
+done:
+ return i;
+}
+
+static long num_arg(const char __user *user_buffer, unsigned long maxlen,
+ unsigned long *num)
+{
+ int i;
+ *num = 0;
+
+ for (i = 0; i < maxlen; i++) {
+ char c;
+ if (get_user(c, &user_buffer[i]))
+ return -EFAULT;
+ if ((c >= '0') && (c <= '9')) {
+ *num *= 10;
+ *num += c - '0';
+ } else
+ break;
+ }
+ return i;
+}
+
+static int strn_len(const char __user * user_buffer, unsigned int maxlen)
+{
+ int i;
+
+ for (i = 0; i < maxlen; i++) {
+ char c;
+ if (get_user(c, &user_buffer[i]))
+ return -EFAULT;
+ switch (c) {
+ case '\"':
+ case '\n':
+ case '\r':
+ case '\t':
+ case ' ':
+ goto done_str;
+ default:
+ break;
+ }
+ }
+done_str:
+ return i;
+}
+
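+/* Parse the comma-separated list of hex MPLS labels written to the "mpls"
+ * command below (e.g. "0001000a,0002000a").  Returns the number of
+ * characters consumed, or a negative value on error.
+ */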
+static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev)
+{
+ unsigned int n = 0;
+ char c;
+ ssize_t i = 0;
+ int len;
+
+ pkt_dev->nr_labels = 0;
+ do {
+ __u32 tmp;
+ len = hex32_arg(&buffer[i], 8, &tmp);
+ if (len <= 0)
+ return len;
+ pkt_dev->labels[n] = htonl(tmp);
+ if (pkt_dev->labels[n] & MPLS_STACK_BOTTOM)
+ pkt_dev->flags |= F_MPLS_RND;
+ i += len;
+ if (get_user(c, &buffer[i]))
+ return -EFAULT;
+ i++;
+ n++;
+ if (n >= MAX_MPLS_LABELS)
+ return -E2BIG;
+ } while (c == ',');
+
+ pkt_dev->nr_labels = n;
+ return i;
+}
+
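+/* Map a flag name written to the "flag" command onto its F_* bit; a leading
+ * '!' requests clearing the flag instead of setting it.  "FLOW_RND" is
+ * accepted as the inverse of FLOW_SEQ.  Returns 0 for unknown names.
+ */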
+static __u32 pktgen_read_flag(const char *f, bool *disable)
+{
+ __u32 i;
+
+ if (f[0] == '!') {
+ *disable = true;
+ f++;
+ }
+
+ for (i = 0; i < NR_PKT_FLAGS; i++) {
+ if (!IS_ENABLED(CONFIG_XFRM) && i == IPSEC_SHIFT)
+ continue;
+
+ /* allow only disabling ipv6 flag */
+ if (!*disable && i == IPV6_SHIFT)
+ continue;
+
+ if (strcmp(f, pkt_flag_names[i]) == 0)
+ return 1 << i;
+ }
+
+ if (strcmp(f, "FLOW_RND") == 0) {
+ *disable = !*disable;
+ return F_FLOW_SEQ;
+ }
+
+ return 0;
+}
+
+static ssize_t pktgen_if_write(struct file *file,
+ const char __user * user_buffer, size_t count,
+ loff_t * offset)
+{
+ struct seq_file *seq = file->private_data;
+ struct pktgen_dev *pkt_dev = seq->private;
+ int i, max, len;
+ char name[16], valstr[32];
+ unsigned long value = 0;
+ char *pg_result = NULL;
+ int tmp = 0;
+ char buf[128];
+
+ pg_result = &(pkt_dev->result[0]);
+
+ if (count < 1) {
+ pr_warn("wrong command format\n");
+ return -EINVAL;
+ }
+
+ max = count;
+ tmp = count_trail_chars(user_buffer, max);
+ if (tmp < 0) {
+ pr_warn("illegal format\n");
+ return tmp;
+ }
+ i = tmp;
+
+ /* Read variable name */
+
+ len = strn_len(&user_buffer[i], sizeof(name) - 1);
+ if (len < 0)
+ return len;
+
+ memset(name, 0, sizeof(name));
+ if (copy_from_user(name, &user_buffer[i], len))
+ return -EFAULT;
+ i += len;
+
+ max = count - i;
+ len = count_trail_chars(&user_buffer[i], max);
+ if (len < 0)
+ return len;
+
+ i += len;
+
+ if (debug) {
+ size_t copy = min_t(size_t, count + 1, 1024);
+ char *tp = strndup_user(user_buffer, copy);
+
+ if (IS_ERR(tp))
+ return PTR_ERR(tp);
+
+ pr_debug("%s,%zu buffer -:%s:-\n", name, count, tp);
+ kfree(tp);
+ }
+
+ if (!strcmp(name, "min_pkt_size")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value < 14 + 20 + 8)
+ value = 14 + 20 + 8;
+ if (value != pkt_dev->min_pkt_size) {
+ pkt_dev->min_pkt_size = value;
+ pkt_dev->cur_pkt_size = value;
+ }
+ sprintf(pg_result, "OK: min_pkt_size=%u",
+ pkt_dev->min_pkt_size);
+ return count;
+ }
+
+ if (!strcmp(name, "max_pkt_size")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value < 14 + 20 + 8)
+ value = 14 + 20 + 8;
+ if (value != pkt_dev->max_pkt_size) {
+ pkt_dev->max_pkt_size = value;
+ pkt_dev->cur_pkt_size = value;
+ }
+ sprintf(pg_result, "OK: max_pkt_size=%u",
+ pkt_dev->max_pkt_size);
+ return count;
+ }
+
+ /* Shortcut for min = max */
+
+ if (!strcmp(name, "pkt_size")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value < 14 + 20 + 8)
+ value = 14 + 20 + 8;
+ if (value != pkt_dev->min_pkt_size) {
+ pkt_dev->min_pkt_size = value;
+ pkt_dev->max_pkt_size = value;
+ pkt_dev->cur_pkt_size = value;
+ }
+ sprintf(pg_result, "OK: pkt_size=%u", pkt_dev->min_pkt_size);
+ return count;
+ }
+
+ if (!strcmp(name, "debug")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ debug = value;
+ sprintf(pg_result, "OK: debug=%u", debug);
+ return count;
+ }
+
+ if (!strcmp(name, "frags")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ pkt_dev->nfrags = value;
+ sprintf(pg_result, "OK: frags=%u", pkt_dev->nfrags);
+ return count;
+ }
+ if (!strcmp(name, "delay")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value == 0x7FFFFFFF)
+ pkt_dev->delay = ULLONG_MAX;
+ else
+ pkt_dev->delay = (u64)value;
+
+ sprintf(pg_result, "OK: delay=%llu",
+ (unsigned long long) pkt_dev->delay);
+ return count;
+ }
+ if (!strcmp(name, "rate")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (!value)
+ return len;
+ pkt_dev->delay = pkt_dev->min_pkt_size*8*NSEC_PER_USEC/value;
+ if (debug)
+ pr_info("Delay set at: %llu ns\n", pkt_dev->delay);
+
+ sprintf(pg_result, "OK: rate=%lu", value);
+ return count;
+ }
+ if (!strcmp(name, "ratep")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (!value)
+ return len;
+ pkt_dev->delay = NSEC_PER_SEC/value;
+ if (debug)
+ pr_info("Delay set at: %llu ns\n", pkt_dev->delay);
+
+ sprintf(pg_result, "OK: rate=%lu", value);
+ return count;
+ }
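+ /* Worked example for the two commands above, assuming min_pkt_size=1500:
+ * "rate 1000" sets delay = 1500*8*1000/1000 = 12000 ns between packets
+ * (~83k pps, roughly 1 Gbit/s of payload), while "ratep 10000" sets
+ * delay = 10^9/10000 = 100000 ns, i.e. 10000 packets per second.
+ */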
+ if (!strcmp(name, "udp_src_min")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value != pkt_dev->udp_src_min) {
+ pkt_dev->udp_src_min = value;
+ pkt_dev->cur_udp_src = value;
+ }
+ sprintf(pg_result, "OK: udp_src_min=%u", pkt_dev->udp_src_min);
+ return count;
+ }
+ if (!strcmp(name, "udp_dst_min")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value != pkt_dev->udp_dst_min) {
+ pkt_dev->udp_dst_min = value;
+ pkt_dev->cur_udp_dst = value;
+ }
+ sprintf(pg_result, "OK: udp_dst_min=%u", pkt_dev->udp_dst_min);
+ return count;
+ }
+ if (!strcmp(name, "udp_src_max")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value != pkt_dev->udp_src_max) {
+ pkt_dev->udp_src_max = value;
+ pkt_dev->cur_udp_src = value;
+ }
+ sprintf(pg_result, "OK: udp_src_max=%u", pkt_dev->udp_src_max);
+ return count;
+ }
+ if (!strcmp(name, "udp_dst_max")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value != pkt_dev->udp_dst_max) {
+ pkt_dev->udp_dst_max = value;
+ pkt_dev->cur_udp_dst = value;
+ }
+ sprintf(pg_result, "OK: udp_dst_max=%u", pkt_dev->udp_dst_max);
+ return count;
+ }
+ if (!strcmp(name, "clone_skb")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+ if ((value > 0) &&
+ ((pkt_dev->xmit_mode == M_NETIF_RECEIVE) ||
+ !(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
+ return -ENOTSUPP;
+ i += len;
+ pkt_dev->clone_skb = value;
+
+ sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb);
+ return count;
+ }
+ if (!strcmp(name, "count")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ pkt_dev->count = value;
+ sprintf(pg_result, "OK: count=%llu",
+ (unsigned long long)pkt_dev->count);
+ return count;
+ }
+ if (!strcmp(name, "src_mac_count")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (pkt_dev->src_mac_count != value) {
+ pkt_dev->src_mac_count = value;
+ pkt_dev->cur_src_mac_offset = 0;
+ }
+ sprintf(pg_result, "OK: src_mac_count=%d",
+ pkt_dev->src_mac_count);
+ return count;
+ }
+ if (!strcmp(name, "dst_mac_count")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (pkt_dev->dst_mac_count != value) {
+ pkt_dev->dst_mac_count = value;
+ pkt_dev->cur_dst_mac_offset = 0;
+ }
+ sprintf(pg_result, "OK: dst_mac_count=%d",
+ pkt_dev->dst_mac_count);
+ return count;
+ }
+ if (!strcmp(name, "burst")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if ((value > 1) &&
+ ((pkt_dev->xmit_mode == M_QUEUE_XMIT) ||
+ ((pkt_dev->xmit_mode == M_START_XMIT) &&
+ (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))))
+ return -ENOTSUPP;
+ pkt_dev->burst = value < 1 ? 1 : value;
+ sprintf(pg_result, "OK: burst=%d", pkt_dev->burst);
+ return count;
+ }
+ if (!strcmp(name, "node")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+
+ if (node_possible(value)) {
+ pkt_dev->node = value;
+ sprintf(pg_result, "OK: node=%d", pkt_dev->node);
+ if (pkt_dev->page) {
+ put_page(pkt_dev->page);
+ pkt_dev->page = NULL;
+ }
+ }
+ else
+ sprintf(pg_result, "ERROR: node not possible");
+ return count;
+ }
+ if (!strcmp(name, "xmit_mode")) {
+ char f[32];
+
+ memset(f, 0, 32);
+ len = strn_len(&user_buffer[i], sizeof(f) - 1);
+ if (len < 0)
+ return len;
+
+ if (copy_from_user(f, &user_buffer[i], len))
+ return -EFAULT;
+ i += len;
+
+ if (strcmp(f, "start_xmit") == 0) {
+ pkt_dev->xmit_mode = M_START_XMIT;
+ } else if (strcmp(f, "netif_receive") == 0) {
+ /* clone_skb set earlier, not supported in this mode */
+ if (pkt_dev->clone_skb > 0)
+ return -ENOTSUPP;
+
+ pkt_dev->xmit_mode = M_NETIF_RECEIVE;
+
+ /* make sure new packet is allocated every time
+ * pktgen_xmit() is called
+ */
+ pkt_dev->last_ok = 1;
+
+ /* override clone_skb if user passed default value
+ * at module loading time
+ */
+ pkt_dev->clone_skb = 0;
+ } else if (strcmp(f, "queue_xmit") == 0) {
+ pkt_dev->xmit_mode = M_QUEUE_XMIT;
+ pkt_dev->last_ok = 1;
+ } else {
+ sprintf(pg_result,
+ "xmit_mode -:%s:- unknown\nAvailable modes: %s",
+ f, "start_xmit, netif_receive\n");
+ return count;
+ }
+ sprintf(pg_result, "OK: xmit_mode=%s", f);
+ return count;
+ }
+ if (!strcmp(name, "flag")) {
+ __u32 flag;
+ char f[32];
+ bool disable = false;
+
+ memset(f, 0, 32);
+ len = strn_len(&user_buffer[i], sizeof(f) - 1);
+ if (len < 0)
+ return len;
+
+ if (copy_from_user(f, &user_buffer[i], len))
+ return -EFAULT;
+ i += len;
+
+ flag = pktgen_read_flag(f, &disable);
+
+ if (flag) {
+ if (disable)
+ pkt_dev->flags &= ~flag;
+ else
+ pkt_dev->flags |= flag;
+ } else {
+ sprintf(pg_result,
+ "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
+ f,
+ "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, "
+ "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, "
+ "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, "
+ "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, "
+ "NO_TIMESTAMP, "
+#ifdef CONFIG_XFRM
+ "IPSEC, "
+#endif
+ "NODE_ALLOC\n");
+ return count;
+ }
+ sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
+ return count;
+ }
+ if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) {
+ len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1);
+ if (len < 0)
+ return len;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+ if (strcmp(buf, pkt_dev->dst_min) != 0) {
+ memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min));
+ strcpy(pkt_dev->dst_min, buf);
+ pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
+ pkt_dev->cur_daddr = pkt_dev->daddr_min;
+ }
+ if (debug)
+ pr_debug("dst_min set to: %s\n", pkt_dev->dst_min);
+ i += len;
+ sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min);
+ return count;
+ }
+ if (!strcmp(name, "dst_max")) {
+ len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1);
+ if (len < 0)
+ return len;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+ if (strcmp(buf, pkt_dev->dst_max) != 0) {
+ memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max));
+ strcpy(pkt_dev->dst_max, buf);
+ pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
+ pkt_dev->cur_daddr = pkt_dev->daddr_max;
+ }
+ if (debug)
+ pr_debug("dst_max set to: %s\n", pkt_dev->dst_max);
+ i += len;
+ sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max);
+ return count;
+ }
+ if (!strcmp(name, "dst6")) {
+ len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ if (len < 0)
+ return len;
+
+ pkt_dev->flags |= F_IPV6;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+
+ in6_pton(buf, -1, pkt_dev->in6_daddr.s6_addr, -1, NULL);
+ snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr);
+
+ pkt_dev->cur_in6_daddr = pkt_dev->in6_daddr;
+
+ if (debug)
+ pr_debug("dst6 set to: %s\n", buf);
+
+ i += len;
+ sprintf(pg_result, "OK: dst6=%s", buf);
+ return count;
+ }
+ if (!strcmp(name, "dst6_min")) {
+ len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ if (len < 0)
+ return len;
+
+ pkt_dev->flags |= F_IPV6;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+
+ in6_pton(buf, -1, pkt_dev->min_in6_daddr.s6_addr, -1, NULL);
+ snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr);
+
+ pkt_dev->cur_in6_daddr = pkt_dev->min_in6_daddr;
+ if (debug)
+ pr_debug("dst6_min set to: %s\n", buf);
+
+ i += len;
+ sprintf(pg_result, "OK: dst6_min=%s", buf);
+ return count;
+ }
+ if (!strcmp(name, "dst6_max")) {
+ len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ if (len < 0)
+ return len;
+
+ pkt_dev->flags |= F_IPV6;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+
+ in6_pton(buf, -1, pkt_dev->max_in6_daddr.s6_addr, -1, NULL);
+ snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->max_in6_daddr);
+
+ if (debug)
+ pr_debug("dst6_max set to: %s\n", buf);
+
+ i += len;
+ sprintf(pg_result, "OK: dst6_max=%s", buf);
+ return count;
+ }
+ if (!strcmp(name, "src6")) {
+ len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ if (len < 0)
+ return len;
+
+ pkt_dev->flags |= F_IPV6;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+
+ in6_pton(buf, -1, pkt_dev->in6_saddr.s6_addr, -1, NULL);
+ snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr);
+
+ pkt_dev->cur_in6_saddr = pkt_dev->in6_saddr;
+
+ if (debug)
+ pr_debug("src6 set to: %s\n", buf);
+
+ i += len;
+ sprintf(pg_result, "OK: src6=%s", buf);
+ return count;
+ }
+ if (!strcmp(name, "src_min")) {
+ len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1);
+ if (len < 0)
+ return len;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+ if (strcmp(buf, pkt_dev->src_min) != 0) {
+ memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min));
+ strcpy(pkt_dev->src_min, buf);
+ pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
+ pkt_dev->cur_saddr = pkt_dev->saddr_min;
+ }
+ if (debug)
+ pr_debug("src_min set to: %s\n", pkt_dev->src_min);
+ i += len;
+ sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min);
+ return count;
+ }
+ if (!strcmp(name, "src_max")) {
+ len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1);
+ if (len < 0)
+ return len;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+ if (strcmp(buf, pkt_dev->src_max) != 0) {
+ memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max));
+ strcpy(pkt_dev->src_max, buf);
+ pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
+ pkt_dev->cur_saddr = pkt_dev->saddr_max;
+ }
+ if (debug)
+ pr_debug("src_max set to: %s\n", pkt_dev->src_max);
+ i += len;
+ sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max);
+ return count;
+ }
+ if (!strcmp(name, "dst_mac")) {
+ len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
+ if (len < 0)
+ return len;
+
+ memset(valstr, 0, sizeof(valstr));
+ if (copy_from_user(valstr, &user_buffer[i], len))
+ return -EFAULT;
+
+ if (!mac_pton(valstr, pkt_dev->dst_mac))
+ return -EINVAL;
+ /* Set up Dest MAC */
+ ether_addr_copy(&pkt_dev->hh[0], pkt_dev->dst_mac);
+
+ sprintf(pg_result, "OK: dstmac %pM", pkt_dev->dst_mac);
+ return count;
+ }
+ if (!strcmp(name, "src_mac")) {
+ len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
+ if (len < 0)
+ return len;
+
+ memset(valstr, 0, sizeof(valstr));
+ if (copy_from_user(valstr, &user_buffer[i], len))
+ return -EFAULT;
+
+ if (!mac_pton(valstr, pkt_dev->src_mac))
+ return -EINVAL;
+ /* Set up Src MAC */
+ ether_addr_copy(&pkt_dev->hh[6], pkt_dev->src_mac);
+
+ sprintf(pg_result, "OK: srcmac %pM", pkt_dev->src_mac);
+ return count;
+ }
+
+ if (!strcmp(name, "clear_counters")) {
+ pktgen_clear_counters(pkt_dev);
+ sprintf(pg_result, "OK: Clearing counters.\n");
+ return count;
+ }
+
+ if (!strcmp(name, "flows")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value > MAX_CFLOWS)
+ value = MAX_CFLOWS;
+
+ pkt_dev->cflows = value;
+ sprintf(pg_result, "OK: flows=%u", pkt_dev->cflows);
+ return count;
+ }
+#ifdef CONFIG_XFRM
+ if (!strcmp(name, "spi")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ pkt_dev->spi = value;
+ sprintf(pg_result, "OK: spi=%u", pkt_dev->spi);
+ return count;
+ }
+#endif
+ if (!strcmp(name, "flowlen")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ pkt_dev->lflow = value;
+ sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow);
+ return count;
+ }
+
+ if (!strcmp(name, "queue_map_min")) {
+ len = num_arg(&user_buffer[i], 5, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ pkt_dev->queue_map_min = value;
+ sprintf(pg_result, "OK: queue_map_min=%u", pkt_dev->queue_map_min);
+ return count;
+ }
+
+ if (!strcmp(name, "queue_map_max")) {
+ len = num_arg(&user_buffer[i], 5, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ pkt_dev->queue_map_max = value;
+ sprintf(pg_result, "OK: queue_map_max=%u", pkt_dev->queue_map_max);
+ return count;
+ }
+
+ if (!strcmp(name, "mpls")) {
+ unsigned int n, cnt;
+
+ len = get_labels(&user_buffer[i], pkt_dev);
+ if (len < 0)
+ return len;
+ i += len;
+ cnt = sprintf(pg_result, "OK: mpls=");
+ for (n = 0; n < pkt_dev->nr_labels; n++)
+ cnt += sprintf(pg_result + cnt,
+ "%08x%s", ntohl(pkt_dev->labels[n]),
+ n == pkt_dev->nr_labels-1 ? "" : ",");
+
+ if (pkt_dev->nr_labels && pkt_dev->vlan_id != 0xffff) {
+ pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */
+ pkt_dev->svlan_id = 0xffff;
+
+ if (debug)
+ pr_debug("VLAN/SVLAN auto turned off\n");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "vlan_id")) {
+ len = num_arg(&user_buffer[i], 4, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value <= 4095) {
+ pkt_dev->vlan_id = value; /* turn on VLAN */
+
+ if (debug)
+ pr_debug("VLAN turned on\n");
+
+ if (debug && pkt_dev->nr_labels)
+ pr_debug("MPLS auto turned off\n");
+
+ pkt_dev->nr_labels = 0; /* turn off MPLS */
+ sprintf(pg_result, "OK: vlan_id=%u", pkt_dev->vlan_id);
+ } else {
+ pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */
+ pkt_dev->svlan_id = 0xffff;
+
+ if (debug)
+ pr_debug("VLAN/SVLAN turned off\n");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "vlan_p")) {
+ len = num_arg(&user_buffer[i], 1, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if ((value <= 7) && (pkt_dev->vlan_id != 0xffff)) {
+ pkt_dev->vlan_p = value;
+ sprintf(pg_result, "OK: vlan_p=%u", pkt_dev->vlan_p);
+ } else {
+ sprintf(pg_result, "ERROR: vlan_p must be 0-7");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "vlan_cfi")) {
+ len = num_arg(&user_buffer[i], 1, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if ((value <= 1) && (pkt_dev->vlan_id != 0xffff)) {
+ pkt_dev->vlan_cfi = value;
+ sprintf(pg_result, "OK: vlan_cfi=%u", pkt_dev->vlan_cfi);
+ } else {
+ sprintf(pg_result, "ERROR: vlan_cfi must be 0-1");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "svlan_id")) {
+ len = num_arg(&user_buffer[i], 4, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if ((value <= 4095) && ((pkt_dev->vlan_id != 0xffff))) {
+ pkt_dev->svlan_id = value; /* turn on SVLAN */
+
+ if (debug)
+ pr_debug("SVLAN turned on\n");
+
+ if (debug && pkt_dev->nr_labels)
+ pr_debug("MPLS auto turned off\n");
+
+ pkt_dev->nr_labels = 0; /* turn off MPLS */
+ sprintf(pg_result, "OK: svlan_id=%u", pkt_dev->svlan_id);
+ } else {
+ pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */
+ pkt_dev->svlan_id = 0xffff;
+
+ if (debug)
+ pr_debug("VLAN/SVLAN turned off\n");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "svlan_p")) {
+ len = num_arg(&user_buffer[i], 1, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if ((value <= 7) && (pkt_dev->svlan_id != 0xffff)) {
+ pkt_dev->svlan_p = value;
+ sprintf(pg_result, "OK: svlan_p=%u", pkt_dev->svlan_p);
+ } else {
+ sprintf(pg_result, "ERROR: svlan_p must be 0-7");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "svlan_cfi")) {
+ len = num_arg(&user_buffer[i], 1, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if ((value <= 1) && (pkt_dev->svlan_id != 0xffff)) {
+ pkt_dev->svlan_cfi = value;
+ sprintf(pg_result, "OK: svlan_cfi=%u", pkt_dev->svlan_cfi);
+ } else {
+ sprintf(pg_result, "ERROR: svlan_cfi must be 0-1");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "tos")) {
+ __u32 tmp_value = 0;
+ len = hex32_arg(&user_buffer[i], 2, &tmp_value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (len == 2) {
+ pkt_dev->tos = tmp_value;
+ sprintf(pg_result, "OK: tos=0x%02x", pkt_dev->tos);
+ } else {
+ sprintf(pg_result, "ERROR: tos must be 00-ff");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "traffic_class")) {
+ __u32 tmp_value = 0;
+ len = hex32_arg(&user_buffer[i], 2, &tmp_value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (len == 2) {
+ pkt_dev->traffic_class = tmp_value;
+ sprintf(pg_result, "OK: traffic_class=0x%02x", pkt_dev->traffic_class);
+ } else {
+ sprintf(pg_result, "ERROR: traffic_class must be 00-ff");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "skb_priority")) {
+ len = num_arg(&user_buffer[i], 9, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ pkt_dev->skb_priority = value;
+ sprintf(pg_result, "OK: skb_priority=%i",
+ pkt_dev->skb_priority);
+ return count;
+ }
+
+ sprintf(pkt_dev->result, "No such parameter \"%s\"", name);
+ return -EINVAL;
+}
+
+static int pktgen_if_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pktgen_if_show, PDE_DATA(inode));
+}
+
+static const struct file_operations pktgen_if_fops = {
+ .open = pktgen_if_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = pktgen_if_write,
+ .release = single_release,
+};
+
+static int pktgen_thread_show(struct seq_file *seq, void *v)
+{
+ struct pktgen_thread *t = seq->private;
+ const struct pktgen_dev *pkt_dev;
+
+ BUG_ON(!t);
+
+ seq_puts(seq, "Running: ");
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list)
+ if (pkt_dev->running)
+ seq_printf(seq, "%s ", pkt_dev->odevname);
+
+ seq_puts(seq, "\nStopped: ");
+
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list)
+ if (!pkt_dev->running)
+ seq_printf(seq, "%s ", pkt_dev->odevname);
+
+ if (t->result[0])
+ seq_printf(seq, "\nResult: %s\n", t->result);
+ else
+ seq_puts(seq, "\nResult: NA\n");
+
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static ssize_t pktgen_thread_write(struct file *file,
+ const char __user * user_buffer,
+ size_t count, loff_t * offset)
+{
+ struct seq_file *seq = file->private_data;
+ struct pktgen_thread *t = seq->private;
+ int i, max, len, ret;
+ char name[40];
+ char *pg_result;
+
+ if (count < 1) {
+ // sprintf(pg_result, "Wrong command format");
+ return -EINVAL;
+ }
+
+ max = count;
+ len = count_trail_chars(user_buffer, max);
+ if (len < 0)
+ return len;
+
+ i = len;
+
+ /* Read variable name */
+
+ len = strn_len(&user_buffer[i], sizeof(name) - 1);
+ if (len < 0)
+ return len;
+
+ memset(name, 0, sizeof(name));
+ if (copy_from_user(name, &user_buffer[i], len))
+ return -EFAULT;
+ i += len;
+
+ max = count - i;
+ len = count_trail_chars(&user_buffer[i], max);
+ if (len < 0)
+ return len;
+
+ i += len;
+
+ if (debug)
+ pr_debug("t=%s, count=%lu\n", name, (unsigned long)count);
+
+ if (!t) {
+ pr_err("ERROR: No thread\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ pg_result = &(t->result[0]);
+
+ if (!strcmp(name, "add_device")) {
+ char f[32];
+ memset(f, 0, 32);
+ len = strn_len(&user_buffer[i], sizeof(f) - 1);
+ if (len < 0) {
+ ret = len;
+ goto out;
+ }
+ if (copy_from_user(f, &user_buffer[i], len))
+ return -EFAULT;
+ i += len;
+ mutex_lock(&pktgen_thread_lock);
+ ret = pktgen_add_device(t, f);
+ mutex_unlock(&pktgen_thread_lock);
+ if (!ret) {
+ ret = count;
+ sprintf(pg_result, "OK: add_device=%s", f);
+ } else
+ sprintf(pg_result, "ERROR: can not add device %s", f);
+ goto out;
+ }
+
+ if (!strcmp(name, "rem_device_all")) {
+ mutex_lock(&pktgen_thread_lock);
+ t->control |= T_REMDEVALL;
+ mutex_unlock(&pktgen_thread_lock);
+ schedule_timeout_interruptible(msecs_to_jiffies(125)); /* Propagate thread->control */
+ ret = count;
+ sprintf(pg_result, "OK: rem_device_all");
+ goto out;
+ }
+
+ if (!strcmp(name, "max_before_softirq")) {
+ sprintf(pg_result, "OK: Note! max_before_softirq is obsoleted -- Do not use");
+ ret = count;
+ goto out;
+ }
+
+ ret = -EINVAL;
+out:
+ return ret;
+}
+
+static int pktgen_thread_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pktgen_thread_show, PDE_DATA(inode));
+}
+
+static const struct file_operations pktgen_thread_fops = {
+ .open = pktgen_thread_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = pktgen_thread_write,
+ .release = single_release,
+};
+
+/* Think find or remove for NN */
+static struct pktgen_dev *__pktgen_NN_threads(const struct pktgen_net *pn,
+ const char *ifname, int remove)
+{
+ struct pktgen_thread *t;
+ struct pktgen_dev *pkt_dev = NULL;
+ bool exact = (remove == FIND);
+
+ list_for_each_entry(t, &pn->pktgen_threads, th_list) {
+ pkt_dev = pktgen_find_dev(t, ifname, exact);
+ if (pkt_dev) {
+ if (remove) {
+ pkt_dev->removal_mark = 1;
+ t->control |= T_REMDEV;
+ }
+ break;
+ }
+ }
+ return pkt_dev;
+}
+
+/*
+ * mark a device for removal
+ */
+static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname)
+{
+ struct pktgen_dev *pkt_dev = NULL;
+ const int max_tries = 10, msec_per_try = 125;
+ int i = 0;
+
+ mutex_lock(&pktgen_thread_lock);
+ pr_debug("%s: marking %s for removal\n", __func__, ifname);
+
+ while (1) {
+
+ pkt_dev = __pktgen_NN_threads(pn, ifname, REMOVE);
+ if (pkt_dev == NULL)
+ break; /* success */
+
+ mutex_unlock(&pktgen_thread_lock);
+ pr_debug("%s: waiting for %s to disappear....\n",
+ __func__, ifname);
+ schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try));
+ mutex_lock(&pktgen_thread_lock);
+
+ if (++i >= max_tries) {
+ pr_err("%s: timed out after waiting %d msec for device %s to be removed\n",
+ __func__, msec_per_try * i, ifname);
+ break;
+ }
+
+ }
+
+ mutex_unlock(&pktgen_thread_lock);
+}
+
+static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *dev)
+{
+ struct pktgen_thread *t;
+
+ mutex_lock(&pktgen_thread_lock);
+
+ list_for_each_entry(t, &pn->pktgen_threads, th_list) {
+ struct pktgen_dev *pkt_dev;
+
+ if_lock(t);
+ list_for_each_entry(pkt_dev, &t->if_list, list) {
+ if (pkt_dev->odev != dev)
+ continue;
+
+ proc_remove(pkt_dev->entry);
+
+ pkt_dev->entry = proc_create_data(dev->name, 0600,
+ pn->proc_dir,
+ &pktgen_if_fops,
+ pkt_dev);
+ if (!pkt_dev->entry)
+ pr_err("can't move proc entry for '%s'\n",
+ dev->name);
+ break;
+ }
+ if_unlock(t);
+ }
+ mutex_unlock(&pktgen_thread_lock);
+}
+
+static int pktgen_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct pktgen_net *pn = net_generic(dev_net(dev), pg_net_id);
+
+ if (pn->pktgen_exiting)
+ return NOTIFY_DONE;
+
+ /* It is OK that we do not hold the group lock right now,
+ * as we run under the RTNL lock.
+ */
+
+ switch (event) {
+ case NETDEV_CHANGENAME:
+ pktgen_change_name(pn, dev);
+ break;
+
+ case NETDEV_UNREGISTER:
+ pktgen_mark_device(pn, dev->name);
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
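+/* "add_device" accepts names of the form "ethX@Y" so that the same netdev
+ * can be bound to several pktgen devices/threads; strip everything from the
+ * '@' onwards before looking the real device up by name.
+ */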
+static struct net_device *pktgen_dev_get_by_name(const struct pktgen_net *pn,
+ struct pktgen_dev *pkt_dev,
+ const char *ifname)
+{
+ char b[IFNAMSIZ+5];
+ int i;
+
+ for (i = 0; ifname[i] != '@'; i++) {
+ if (i == IFNAMSIZ)
+ break;
+
+ b[i] = ifname[i];
+ }
+ b[i] = 0;
+
+ return dev_get_by_name(pn->net, b);
+}
+
+
+/* Associate pktgen_dev with a device. */
+
+static int pktgen_setup_dev(const struct pktgen_net *pn,
+ struct pktgen_dev *pkt_dev, const char *ifname)
+{
+ struct net_device *odev;
+ int err;
+
+ /* Clean old setups */
+ if (pkt_dev->odev) {
+ dev_put(pkt_dev->odev);
+ pkt_dev->odev = NULL;
+ }
+
+ odev = pktgen_dev_get_by_name(pn, pkt_dev, ifname);
+ if (!odev) {
+ pr_err("no such netdevice: \"%s\"\n", ifname);
+ return -ENODEV;
+ }
+
+ if (odev->type != ARPHRD_ETHER) {
+ pr_err("not an ethernet device: \"%s\"\n", ifname);
+ err = -EINVAL;
+ } else if (!netif_running(odev)) {
+ pr_err("device is down: \"%s\"\n", ifname);
+ err = -ENETDOWN;
+ } else {
+ pkt_dev->odev = odev;
+ return 0;
+ }
+
+ dev_put(odev);
+ return err;
+}
+
+/* Read pkt_dev from the interface and set up internal pktgen_dev
+ * structure to have the right information to create/send packets
+ */
+static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
+{
+ int ntxq;
+
+ if (!pkt_dev->odev) {
+ pr_err("ERROR: pkt_dev->odev == NULL in setup_inject\n");
+ sprintf(pkt_dev->result,
+ "ERROR: pkt_dev->odev == NULL in setup_inject.\n");
+ return;
+ }
+
+ /* make sure that we don't pick a non-existing transmit queue */
+ ntxq = pkt_dev->odev->real_num_tx_queues;
+
+ if (ntxq <= pkt_dev->queue_map_min) {
+ pr_warn("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
+ pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq,
+ pkt_dev->odevname);
+ pkt_dev->queue_map_min = (ntxq ?: 1) - 1;
+ }
+ if (pkt_dev->queue_map_max >= ntxq) {
+ pr_warn("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
+ pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq,
+ pkt_dev->odevname);
+ pkt_dev->queue_map_max = (ntxq ?: 1) - 1;
+ }
+
+ /* Default to the interface's mac if not explicitly set. */
+
+ if (is_zero_ether_addr(pkt_dev->src_mac))
+ ether_addr_copy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr);
+
+ /* Set up Dest MAC */
+ ether_addr_copy(&(pkt_dev->hh[0]), pkt_dev->dst_mac);
+
+ if (pkt_dev->flags & F_IPV6) {
+ int i, set = 0, err = 1;
+ struct inet6_dev *idev;
+
+ if (pkt_dev->min_pkt_size == 0) {
+ pkt_dev->min_pkt_size = 14 + sizeof(struct ipv6hdr)
+ + sizeof(struct udphdr)
+ + sizeof(struct pktgen_hdr)
+ + pkt_dev->pkt_overhead;
+ }
+
+ for (i = 0; i < sizeof(struct in6_addr); i++)
+ if (pkt_dev->cur_in6_saddr.s6_addr[i]) {
+ set = 1;
+ break;
+ }
+
+ if (!set) {
+
+ /*
+ * Use linklevel address if unconfigured.
+ *
+ * use ipv6_get_lladdr if/when it gets exported
+ */
+
+ rcu_read_lock();
+ idev = __in6_dev_get(pkt_dev->odev);
+ if (idev) {
+ struct inet6_ifaddr *ifp;
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(ifp, &idev->addr_list, if_list) {
+ if ((ifp->scope & IFA_LINK) &&
+ !(ifp->flags & IFA_F_TENTATIVE)) {
+ pkt_dev->cur_in6_saddr = ifp->addr;
+ err = 0;
+ break;
+ }
+ }
+ read_unlock_bh(&idev->lock);
+ }
+ rcu_read_unlock();
+ if (err)
+ pr_err("ERROR: IPv6 link address not available\n");
+ }
+ } else {
+ if (pkt_dev->min_pkt_size == 0) {
+ pkt_dev->min_pkt_size = 14 + sizeof(struct iphdr)
+ + sizeof(struct udphdr)
+ + sizeof(struct pktgen_hdr)
+ + pkt_dev->pkt_overhead;
+ }
+
+ pkt_dev->saddr_min = 0;
+ pkt_dev->saddr_max = 0;
+ if (strlen(pkt_dev->src_min) == 0) {
+
+ struct in_device *in_dev;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(pkt_dev->odev);
+ if (in_dev) {
+ if (in_dev->ifa_list) {
+ pkt_dev->saddr_min =
+ in_dev->ifa_list->ifa_address;
+ pkt_dev->saddr_max = pkt_dev->saddr_min;
+ }
+ }
+ rcu_read_unlock();
+ } else {
+ pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
+ pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
+ }
+
+ pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
+ pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
+ }
+ /* Initialize current values. */
+ pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size;
+ if (pkt_dev->min_pkt_size > pkt_dev->max_pkt_size)
+ pkt_dev->max_pkt_size = pkt_dev->min_pkt_size;
+
+ pkt_dev->cur_dst_mac_offset = 0;
+ pkt_dev->cur_src_mac_offset = 0;
+ pkt_dev->cur_saddr = pkt_dev->saddr_min;
+ pkt_dev->cur_daddr = pkt_dev->daddr_min;
+ pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min;
+ pkt_dev->cur_udp_src = pkt_dev->udp_src_min;
+ pkt_dev->nflows = 0;
+}
+
+
+static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
+{
+ ktime_t start_time, end_time;
+ s64 remaining;
+ struct hrtimer_sleeper t;
+
+ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_set_expires(&t.timer, spin_until);
+
+ remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
+ if (remaining <= 0)
+ goto out;
+
+ start_time = ktime_get();
+ if (remaining < 100000) {
+ /* for small delays (<100us), just loop until limit is reached */
+ do {
+ end_time = ktime_get();
+ } while (ktime_compare(end_time, spin_until) < 0);
+ } else {
+ /* see do_nanosleep */
+ hrtimer_init_sleeper(&t, current);
+ do {
+ set_current_state(TASK_INTERRUPTIBLE);
+ hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
+
+ if (likely(t.task))
+ schedule();
+
+ hrtimer_cancel(&t.timer);
+ } while (t.task && pkt_dev->running && !signal_pending(current));
+ __set_current_state(TASK_RUNNING);
+ end_time = ktime_get();
+ }
+
+ pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time));
+out:
+ pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
+ destroy_hrtimer_on_stack(&t.timer);
+}
+
+static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev)
+{
+ pkt_dev->pkt_overhead = 0;
+ pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32);
+ pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev);
+ pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev);
+}
+
+static inline int f_seen(const struct pktgen_dev *pkt_dev, int flow)
+{
+ return !!(pkt_dev->flows[flow].flags & F_INIT);
+}
+
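+/* Pick the flow used for the next packet.  With FLOW_SEQ the flows are
+ * walked sequentially and each one emits lflow packets before moving on;
+ * otherwise a flow is picked at random and its state is reset once it has
+ * exceeded lflow packets.  Returns an index into pkt_dev->flows[].
+ */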
+static inline int f_pick(struct pktgen_dev *pkt_dev)
+{
+ int flow = pkt_dev->curfl;
+
+ if (pkt_dev->flags & F_FLOW_SEQ) {
+ if (pkt_dev->flows[flow].count >= pkt_dev->lflow) {
+ /* reset time */
+ pkt_dev->flows[flow].count = 0;
+ pkt_dev->flows[flow].flags = 0;
+ pkt_dev->curfl += 1;
+ if (pkt_dev->curfl >= pkt_dev->cflows)
+ pkt_dev->curfl = 0; /*reset */
+ }
+ } else {
+ flow = prandom_u32() % pkt_dev->cflows;
+ pkt_dev->curfl = flow;
+
+ if (pkt_dev->flows[flow].count > pkt_dev->lflow) {
+ pkt_dev->flows[flow].count = 0;
+ pkt_dev->flows[flow].flags = 0;
+ }
+ }
+
+ return pkt_dev->curfl;
+}
+
+
+#ifdef CONFIG_XFRM
+/* If there was already an IPSEC SA, we keep it as is, else
+ * we go look for it ...
+ */
+#define DUMMY_MARK 0
+static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
+{
+ struct xfrm_state *x = pkt_dev->flows[flow].x;
+ struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id);
+ if (!x) {
+
+ if (pkt_dev->spi) {
+ /* We need to find the right SA as quickly as possible;
+ * search with minimum criteria to achieve this.
+ */
+ x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET);
+ } else {
+ /* slow path: we don't already have an xfrm_state */
+ x = xfrm_stateonly_find(pn->net, DUMMY_MARK, 0,
+ (xfrm_address_t *)&pkt_dev->cur_daddr,
+ (xfrm_address_t *)&pkt_dev->cur_saddr,
+ AF_INET,
+ pkt_dev->ipsmode,
+ pkt_dev->ipsproto, 0);
+ }
+ if (x) {
+ pkt_dev->flows[flow].x = x;
+ set_pkt_overhead(pkt_dev);
+ pkt_dev->pkt_overhead += x->props.header_len;
+ }
+
+ }
+}
+#endif
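+
+/* Pick the TX queue for the next packet: the current CPU when QUEUE_MAP_CPU
+ * is set, otherwise a random or round-robin value within
+ * [queue_map_min, queue_map_max]; the result is finally taken modulo the
+ * device's real number of TX queues.
+ */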
+static void set_cur_queue_map(struct pktgen_dev *pkt_dev)
+{
+
+ if (pkt_dev->flags & F_QUEUE_MAP_CPU)
+ pkt_dev->cur_queue_map = smp_processor_id();
+
+ else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) {
+ __u16 t;
+ if (pkt_dev->flags & F_QUEUE_MAP_RND) {
+ t = prandom_u32() %
+ (pkt_dev->queue_map_max -
+ pkt_dev->queue_map_min + 1)
+ + pkt_dev->queue_map_min;
+ } else {
+ t = pkt_dev->cur_queue_map + 1;
+ if (t > pkt_dev->queue_map_max)
+ t = pkt_dev->queue_map_min;
+ }
+ pkt_dev->cur_queue_map = t;
+ }
+ pkt_dev->cur_queue_map = pkt_dev->cur_queue_map % pkt_dev->odev->real_num_tx_queues;
+}
+
+/* Increment/randomize headers according to flags and current values
+ * for IP src/dest, UDP src/dst port, MAC-Addr src/dst
+ */
+static void mod_cur_headers(struct pktgen_dev *pkt_dev)
+{
+ __u32 imn;
+ __u32 imx;
+ int flow = 0;
+
+ if (pkt_dev->cflows)
+ flow = f_pick(pkt_dev);
+
+ /* Deal with source MAC */
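+ /* The 32-bit offset mc below is added to the low four bytes of src_mac
+ * with explicit carry propagation into the fifth byte; the result is
+ * written into the source-MAC bytes of the cached header, hh[7]..hh[11]
+ * (hh[6] keeps the configured leading byte).  The destination MAC further
+ * down is rewritten the same way into hh[1]..hh[5].
+ */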
+ if (pkt_dev->src_mac_count > 1) {
+ __u32 mc;
+ __u32 tmp;
+
+ if (pkt_dev->flags & F_MACSRC_RND)
+ mc = prandom_u32() % pkt_dev->src_mac_count;
+ else {
+ mc = pkt_dev->cur_src_mac_offset++;
+ if (pkt_dev->cur_src_mac_offset >=
+ pkt_dev->src_mac_count)
+ pkt_dev->cur_src_mac_offset = 0;
+ }
+
+ tmp = pkt_dev->src_mac[5] + (mc & 0xFF);
+ pkt_dev->hh[11] = tmp;
+ tmp = (pkt_dev->src_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[10] = tmp;
+ tmp = (pkt_dev->src_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[9] = tmp;
+ tmp = (pkt_dev->src_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[8] = tmp;
+ tmp = (pkt_dev->src_mac[1] + (tmp >> 8));
+ pkt_dev->hh[7] = tmp;
+ }
+
+ /* Deal with Destination MAC */
+ if (pkt_dev->dst_mac_count > 1) {
+ __u32 mc;
+ __u32 tmp;
+
+ if (pkt_dev->flags & F_MACDST_RND)
+ mc = prandom_u32() % pkt_dev->dst_mac_count;
+
+ else {
+ mc = pkt_dev->cur_dst_mac_offset++;
+ if (pkt_dev->cur_dst_mac_offset >=
+ pkt_dev->dst_mac_count) {
+ pkt_dev->cur_dst_mac_offset = 0;
+ }
+ }
+
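+ /* Same carry-propagating addition, applied to the destination MAC. */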
+ tmp = pkt_dev->dst_mac[5] + (mc & 0xFF);
+ pkt_dev->hh[5] = tmp;
+ tmp = (pkt_dev->dst_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[4] = tmp;
+ tmp = (pkt_dev->dst_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[3] = tmp;
+ tmp = (pkt_dev->dst_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[2] = tmp;
+ tmp = (pkt_dev->dst_mac[1] + (tmp >> 8));
+ pkt_dev->hh[1] = tmp;
+ }
+
+ if (pkt_dev->flags & F_MPLS_RND) {
+ unsigned int i;
+ for (i = 0; i < pkt_dev->nr_labels; i++)
+ if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM)
+ pkt_dev->labels[i] = MPLS_STACK_BOTTOM |
+ ((__force __be32)prandom_u32() &
+ htonl(0x000fffff));
+ }
+
+ if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) {
+ pkt_dev->vlan_id = prandom_u32() & (4096 - 1);
+ }
+
+ if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) {
+ pkt_dev->svlan_id = prandom_u32() & (4096 - 1);
+ }
+
+ if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) {
+ if (pkt_dev->flags & F_UDPSRC_RND)
+ pkt_dev->cur_udp_src = prandom_u32() %
+ (pkt_dev->udp_src_max - pkt_dev->udp_src_min)
+ + pkt_dev->udp_src_min;
+
+ else {
+ pkt_dev->cur_udp_src++;
+ if (pkt_dev->cur_udp_src >= pkt_dev->udp_src_max)
+ pkt_dev->cur_udp_src = pkt_dev->udp_src_min;
+ }
+ }
+
+ if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) {
+ if (pkt_dev->flags & F_UDPDST_RND) {
+ pkt_dev->cur_udp_dst = prandom_u32() %
+ (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min)
+ + pkt_dev->udp_dst_min;
+ } else {
+ pkt_dev->cur_udp_dst++;
+ if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max)
+ pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min;
+ }
+ }
+
+ if (!(pkt_dev->flags & F_IPV6)) {
+
+ imn = ntohl(pkt_dev->saddr_min);
+ imx = ntohl(pkt_dev->saddr_max);
+ if (imn < imx) {
+ __u32 t;
+ if (pkt_dev->flags & F_IPSRC_RND)
+ t = prandom_u32() % (imx - imn) + imn;
+ else {
+ t = ntohl(pkt_dev->cur_saddr);
+ t++;
+ if (t > imx)
+ t = imn;
+
+ }
+ pkt_dev->cur_saddr = htonl(t);
+ }
+
+ if (pkt_dev->cflows && f_seen(pkt_dev, flow)) {
+ pkt_dev->cur_daddr = pkt_dev->flows[flow].cur_daddr;
+ } else {
+ imn = ntohl(pkt_dev->daddr_min);
+ imx = ntohl(pkt_dev->daddr_max);
+ if (imn < imx) {
+ __u32 t;
+ __be32 s;
+ if (pkt_dev->flags & F_IPDST_RND) {
+
+ do {
+ t = prandom_u32() %
+ (imx - imn) + imn;
+ s = htonl(t);
+ } while (ipv4_is_loopback(s) ||
+ ipv4_is_multicast(s) ||
+ ipv4_is_lbcast(s) ||
+ ipv4_is_zeronet(s) ||
+ ipv4_is_local_multicast(s));
+ pkt_dev->cur_daddr = s;
+ } else {
+ t = ntohl(pkt_dev->cur_daddr);
+ t++;
+ if (t > imx) {
+ t = imn;
+ }
+ pkt_dev->cur_daddr = htonl(t);
+ }
+ }
+ if (pkt_dev->cflows) {
+ pkt_dev->flows[flow].flags |= F_INIT;
+ pkt_dev->flows[flow].cur_daddr =
+ pkt_dev->cur_daddr;
+#ifdef CONFIG_XFRM
+ if (pkt_dev->flags & F_IPSEC)
+ get_ipsec_sa(pkt_dev, flow);
+#endif
+ pkt_dev->nflows++;
+ }
+ }
+ } else { /* IPv6 */
+
+ if (!ipv6_addr_any(&pkt_dev->min_in6_daddr)) {
+ int i;
+
+ /* Only random destinations yet */
+
+ for (i = 0; i < 4; i++) {
+ pkt_dev->cur_in6_daddr.s6_addr32[i] =
+ (((__force __be32)prandom_u32() |
+ pkt_dev->min_in6_daddr.s6_addr32[i]) &
+ pkt_dev->max_in6_daddr.s6_addr32[i]);
+ }
+ }
+ }
+
+ if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) {
+ __u32 t;
+ if (pkt_dev->flags & F_TXSIZE_RND) {
+ t = prandom_u32() %
+ (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size)
+ + pkt_dev->min_pkt_size;
+ } else {
+ t = pkt_dev->cur_pkt_size + 1;
+ if (t > pkt_dev->max_pkt_size)
+ t = pkt_dev->min_pkt_size;
+ }
+ pkt_dev->cur_pkt_size = t;
+ }
+
+ set_cur_queue_map(pkt_dev);
+
+ pkt_dev->flows[flow].count++;
+}
+
+
+#ifdef CONFIG_XFRM
+static u32 pktgen_dst_metrics[RTAX_MAX + 1] = {
+
+ [RTAX_HOPLIMIT] = 0x5, /* Set a static hoplimit */
+};
+
+static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)
+{
+ struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;
+ int err = 0;
+ struct net *net = dev_net(pkt_dev->odev);
+
+ if (!x)
+ return 0;
+ /* XXX: we don't support tunnel mode for now until
+ * we resolve the dst issue */
+ if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0))
+ return 0;
+
+ /* But when the user specifies a valid SPI, the transformation
+ * supports both transport/tunnel mode and ESP/AH types.
+ */
+ if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0))
+ skb->_skb_refdst = (unsigned long)&pkt_dev->xdst.u.dst | SKB_DST_NOREF;
+
+ rcu_read_lock_bh();
+ err = x->outer_mode->output(x, skb);
+ rcu_read_unlock_bh();
+ if (err) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR);
+ goto error;
+ }
+ err = x->type->output(x, skb);
+ if (err) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEPROTOERROR);
+ goto error;
+ }
+ spin_lock_bh(&x->lock);
+ x->curlft.bytes += skb->len;
+ x->curlft.packets++;
+ spin_unlock_bh(&x->lock);
+error:
+ return err;
+}
+
+static void free_SAs(struct pktgen_dev *pkt_dev)
+{
+ if (pkt_dev->cflows) {
+ /* let go of the SAs if we have them */
+ int i;
+ for (i = 0; i < pkt_dev->cflows; i++) {
+ struct xfrm_state *x = pkt_dev->flows[i].x;
+ if (x) {
+ xfrm_state_put(x);
+ pkt_dev->flows[i].x = NULL;
+ }
+ }
+ }
+}
+
+static int process_ipsec(struct pktgen_dev *pkt_dev,
+ struct sk_buff *skb, __be16 protocol)
+{
+ if (pkt_dev->flags & F_IPSEC) {
+ struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;
+ int nhead = 0;
+ if (x) {
+ struct ethhdr *eth;
+ struct iphdr *iph;
+ int ret;
+
+ nhead = x->props.header_len - skb_headroom(skb);
+ if (nhead > 0) {
+ ret = pskb_expand_head(skb, nhead, 0, GFP_ATOMIC);
+ if (ret < 0) {
+ pr_err("Error expanding ipsec packet %d\n",
+ ret);
+ goto err;
+ }
+ }
+
+ /* IPsec is not expecting the link-layer header */
+ skb_pull(skb, ETH_HLEN);
+ ret = pktgen_output_ipsec(skb, pkt_dev);
+ if (ret) {
+ pr_err("Error creating ipsec packet %d\n", ret);
+ goto err;
+ }
+ /* restore the link-layer header */
+ eth = skb_push(skb, ETH_HLEN);
+ memcpy(eth, pkt_dev->hh, 2 * ETH_ALEN);
+ eth->h_proto = protocol;
+
+ /* Update IPv4 header len as well as checksum value */
+ iph = ip_hdr(skb);
+ iph->tot_len = htons(skb->len - ETH_HLEN);
+ ip_send_check(iph);
+ }
+ }
+ return 1;
+err:
+ kfree_skb(skb);
+ return 0;
+}
+#endif
+
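+/* Write the configured MPLS labels into the packet, setting the
+ * bottom-of-stack bit only on the last label.
+ */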
+static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev)
+{
+ unsigned int i;
+ for (i = 0; i < pkt_dev->nr_labels; i++)
+ *mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM;
+
+ mpls--;
+ *mpls |= MPLS_STACK_BOTTOM;
+}
+
+static inline __be16 build_tci(unsigned int id, unsigned int cfi,
+ unsigned int prio)
+{
+ return htons(id | (cfi << 12) | (prio << 13));
+}
+
+static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
+ int datalen)
+{
+ struct timespec64 timestamp;
+ struct pktgen_hdr *pgh;
+
+ pgh = skb_put(skb, sizeof(*pgh));
+ datalen -= sizeof(*pgh);
+
+ if (pkt_dev->nfrags <= 0) {
+ skb_put_zero(skb, datalen);
+ } else {
+ int frags = pkt_dev->nfrags;
+ int i, len;
+ int frag_len;
+
+
+ if (frags > MAX_SKB_FRAGS)
+ frags = MAX_SKB_FRAGS;
+ len = datalen - frags * PAGE_SIZE;
+ if (len > 0) {
+ skb_put_zero(skb, len);
+ datalen = frags * PAGE_SIZE;
+ }
+
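+ /* Spread the remaining payload across page fragments, at most
+ * PAGE_SIZE per fragment; the last fragment takes whatever is left.
+ */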
+ i = 0;
+ frag_len = (datalen/frags) < PAGE_SIZE ?
+ (datalen/frags) : PAGE_SIZE;
+ while (datalen > 0) {
+ if (unlikely(!pkt_dev->page)) {
+ int node = numa_node_id();
+
+ if (pkt_dev->node >= 0 && (pkt_dev->flags & F_NODE))
+ node = pkt_dev->node;
+ pkt_dev->page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ if (!pkt_dev->page)
+ break;
+ }
+ get_page(pkt_dev->page);
+ skb_frag_set_page(skb, i, pkt_dev->page);
+ skb_shinfo(skb)->frags[i].page_offset = 0;
+ /* last fragment, fill rest of data */
+ if (i == (frags - 1))
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i],
+ (datalen < PAGE_SIZE ? datalen : PAGE_SIZE));
+ else
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i], frag_len);
+ datalen -= skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ skb->len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ skb->data_len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ i++;
+ skb_shinfo(skb)->nr_frags = i;
+ }
+ }
+
+ /* Stamp the time and sequence number,
+ * converting them to network byte order.
+ */
+ pgh->pgh_magic = htonl(PKTGEN_MAGIC);
+ pgh->seq_num = htonl(pkt_dev->seq_num);
+
+ if (pkt_dev->flags & F_NO_TIMESTAMP) {
+ pgh->tv_sec = 0;
+ pgh->tv_usec = 0;
+ } else {
+ /*
+ * pgh->tv_sec wraps in y2106 when interpreted as unsigned
+ * as done by wireshark, or y2038 when interpreted as signed.
+ * This is probably harmless, but if anyone wants to improve
+ * it, we could introduce a variant that puts 64-bit nanoseconds
+ * into the respective header bytes.
+ * This would also be slightly faster to read.
+ */
+ ktime_get_real_ts64(&timestamp);
+ pgh->tv_sec = htonl(timestamp.tv_sec);
+ pgh->tv_usec = htonl(timestamp.tv_nsec / NSEC_PER_USEC);
+ }
+}
+
+static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
+ struct pktgen_dev *pkt_dev)
+{
+ unsigned int extralen = LL_RESERVED_SPACE(dev);
+ struct sk_buff *skb = NULL;
+ unsigned int size;
+
+ size = pkt_dev->cur_pkt_size + 64 + extralen + pkt_dev->pkt_overhead;
+ if (pkt_dev->flags & F_NODE) {
+ int node = pkt_dev->node >= 0 ? pkt_dev->node : numa_node_id();
+
+ skb = __alloc_skb(NET_SKB_PAD + size, GFP_NOWAIT, 0, node);
+ if (likely(skb)) {
+ skb_reserve(skb, NET_SKB_PAD);
+ skb->dev = dev;
+ }
+ } else {
+ skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
+ }
+
+ /* the caller pre-fetches from skb->data and reserves for the mac hdr */
+ if (likely(skb))
+ skb_reserve(skb, extralen - 16);
+
+ return skb;
+}
+
+static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
+ struct pktgen_dev *pkt_dev)
+{
+ struct sk_buff *skb = NULL;
+ __u8 *eth;
+ struct udphdr *udph;
+ int datalen, iplen;
+ struct iphdr *iph;
+ __be16 protocol = htons(ETH_P_IP);
+ __be32 *mpls;
+ __be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */
+ __be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */
+ __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */
+ __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */
+ u16 queue_map;
+
+ if (pkt_dev->nr_labels)
+ protocol = htons(ETH_P_MPLS_UC);
+
+ if (pkt_dev->vlan_id != 0xffff)
+ protocol = htons(ETH_P_8021Q);
+
+ /* Update any of the values used when we're incrementing various
+ * fields.
+ */
+ mod_cur_headers(pkt_dev);
+ queue_map = pkt_dev->cur_queue_map;
+
+ skb = pktgen_alloc_skb(odev, pkt_dev);
+ if (!skb) {
+ sprintf(pkt_dev->result, "No memory");
+ return NULL;
+ }
+
+ prefetchw(skb->data);
+ skb_reserve(skb, 16);
+
+ /* Reserve for ethernet and IP header */
+ eth = skb_push(skb, 14);
+ mpls = skb_put(skb, pkt_dev->nr_labels * sizeof(__u32));
+ if (pkt_dev->nr_labels)
+ mpls_push(mpls, pkt_dev);
+
+ if (pkt_dev->vlan_id != 0xffff) {
+ if (pkt_dev->svlan_id != 0xffff) {
+ svlan_tci = skb_put(skb, sizeof(__be16));
+ *svlan_tci = build_tci(pkt_dev->svlan_id,
+ pkt_dev->svlan_cfi,
+ pkt_dev->svlan_p);
+ svlan_encapsulated_proto = skb_put(skb,
+ sizeof(__be16));
+ *svlan_encapsulated_proto = htons(ETH_P_8021Q);
+ }
+ vlan_tci = skb_put(skb, sizeof(__be16));
+ *vlan_tci = build_tci(pkt_dev->vlan_id,
+ pkt_dev->vlan_cfi,
+ pkt_dev->vlan_p);
+ vlan_encapsulated_proto = skb_put(skb, sizeof(__be16));
+ *vlan_encapsulated_proto = htons(ETH_P_IP);
+ }
+
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb->len);
+ iph = skb_put(skb, sizeof(struct iphdr));
+
+ skb_set_transport_header(skb, skb->len);
+ udph = skb_put(skb, sizeof(struct udphdr));
+ skb_set_queue_mapping(skb, queue_map);
+ skb->priority = pkt_dev->skb_priority;
+
+ memcpy(eth, pkt_dev->hh, 12);
+ *(__be16 *)&eth[12] = protocol;
+
+ /* Eth + IPh + UDPh + mpls */
+ datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 -
+ pkt_dev->pkt_overhead;
+ if (datalen < 0 || datalen < sizeof(struct pktgen_hdr))
+ datalen = sizeof(struct pktgen_hdr);
+
+ udph->source = htons(pkt_dev->cur_udp_src);
+ udph->dest = htons(pkt_dev->cur_udp_dst);
+ udph->len = htons(datalen + 8); /* DATA + udphdr */
+ udph->check = 0;
+
+ iph->ihl = 5;
+ iph->version = 4;
+ iph->ttl = 32;
+ iph->tos = pkt_dev->tos;
+ iph->protocol = IPPROTO_UDP; /* UDP */
+ iph->saddr = pkt_dev->cur_saddr;
+ iph->daddr = pkt_dev->cur_daddr;
+ iph->id = htons(pkt_dev->ip_id);
+ pkt_dev->ip_id++;
+ iph->frag_off = 0;
+ iplen = 20 + 8 + datalen;
+ iph->tot_len = htons(iplen);
+ ip_send_check(iph);
+ skb->protocol = protocol;
+ skb->dev = odev;
+ skb->pkt_type = PACKET_HOST;
+
+ pktgen_finalize_skb(pkt_dev, skb, datalen);
+
+ if (!(pkt_dev->flags & F_UDPCSUM)) {
+ skb->ip_summed = CHECKSUM_NONE;
+ } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum = 0;
+ udp4_hwcsum(skb, iph->saddr, iph->daddr);
+ } else {
+ __wsum csum = skb_checksum(skb, skb_transport_offset(skb), datalen + 8, 0);
+
+ /* add protocol-dependent pseudo-header */
+ udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+ datalen + 8, IPPROTO_UDP, csum);
+
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
+ }
+
+#ifdef CONFIG_XFRM
+ if (!process_ipsec(pkt_dev, skb, protocol))
+ return NULL;
+#endif
+
+ return skb;
+}
+
+static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
+ struct pktgen_dev *pkt_dev)
+{
+ struct sk_buff *skb = NULL;
+ __u8 *eth;
+ struct udphdr *udph;
+ int datalen, udplen;
+ struct ipv6hdr *iph;
+ __be16 protocol = htons(ETH_P_IPV6);
+ __be32 *mpls;
+ __be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */
+ __be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */
+ __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */
+ __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */
+ u16 queue_map;
+
+ if (pkt_dev->nr_labels)
+ protocol = htons(ETH_P_MPLS_UC);
+
+ if (pkt_dev->vlan_id != 0xffff)
+ protocol = htons(ETH_P_8021Q);
+
+ /* Update any of the values used when we're incrementing various
+ * fields.
+ */
+ mod_cur_headers(pkt_dev);
+ queue_map = pkt_dev->cur_queue_map;
+
+ skb = pktgen_alloc_skb(odev, pkt_dev);
+ if (!skb) {
+ sprintf(pkt_dev->result, "No memory");
+ return NULL;
+ }
+
+ prefetchw(skb->data);
+ skb_reserve(skb, 16);
+
+ /* Reserve for ethernet and IP header */
+ eth = skb_push(skb, 14);
+ mpls = skb_put(skb, pkt_dev->nr_labels * sizeof(__u32));
+ if (pkt_dev->nr_labels)
+ mpls_push(mpls, pkt_dev);
+
+ if (pkt_dev->vlan_id != 0xffff) {
+ if (pkt_dev->svlan_id != 0xffff) {
+ svlan_tci = skb_put(skb, sizeof(__be16));
+ *svlan_tci = build_tci(pkt_dev->svlan_id,
+ pkt_dev->svlan_cfi,
+ pkt_dev->svlan_p);
+ svlan_encapsulated_proto = skb_put(skb,
+ sizeof(__be16));
+ *svlan_encapsulated_proto = htons(ETH_P_8021Q);
+ }
+ vlan_tci = skb_put(skb, sizeof(__be16));
+ *vlan_tci = build_tci(pkt_dev->vlan_id,
+ pkt_dev->vlan_cfi,
+ pkt_dev->vlan_p);
+ vlan_encapsulated_proto = skb_put(skb, sizeof(__be16));
+ *vlan_encapsulated_proto = htons(ETH_P_IPV6);
+ }
+
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb->len);
+ iph = skb_put(skb, sizeof(struct ipv6hdr));
+
+ skb_set_transport_header(skb, skb->len);
+ udph = skb_put(skb, sizeof(struct udphdr));
+ skb_set_queue_mapping(skb, queue_map);
+ skb->priority = pkt_dev->skb_priority;
+
+ memcpy(eth, pkt_dev->hh, 12);
+ *(__be16 *) &eth[12] = protocol;
+
+ /* Eth + IPh + UDPh + mpls */
+ datalen = pkt_dev->cur_pkt_size - 14 -
+ sizeof(struct ipv6hdr) - sizeof(struct udphdr) -
+ pkt_dev->pkt_overhead;
+
+ if (datalen < 0 || datalen < sizeof(struct pktgen_hdr)) {
+ datalen = sizeof(struct pktgen_hdr);
+ net_info_ratelimited("increased datalen to %d\n", datalen);
+ }
+
+ udplen = datalen + sizeof(struct udphdr);
+ udph->source = htons(pkt_dev->cur_udp_src);
+ udph->dest = htons(pkt_dev->cur_udp_dst);
+ udph->len = htons(udplen);
+ udph->check = 0;
+
+ *(__be32 *) iph = htonl(0x60000000); /* Version + flow */
+
+ if (pkt_dev->traffic_class) {
+ /* Version + traffic class + flow (0) */
+ *(__be32 *)iph |= htonl(0x60000000 | (pkt_dev->traffic_class << 20));
+ }
+
+ iph->hop_limit = 32;
+
+ iph->payload_len = htons(udplen);
+ iph->nexthdr = IPPROTO_UDP;
+
+ iph->daddr = pkt_dev->cur_in6_daddr;
+ iph->saddr = pkt_dev->cur_in6_saddr;
+
+ skb->protocol = protocol;
+ skb->dev = odev;
+ skb->pkt_type = PACKET_HOST;
+
+ pktgen_finalize_skb(pkt_dev, skb, datalen);
+
+ if (!(pkt_dev->flags & F_UDPCSUM)) {
+ skb->ip_summed = CHECKSUM_NONE;
+ } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM)) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ udph->check = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, 0);
+ } else {
+ __wsum csum = skb_checksum(skb, skb_transport_offset(skb), udplen, 0);
+
+ /* add protocol-dependent pseudo-header */
+ udph->check = csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, csum);
+
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
+ }
+
+ return skb;
+}
+
+static struct sk_buff *fill_packet(struct net_device *odev,
+ struct pktgen_dev *pkt_dev)
+{
+ if (pkt_dev->flags & F_IPV6)
+ return fill_packet_ipv6(odev, pkt_dev);
+ else
+ return fill_packet_ipv4(odev, pkt_dev);
+}
+
+static void pktgen_clear_counters(struct pktgen_dev *pkt_dev)
+{
+ pkt_dev->seq_num = 1;
+ pkt_dev->idle_acc = 0;
+ pkt_dev->sofar = 0;
+ pkt_dev->tx_bytes = 0;
+ pkt_dev->errors = 0;
+}
+
+/* Set up structure for sending pkts, clear counters */
+
+static void pktgen_run(struct pktgen_thread *t)
+{
+ struct pktgen_dev *pkt_dev;
+ int started = 0;
+
+ func_enter();
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list) {
+
+ /*
+ * setup odev and create initial packet.
+ */
+ pktgen_setup_inject(pkt_dev);
+
+ if (pkt_dev->odev) {
+ pktgen_clear_counters(pkt_dev);
+ pkt_dev->skb = NULL;
+ pkt_dev->started_at = pkt_dev->next_tx = ktime_get();
+
+ set_pkt_overhead(pkt_dev);
+
+ strcpy(pkt_dev->result, "Starting");
+ pkt_dev->running = 1; /* Cranke yeself! */
+ started++;
+ } else
+ strcpy(pkt_dev->result, "Error starting");
+ }
+ rcu_read_unlock();
+ if (started)
+ t->control &= ~(T_STOP);
+}
+
+static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn)
+{
+ struct pktgen_thread *t;
+
+ func_enter();
+
+ mutex_lock(&pktgen_thread_lock);
+
+ list_for_each_entry(t, &pn->pktgen_threads, th_list)
+ t->control |= T_STOP;
+
+ mutex_unlock(&pktgen_thread_lock);
+}
+
+static int thread_is_running(const struct pktgen_thread *t)
+{
+ const struct pktgen_dev *pkt_dev;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list)
+ if (pkt_dev->running) {
+ rcu_read_unlock();
+ return 1;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+static int pktgen_wait_thread_run(struct pktgen_thread *t)
+{
+ while (thread_is_running(t)) {
+
+ /* note: 't' will still be around even after the unlock/lock
+ * cycle because pktgen_thread threads are only cleared at
+ * net exit
+ */
+ mutex_unlock(&pktgen_thread_lock);
+ msleep_interruptible(100);
+ mutex_lock(&pktgen_thread_lock);
+
+ if (signal_pending(current))
+ goto signal;
+ }
+ return 1;
+signal:
+ return 0;
+}
+
+static int pktgen_wait_all_threads_run(struct pktgen_net *pn)
+{
+ struct pktgen_thread *t;
+ int sig = 1;
+
+ /* prevent racing with rmmod */
+ if (!try_module_get(THIS_MODULE))
+ return sig;
+
+ mutex_lock(&pktgen_thread_lock);
+
+ list_for_each_entry(t, &pn->pktgen_threads, th_list) {
+ sig = pktgen_wait_thread_run(t);
+ if (sig == 0)
+ break;
+ }
+
+ if (sig == 0)
+ list_for_each_entry(t, &pn->pktgen_threads, th_list)
+ t->control |= (T_STOP);
+
+ mutex_unlock(&pktgen_thread_lock);
+ module_put(THIS_MODULE);
+ return sig;
+}
+
+static void pktgen_run_all_threads(struct pktgen_net *pn)
+{
+ struct pktgen_thread *t;
+
+ func_enter();
+
+ mutex_lock(&pktgen_thread_lock);
+
+ list_for_each_entry(t, &pn->pktgen_threads, th_list)
+ t->control |= (T_RUN);
+
+ mutex_unlock(&pktgen_thread_lock);
+
+ /* Propagate thread->control */
+ schedule_timeout_interruptible(msecs_to_jiffies(125));
+
+ pktgen_wait_all_threads_run(pn);
+}
+
+static void pktgen_reset_all_threads(struct pktgen_net *pn)
+{
+ struct pktgen_thread *t;
+
+ func_enter();
+
+ mutex_lock(&pktgen_thread_lock);
+
+ list_for_each_entry(t, &pn->pktgen_threads, th_list)
+ t->control |= (T_REMDEVALL);
+
+ mutex_unlock(&pktgen_thread_lock);
+
+ /* Propagate thread->control */
+ schedule_timeout_interruptible(msecs_to_jiffies(125));
+
+ pktgen_wait_all_threads_run(pn);
+}
+
+static void show_results(struct pktgen_dev *pkt_dev, int nr_frags)
+{
+ __u64 bps, mbps, pps;
+ char *p = pkt_dev->result;
+ ktime_t elapsed = ktime_sub(pkt_dev->stopped_at,
+ pkt_dev->started_at);
+ ktime_t idle = ns_to_ktime(pkt_dev->idle_acc);
+
+ p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n",
+ (unsigned long long)ktime_to_us(elapsed),
+ (unsigned long long)ktime_to_us(ktime_sub(elapsed, idle)),
+ (unsigned long long)ktime_to_us(idle),
+ (unsigned long long)pkt_dev->sofar,
+ pkt_dev->cur_pkt_size, nr_frags);
+
+ pps = div64_u64(pkt_dev->sofar * NSEC_PER_SEC,
+ ktime_to_ns(elapsed));
+
+ bps = pps * 8 * pkt_dev->cur_pkt_size;
+
+ mbps = bps;
+ do_div(mbps, 1000000);
+ p += sprintf(p, " %llupps %lluMb/sec (%llubps) errors: %llu",
+ (unsigned long long)pps,
+ (unsigned long long)mbps,
+ (unsigned long long)bps,
+ (unsigned long long)pkt_dev->errors);
+}
+
+/* Set stopped-at timer, remove from running list, do counters & statistics */
+static int pktgen_stop_device(struct pktgen_dev *pkt_dev)
+{
+ int nr_frags = pkt_dev->skb ? skb_shinfo(pkt_dev->skb)->nr_frags : -1;
+
+ if (!pkt_dev->running) {
+ pr_warn("interface: %s is already stopped\n",
+ pkt_dev->odevname);
+ return -EINVAL;
+ }
+
+ pkt_dev->running = 0;
+ kfree_skb(pkt_dev->skb);
+ pkt_dev->skb = NULL;
+ pkt_dev->stopped_at = ktime_get();
+
+ show_results(pkt_dev, nr_frags);
+
+ return 0;
+}
+
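+/* Return the running device with the earliest next_tx time, or NULL. */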
+static struct pktgen_dev *next_to_run(struct pktgen_thread *t)
+{
+ struct pktgen_dev *pkt_dev, *best = NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list) {
+ if (!pkt_dev->running)
+ continue;
+ if (best == NULL)
+ best = pkt_dev;
+ else if (ktime_compare(pkt_dev->next_tx, best->next_tx) < 0)
+ best = pkt_dev;
+ }
+ rcu_read_unlock();
+
+ return best;
+}
+
+static void pktgen_stop(struct pktgen_thread *t)
+{
+ struct pktgen_dev *pkt_dev;
+
+ func_enter();
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list) {
+ pktgen_stop_device(pkt_dev);
+ }
+
+ rcu_read_unlock();
+}
+
+/*
+ * one of our devices needs to be removed - find it
+ * and remove it
+ */
+static void pktgen_rem_one_if(struct pktgen_thread *t)
+{
+ struct list_head *q, *n;
+ struct pktgen_dev *cur;
+
+ func_enter();
+
+ list_for_each_safe(q, n, &t->if_list) {
+ cur = list_entry(q, struct pktgen_dev, list);
+
+ if (!cur->removal_mark)
+ continue;
+
+ kfree_skb(cur->skb);
+ cur->skb = NULL;
+
+ pktgen_remove_device(t, cur);
+
+ break;
+ }
+}
+
+static void pktgen_rem_all_ifs(struct pktgen_thread *t)
+{
+ struct list_head *q, *n;
+ struct pktgen_dev *cur;
+
+ func_enter();
+
+ /* Remove all devices, free mem */
+
+ list_for_each_safe(q, n, &t->if_list) {
+ cur = list_entry(q, struct pktgen_dev, list);
+
+ kfree_skb(cur->skb);
+ cur->skb = NULL;
+
+ pktgen_remove_device(t, cur);
+ }
+}
+
+static void pktgen_rem_thread(struct pktgen_thread *t)
+{
+ /* Remove from the thread list */
+ remove_proc_entry(t->tsk->comm, t->net->proc_dir);
+}
+
+static void pktgen_resched(struct pktgen_dev *pkt_dev)
+{
+ ktime_t idle_start = ktime_get();
+ schedule();
+ pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));
+}
+
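+/* Busy-wait (rescheduling when needed) until all other references on the
+ * last skb have been dropped, accounting the wait as idle time.
+ */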
+static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)
+{
+ ktime_t idle_start = ktime_get();
+
+ while (refcount_read(&(pkt_dev->skb->users)) != 1) {
+ if (signal_pending(current))
+ break;
+
+ if (need_resched())
+ pktgen_resched(pkt_dev);
+ else
+ cpu_relax();
+ }
+ pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));
+}
+
+static void pktgen_xmit(struct pktgen_dev *pkt_dev)
+{
+ unsigned int burst = READ_ONCE(pkt_dev->burst);
+ struct net_device *odev = pkt_dev->odev;
+ struct netdev_queue *txq;
+ struct sk_buff *skb;
+ int ret;
+
+ /* If device is offline, then don't send */
+ if (unlikely(!netif_running(odev) || !netif_carrier_ok(odev))) {
+ pktgen_stop_device(pkt_dev);
+ return;
+ }
+
+ /* This is the max DELAY; it has the special meaning of
+ * "never transmit"
+ */
+ if (unlikely(pkt_dev->delay == ULLONG_MAX)) {
+ pkt_dev->next_tx = ktime_add_ns(ktime_get(), ULONG_MAX);
+ return;
+ }
+
+ /* If no skb or clone count exhausted then get new one */
+ if (!pkt_dev->skb || (pkt_dev->last_ok &&
+ ++pkt_dev->clone_count >= pkt_dev->clone_skb)) {
+ /* build a new pkt */
+ kfree_skb(pkt_dev->skb);
+
+ pkt_dev->skb = fill_packet(odev, pkt_dev);
+ if (pkt_dev->skb == NULL) {
+ pr_err("ERROR: couldn't allocate skb in fill_packet\n");
+ schedule();
+ pkt_dev->clone_count--; /* back out increment, OOM */
+ return;
+ }
+ pkt_dev->last_pkt_size = pkt_dev->skb->len;
+ pkt_dev->clone_count = 0; /* reset counter */
+ }
+
+ if (pkt_dev->delay && pkt_dev->last_ok)
+ spin(pkt_dev, pkt_dev->next_tx);
+
+ if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) {
+ skb = pkt_dev->skb;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ refcount_add(burst, &skb->users);
+ local_bh_disable();
+ do {
+ ret = netif_receive_skb(skb);
+ if (ret == NET_RX_DROP)
+ pkt_dev->errors++;
+ pkt_dev->sofar++;
+ pkt_dev->seq_num++;
+ if (refcount_read(&skb->users) != burst) {
+ /* skb was queued by rps/rfs or taps,
+ * so cannot reuse this skb
+ */
+ WARN_ON(refcount_sub_and_test(burst - 1, &skb->users));
+ /* get out of the loop and wait
+ * until skb is consumed
+ */
+ break;
+ }
+ /* skb was 'freed' by the stack, so clean a few
+ * bits and reuse it
+ */
+ skb_reset_tc(skb);
+ } while (--burst > 0);
+ goto out; /* Skips xmit_mode M_START_XMIT */
+ } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
+ local_bh_disable();
+ refcount_inc(&pkt_dev->skb->users);
+
+ ret = dev_queue_xmit(pkt_dev->skb);
+ switch (ret) {
+ case NET_XMIT_SUCCESS:
+ pkt_dev->sofar++;
+ pkt_dev->seq_num++;
+ pkt_dev->tx_bytes += pkt_dev->last_pkt_size;
+ break;
+ case NET_XMIT_DROP:
+ case NET_XMIT_CN:
+ /* These are all valid return codes for a qdisc but
+ * indicate packets are being dropped or will likely
+ * be dropped soon.
+ */
+ case NETDEV_TX_BUSY:
+ /* qdisc may call dev_hard_start_xmit directly in cases
+ * where no queues exist e.g. loopback device, virtual
+ * devices, etc. In this case we need to handle
+ * NETDEV_TX_ codes.
+ */
+ default:
+ pkt_dev->errors++;
+ net_info_ratelimited("%s xmit error: %d\n",
+ pkt_dev->odevname, ret);
+ break;
+ }
+ goto out;
+ }
+
+ txq = skb_get_tx_queue(odev, pkt_dev->skb);
+
+ local_bh_disable();
+
+ HARD_TX_LOCK(odev, txq, smp_processor_id());
+
+ if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) {
+ ret = NETDEV_TX_BUSY;
+ pkt_dev->last_ok = 0;
+ goto unlock;
+ }
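+ /* Take one reference per packet in the burst so the skb survives the
+ * driver releasing it between the xmit_more submissions below.
+ */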
+ refcount_add(burst, &pkt_dev->skb->users);
+
+xmit_more:
+ ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0);
+
+ switch (ret) {
+ case NETDEV_TX_OK:
+ pkt_dev->last_ok = 1;
+ pkt_dev->sofar++;
+ pkt_dev->seq_num++;
+ pkt_dev->tx_bytes += pkt_dev->last_pkt_size;
+ if (burst > 0 && !netif_xmit_frozen_or_drv_stopped(txq))
+ goto xmit_more;
+ break;
+ case NET_XMIT_DROP:
+ case NET_XMIT_CN:
+ /* skb has been consumed */
+ pkt_dev->errors++;
+ break;
+ default: /* Drivers are not supposed to return other values! */
+ net_info_ratelimited("%s xmit error: %d\n",
+ pkt_dev->odevname, ret);
+ pkt_dev->errors++;
+ /* fallthru */
+ case NETDEV_TX_BUSY:
+ /* Retry it next time */
+ refcount_dec(&(pkt_dev->skb->users));
+ pkt_dev->last_ok = 0;
+ }
+ if (unlikely(burst))
+ WARN_ON(refcount_sub_and_test(burst, &pkt_dev->skb->users));
+unlock:
+ HARD_TX_UNLOCK(odev, txq);
+
+out:
+ local_bh_enable();
+
+ /* If pkt_dev->count is zero, then run forever */
+ if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
+ pktgen_wait_for_skb(pkt_dev);
+
+ /* Done with this */
+ pktgen_stop_device(pkt_dev);
+ }
+}
+
+/*
+ * Main loop of the thread goes here
+ */
+
+static int pktgen_thread_worker(void *arg)
+{
+ DEFINE_WAIT(wait);
+ struct pktgen_thread *t = arg;
+ struct pktgen_dev *pkt_dev = NULL;
+ int cpu = t->cpu;
+
+ WARN_ON(smp_processor_id() != cpu);
+
+ init_waitqueue_head(&t->queue);
+ complete(&t->start_done);
+
+ pr_debug("starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current));
+
+ set_freezable();
+
+ while (!kthread_should_stop()) {
+ pkt_dev = next_to_run(t);
+
+ if (unlikely(!pkt_dev && t->control == 0)) {
+ if (t->net->pktgen_exiting)
+ break;
+ wait_event_interruptible_timeout(t->queue,
+ t->control != 0,
+ HZ/10);
+ try_to_freeze();
+ continue;
+ }
+
+ if (likely(pkt_dev)) {
+ pktgen_xmit(pkt_dev);
+
+ if (need_resched())
+ pktgen_resched(pkt_dev);
+ else
+ cpu_relax();
+ }
+
+ if (t->control & T_STOP) {
+ pktgen_stop(t);
+ t->control &= ~(T_STOP);
+ }
+
+ if (t->control & T_RUN) {
+ pktgen_run(t);
+ t->control &= ~(T_RUN);
+ }
+
+ if (t->control & T_REMDEVALL) {
+ pktgen_rem_all_ifs(t);
+ t->control &= ~(T_REMDEVALL);
+ }
+
+ if (t->control & T_REMDEV) {
+ pktgen_rem_one_if(t);
+ t->control &= ~(T_REMDEV);
+ }
+
+ try_to_freeze();
+ }
+
+ pr_debug("%s stopping all device\n", t->tsk->comm);
+ pktgen_stop(t);
+
+ pr_debug("%s removing all device\n", t->tsk->comm);
+ pktgen_rem_all_ifs(t);
+
+ pr_debug("%s removing thread\n", t->tsk->comm);
+ pktgen_rem_thread(t);
+
+ return 0;
+}
+
+static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
+ const char *ifname, bool exact)
+{
+ struct pktgen_dev *p, *pkt_dev = NULL;
+ size_t len = strlen(ifname);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(p, &t->if_list, list)
+ if (strncmp(p->odevname, ifname, len) == 0) {
+ if (p->odevname[len]) {
+ if (exact || p->odevname[len] != '@')
+ continue;
+ }
+ pkt_dev = p;
+ break;
+ }
+
+ rcu_read_unlock();
+ pr_debug("find_dev(%s) returning %p\n", ifname, pkt_dev);
+ return pkt_dev;
+}
+
+/*
+ * Adds a dev at front of if_list.
+ */
+
+static int add_dev_to_thread(struct pktgen_thread *t,
+ struct pktgen_dev *pkt_dev)
+{
+ int rv = 0;
+
+ /* This function cannot be called concurrently, as it's called
+ * under the pktgen_thread_lock mutex, but it can run from
+ * userspace on a different CPU than the kthread. The if_lock()
+ * is used here to sync with concurrent instances of
+ * _rem_dev_from_if_list() invoked via the kthread, which also
+ * updates the if_list.
+ */
+ if_lock(t);
+
+ if (pkt_dev->pg_thread) {
+ pr_err("ERROR: already assigned to a thread\n");
+ rv = -EBUSY;
+ goto out;
+ }
+
+ pkt_dev->running = 0;
+ pkt_dev->pg_thread = t;
+ list_add_rcu(&pkt_dev->list, &t->if_list);
+
+out:
+ if_unlock(t);
+ return rv;
+}
+
+/* Called under thread lock */
+
+static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
+{
+ struct pktgen_dev *pkt_dev;
+ int err;
+ int node = cpu_to_node(t->cpu);
+
+ /* We don't allow a device to be on several threads */
+
+ pkt_dev = __pktgen_NN_threads(t->net, ifname, FIND);
+ if (pkt_dev) {
+ pr_err("ERROR: interface already used\n");
+ return -EBUSY;
+ }
+
+ pkt_dev = kzalloc_node(sizeof(struct pktgen_dev), GFP_KERNEL, node);
+ if (!pkt_dev)
+ return -ENOMEM;
+
+ strcpy(pkt_dev->odevname, ifname);
+ pkt_dev->flows = vzalloc_node(array_size(MAX_CFLOWS,
+ sizeof(struct flow_state)),
+ node);
+ if (pkt_dev->flows == NULL) {
+ kfree(pkt_dev);
+ return -ENOMEM;
+ }
+
+ pkt_dev->removal_mark = 0;
+ pkt_dev->nfrags = 0;
+ pkt_dev->delay = pg_delay_d;
+ pkt_dev->count = pg_count_d;
+ pkt_dev->sofar = 0;
+ pkt_dev->udp_src_min = 9; /* sink port */
+ pkt_dev->udp_src_max = 9;
+ pkt_dev->udp_dst_min = 9;
+ pkt_dev->udp_dst_max = 9;
+ pkt_dev->vlan_p = 0;
+ pkt_dev->vlan_cfi = 0;
+ pkt_dev->vlan_id = 0xffff;
+ pkt_dev->svlan_p = 0;
+ pkt_dev->svlan_cfi = 0;
+ pkt_dev->svlan_id = 0xffff;
+ pkt_dev->burst = 1;
+ pkt_dev->node = -1;
+
+ err = pktgen_setup_dev(t->net, pkt_dev, ifname);
+ if (err)
+ goto out1;
+ if (pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)
+ pkt_dev->clone_skb = pg_clone_skb_d;
+
+ pkt_dev->entry = proc_create_data(ifname, 0600, t->net->proc_dir,
+ &pktgen_if_fops, pkt_dev);
+ if (!pkt_dev->entry) {
+ pr_err("cannot create %s/%s procfs entry\n",
+ PG_PROC_DIR, ifname);
+ err = -EINVAL;
+ goto out2;
+ }
+#ifdef CONFIG_XFRM
+ pkt_dev->ipsmode = XFRM_MODE_TRANSPORT;
+ pkt_dev->ipsproto = IPPROTO_ESP;
+
+ /* xfrm tunnel mode needs an additional dst to extract the outer
+ * IP header protocol/ttl/id fields, so create a phony one here
+ * instead of looking for a valid rt, which would definitely hurt
+ * performance under such circumstances.
+ */
+ pkt_dev->dstops.family = AF_INET;
+ pkt_dev->xdst.u.dst.dev = pkt_dev->odev;
+ dst_init_metrics(&pkt_dev->xdst.u.dst, pktgen_dst_metrics, false);
+ pkt_dev->xdst.child = &pkt_dev->xdst.u.dst;
+ pkt_dev->xdst.u.dst.ops = &pkt_dev->dstops;
+#endif
+
+ return add_dev_to_thread(t, pkt_dev);
+out2:
+ dev_put(pkt_dev->odev);
+out1:
+#ifdef CONFIG_XFRM
+ free_SAs(pkt_dev);
+#endif
+ vfree(pkt_dev->flows);
+ kfree(pkt_dev);
+ return err;
+}
+
+static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn)
+{
+ struct pktgen_thread *t;
+ struct proc_dir_entry *pe;
+ struct task_struct *p;
+
+ t = kzalloc_node(sizeof(struct pktgen_thread), GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!t) {
+ pr_err("ERROR: out of memory, can't create new thread\n");
+ return -ENOMEM;
+ }
+
+ mutex_init(&t->if_lock);
+ t->cpu = cpu;
+
+ INIT_LIST_HEAD(&t->if_list);
+
+ list_add_tail(&t->th_list, &pn->pktgen_threads);
+ init_completion(&t->start_done);
+
+ p = kthread_create_on_node(pktgen_thread_worker,
+ t,
+ cpu_to_node(cpu),
+ "kpktgend_%d", cpu);
+ if (IS_ERR(p)) {
+ pr_err("kernel_thread() failed for cpu %d\n", t->cpu);
+ list_del(&t->th_list);
+ kfree(t);
+ return PTR_ERR(p);
+ }
+ kthread_bind(p, cpu);
+ t->tsk = p;
+
+ pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir,
+ &pktgen_thread_fops, t);
+ if (!pe) {
+ pr_err("cannot create %s/%s procfs entry\n",
+ PG_PROC_DIR, t->tsk->comm);
+ kthread_stop(p);
+ list_del(&t->th_list);
+ kfree(t);
+ return -EINVAL;
+ }
+
+ t->net = pn;
+ get_task_struct(p);
+ wake_up_process(p);
+ wait_for_completion(&t->start_done);
+
+ return 0;
+}
+
+/*
+ * Removes a device from the thread if_list.
+ */
+static void _rem_dev_from_if_list(struct pktgen_thread *t,
+ struct pktgen_dev *pkt_dev)
+{
+ struct list_head *q, *n;
+ struct pktgen_dev *p;
+
+ if_lock(t);
+ list_for_each_safe(q, n, &t->if_list) {
+ p = list_entry(q, struct pktgen_dev, list);
+ if (p == pkt_dev)
+ list_del_rcu(&p->list);
+ }
+ if_unlock(t);
+}
+
+static int pktgen_remove_device(struct pktgen_thread *t,
+ struct pktgen_dev *pkt_dev)
+{
+ pr_debug("remove_device pkt_dev=%p\n", pkt_dev);
+
+ if (pkt_dev->running) {
+ pr_warn("WARNING: trying to remove a running interface, stopping it now\n");
+ pktgen_stop_device(pkt_dev);
+ }
+
+ /* Dis-associate from the interface */
+
+ if (pkt_dev->odev) {
+ dev_put(pkt_dev->odev);
+ pkt_dev->odev = NULL;
+ }
+
+ /* Remove proc before the if_list entry, because add_device uses
+ * the list to determine if the interface already exists; this
+ * avoids a race with proc_create_data().
+ */
+ proc_remove(pkt_dev->entry);
+
+ /* And update the thread if_list */
+ _rem_dev_from_if_list(t, pkt_dev);
+
+#ifdef CONFIG_XFRM
+ free_SAs(pkt_dev);
+#endif
+ vfree(pkt_dev->flows);
+ if (pkt_dev->page)
+ put_page(pkt_dev->page);
+ kfree_rcu(pkt_dev, rcu);
+ return 0;
+}
+
+static int __net_init pg_net_init(struct net *net)
+{
+ struct pktgen_net *pn = net_generic(net, pg_net_id);
+ struct proc_dir_entry *pe;
+ int cpu, ret = 0;
+
+ pn->net = net;
+ INIT_LIST_HEAD(&pn->pktgen_threads);
+ pn->pktgen_exiting = false;
+ pn->proc_dir = proc_mkdir(PG_PROC_DIR, pn->net->proc_net);
+ if (!pn->proc_dir) {
+ pr_warn("cannot create /proc/net/%s\n", PG_PROC_DIR);
+ return -ENODEV;
+ }
+ pe = proc_create(PGCTRL, 0600, pn->proc_dir, &pktgen_fops);
+ if (pe == NULL) {
+ pr_err("cannot create %s procfs entry\n", PGCTRL);
+ ret = -EINVAL;
+ goto remove;
+ }
+
+ for_each_online_cpu(cpu) {
+ int err;
+
+ err = pktgen_create_thread(cpu, pn);
+ if (err)
+ pr_warn("Cannot create thread for cpu %d (%d)\n",
+ cpu, err);
+ }
+
+ if (list_empty(&pn->pktgen_threads)) {
+ pr_err("Initialization failed for all threads\n");
+ ret = -ENODEV;
+ goto remove_entry;
+ }
+
+ return 0;
+
+remove_entry:
+ remove_proc_entry(PGCTRL, pn->proc_dir);
+remove:
+ remove_proc_entry(PG_PROC_DIR, pn->net->proc_net);
+ return ret;
+}
+
+static void __net_exit pg_net_exit(struct net *net)
+{
+ struct pktgen_net *pn = net_generic(net, pg_net_id);
+ struct pktgen_thread *t;
+ struct list_head *q, *n;
+ LIST_HEAD(list);
+
+ /* Stop all interfaces & threads */
+ pn->pktgen_exiting = true;
+
+ mutex_lock(&pktgen_thread_lock);
+ list_splice_init(&pn->pktgen_threads, &list);
+ mutex_unlock(&pktgen_thread_lock);
+
+ list_for_each_safe(q, n, &list) {
+ t = list_entry(q, struct pktgen_thread, th_list);
+ list_del(&t->th_list);
+ kthread_stop(t->tsk);
+ put_task_struct(t->tsk);
+ kfree(t);
+ }
+
+ remove_proc_entry(PGCTRL, pn->proc_dir);
+ remove_proc_entry(PG_PROC_DIR, pn->net->proc_net);
+}
+
+static struct pernet_operations pg_net_ops = {
+ .init = pg_net_init,
+ .exit = pg_net_exit,
+ .id = &pg_net_id,
+ .size = sizeof(struct pktgen_net),
+};
+
+static int __init pg_init(void)
+{
+ int ret = 0;
+
+ pr_info("%s", version);
+ ret = register_pernet_subsys(&pg_net_ops);
+ if (ret)
+ return ret;
+ ret = register_netdevice_notifier(&pktgen_notifier_block);
+ if (ret)
+ unregister_pernet_subsys(&pg_net_ops);
+
+ return ret;
+}
+
+static void __exit pg_cleanup(void)
+{
+ unregister_netdevice_notifier(&pktgen_notifier_block);
+ unregister_pernet_subsys(&pg_net_ops);
+ /* Don't need rcu_barrier() due to use of kfree_rcu() */
+}
+
+module_init(pg_init);
+module_exit(pg_cleanup);
+
+MODULE_AUTHOR("Robert Olsson <robert.olsson@its.uu.se>");
+MODULE_DESCRIPTION("Packet Generator tool");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(VERSION);
+module_param(pg_count_d, int, 0);
+MODULE_PARM_DESC(pg_count_d, "Default number of packets to inject");
+module_param(pg_delay_d, int, 0);
+MODULE_PARM_DESC(pg_delay_d, "Default delay between packets (nanoseconds)");
+module_param(pg_clone_skb_d, int, 0);
+MODULE_PARM_DESC(pg_clone_skb_d, "Default number of copies of the same packet");
+module_param(debug, int, 0);
+MODULE_PARM_DESC(debug, "Enable debugging of pktgen module");
diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c
new file mode 100644
index 000000000..703cf76aa
--- /dev/null
+++ b/net/core/ptp_classifier.c
@@ -0,0 +1,193 @@
+/* PTP classifier
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/* The below program is the bpf_asm (tools/net/) representation of
+ * the opcode array in the ptp_filter structure.
+ *
+ * For convenience, this can easily be altered and reviewed with
+ * bpf_asm and bpf_dbg, e.g. `./bpf_asm -c prog` where prog is a
+ * simple file containing the below program:
+ *
+ * ldh [12] ; load ethertype
+ *
+ * ; PTP over UDP over IPv4 over Ethernet
+ * test_ipv4:
+ * jneq #0x800, test_ipv6 ; ETH_P_IP ?
+ * ldb [23] ; load proto
+ * jneq #17, drop_ipv4 ; IPPROTO_UDP ?
+ * ldh [20] ; load frag offset field
+ * jset #0x1fff, drop_ipv4 ; don't allow fragments
+ * ldxb 4*([14]&0xf) ; load IP header len
+ * ldh [x + 16] ; load UDP dst port
+ * jneq #319, drop_ipv4 ; is port PTP_EV_PORT ?
+ * ldh [x + 22] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x10 ; PTP_CLASS_IPV4
+ * ret a ; return PTP class
+ * drop_ipv4: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over UDP over IPv6 over Ethernet
+ * test_ipv6:
+ * jneq #0x86dd, test_8021q ; ETH_P_IPV6 ?
+ * ldb [20] ; load proto
+ * jneq #17, drop_ipv6 ; IPPROTO_UDP ?
+ * ldh [56] ; load UDP dst port
+ * jneq #319, drop_ipv6 ; is port PTP_EV_PORT ?
+ * ldh [62] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x20 ; PTP_CLASS_IPV6
+ * ret a ; return PTP class
+ * drop_ipv6: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over 802.1Q over Ethernet
+ * test_8021q:
+ * jneq #0x8100, test_ieee1588 ; ETH_P_8021Q ?
+ * ldh [16] ; load inner type
+ * jneq #0x88f7, test_8021q_ipv4 ; ETH_P_1588 ?
+ * ldb [18] ; load payload
+ * and #0x8 ; as we don't have ports here, test
+ * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
+ * ldh [18] ; reload payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0xc0 ; PTP_CLASS_VLAN|PTP_CLASS_L2
+ * ret a ; return PTP class
+ *
+ * ; PTP over UDP over IPv4 over 802.1Q over Ethernet
+ * test_8021q_ipv4:
+ * jneq #0x800, test_8021q_ipv6 ; ETH_P_IP ?
+ * ldb [27] ; load proto
+ * jneq #17, drop_8021q_ipv4 ; IPPROTO_UDP ?
+ * ldh [24] ; load frag offset field
+ * jset #0x1fff, drop_8021q_ipv4; don't allow fragments
+ * ldxb 4*([18]&0xf) ; load IP header len
+ * ldh [x + 20] ; load UDP dst port
+ * jneq #319, drop_8021q_ipv4 ; is port PTP_EV_PORT ?
+ * ldh [x + 26] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x90 ; PTP_CLASS_VLAN|PTP_CLASS_IPV4
+ * ret a ; return PTP class
+ * drop_8021q_ipv4: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over UDP over IPv6 over 802.1Q over Ethernet
+ * test_8021q_ipv6:
+ * jneq #0x86dd, drop_8021q_ipv6 ; ETH_P_IPV6 ?
+ * ldb [24] ; load proto
+ * jneq #17, drop_8021q_ipv6 ; IPPROTO_UDP ?
+ * ldh [60] ; load UDP dst port
+ * jneq #319, drop_8021q_ipv6 ; is port PTP_EV_PORT ?
+ * ldh [66] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0xa0 ; PTP_CLASS_VLAN|PTP_CLASS_IPV6
+ * ret a ; return PTP class
+ * drop_8021q_ipv6: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over Ethernet
+ * test_ieee1588:
+ * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ?
+ * ldb [14] ; load payload
+ * and #0x8 ; as we don't have ports here, test
+ * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
+ * ldh [14] ; reload payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x40 ; PTP_CLASS_L2
+ * ret a ; return PTP class
+ * drop_ieee1588: ret #0x0 ; PTP_CLASS_NONE
+ */
+
+#include <linux/skbuff.h>
+#include <linux/filter.h>
+#include <linux/ptp_classify.h>
+
+static struct bpf_prog *ptp_insns __read_mostly;
+
+unsigned int ptp_classify_raw(const struct sk_buff *skb)
+{
+ return BPF_PROG_RUN(ptp_insns, skb);
+}
+EXPORT_SYMBOL_GPL(ptp_classify_raw);
+
+void __init ptp_classifier_init(void)
+{
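+ /* Raw classic-BPF opcodes; this is the compiled form of the bpf_asm
+ * program documented in the comment at the top of this file.
+ */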
+ static struct sock_filter ptp_filter[] __initdata = {
+ { 0x28, 0, 0, 0x0000000c },
+ { 0x15, 0, 12, 0x00000800 },
+ { 0x30, 0, 0, 0x00000017 },
+ { 0x15, 0, 9, 0x00000011 },
+ { 0x28, 0, 0, 0x00000014 },
+ { 0x45, 7, 0, 0x00001fff },
+ { 0xb1, 0, 0, 0x0000000e },
+ { 0x48, 0, 0, 0x00000010 },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x48, 0, 0, 0x00000016 },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000010 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 9, 0x000086dd },
+ { 0x30, 0, 0, 0x00000014 },
+ { 0x15, 0, 6, 0x00000011 },
+ { 0x28, 0, 0, 0x00000038 },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x28, 0, 0, 0x0000003e },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000020 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 32, 0x00008100 },
+ { 0x28, 0, 0, 0x00000010 },
+ { 0x15, 0, 7, 0x000088f7 },
+ { 0x30, 0, 0, 0x00000012 },
+ { 0x54, 0, 0, 0x00000008 },
+ { 0x15, 0, 35, 0x00000000 },
+ { 0x28, 0, 0, 0x00000012 },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x000000c0 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x15, 0, 12, 0x00000800 },
+ { 0x30, 0, 0, 0x0000001b },
+ { 0x15, 0, 9, 0x00000011 },
+ { 0x28, 0, 0, 0x00000018 },
+ { 0x45, 7, 0, 0x00001fff },
+ { 0xb1, 0, 0, 0x00000012 },
+ { 0x48, 0, 0, 0x00000014 },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x48, 0, 0, 0x0000001a },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000090 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 8, 0x000086dd },
+ { 0x30, 0, 0, 0x00000018 },
+ { 0x15, 0, 6, 0x00000011 },
+ { 0x28, 0, 0, 0x0000003c },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x28, 0, 0, 0x00000042 },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x000000a0 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 7, 0x000088f7 },
+ { 0x30, 0, 0, 0x0000000e },
+ { 0x54, 0, 0, 0x00000008 },
+ { 0x15, 0, 4, 0x00000000 },
+ { 0x28, 0, 0, 0x0000000e },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000040 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ };
+ struct sock_fprog_kern ptp_prog = {
+ .len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter,
+ };
+
+ BUG_ON(bpf_prog_create(&ptp_insns, &ptp_prog));
+}
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
new file mode 100644
index 000000000..9b8727c67
--- /dev/null
+++ b/net/core/request_sock.c
@@ -0,0 +1,136 @@
+/*
+ * NET Generic infrastructure for Network protocols.
+ *
+ * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * From code originally in include/net/tcp.h
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/tcp.h>
+#include <linux/vmalloc.h>
+
+#include <net/request_sock.h>
+
+/*
+ * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
+ * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
+ * It would be better to replace it with a global counter for all sockets
+ * but then some measure against one socket starving all other sockets
+ * would be needed.
+ *
+ * Its minimum value is 128. Experiments with real servers show that
+ * it is absolutely not enough even at 100 conn/sec. 256 cures most
+ * of the problems.
+ * This value is adjusted to 128 for low memory machines,
+ * and it will increase in proportion to the memory of machine.
+ * Note: Don't forget somaxconn, which may limit the backlog too.
+ */
+
+void reqsk_queue_alloc(struct request_sock_queue *queue)
+{
+ spin_lock_init(&queue->rskq_lock);
+
+ spin_lock_init(&queue->fastopenq.lock);
+ queue->fastopenq.rskq_rst_head = NULL;
+ queue->fastopenq.rskq_rst_tail = NULL;
+ queue->fastopenq.qlen = 0;
+
+ queue->rskq_accept_head = NULL;
+}
+
+/*
+ * This function is called to set a Fast Open socket's "fastopen_rsk" field
+ * to NULL when a TFO socket no longer needs to access the request_sock.
+ * This happens only after 3WHS has been either completed or aborted (e.g.,
+ * RST is received).
+ *
+ * Before TFO, a child socket is created only after 3WHS is completed,
+ * hence it never needs to access the request_sock. Things get a lot more
+ * complex with TFO. A child socket, accepted or not, has to access its
+ * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
+ * until 3WHS is either completed or aborted. Afterwards the req will stay
+ * until either the child socket is accepted, or in the rare case when the
+ * listener is closed before the child is accepted.
+ *
+ * In short, a request socket is only freed after BOTH 3WHS has completed
+ * (or aborted) and the child socket has been accepted (or listener closed).
+ * When a child socket is accepted, its corresponding req->sk is set to
+ * NULL since it's no longer needed. More importantly, "req->sk == NULL"
+ * will be used by the code below to determine if a child socket has been
+ * accepted or not, and the check is protected by the fastopenq->lock
+ * described below.
+ *
+ * Note that fastopen_rsk is only accessed from the child socket's context
+ * with its socket lock held. But a request_sock (req) can be accessed by
+ * both its child socket through fastopen_rsk, and a listener socket through
+ * icsk_accept_queue.rskq_accept_head. To protect the access, a simple spin
+ * lock per listener, "icsk->icsk_accept_queue.fastopenq->lock", is created.
+ * Only in the rare case when both the listener and the child locks are held,
+ * e.g., in inet_csk_listen_stop(), do we not need to acquire the lock.
+ * The lock also protects other fields such as fastopenq->qlen, which is
+ * decremented by this function when fastopen_rsk is no longer needed.
+ *
+ * Note that another solution was to simply use the existing socket lock
+ * from the listener. But first socket lock is difficult to use. It is not
+ * a simple spin lock - one must consider sock_owned_by_user() and arrange
+ * to use sk_add_backlog() stuff. But what really makes it infeasible is the
+ * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
+ * acquire a child's lock while holding listener's socket lock. A corner
+ * case might also exist in tcp_v4_hnd_req() that will trigger this locking
+ * order.
+ *
+ * This function also sets "treq->tfo_listener" to false.
+ * treq->tfo_listener is used by the listener so it is protected by the
+ * fastopenq->lock in this function.
+ */
+void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
+ bool reset)
+{
+ struct sock *lsk = req->rsk_listener;
+ struct fastopen_queue *fastopenq;
+
+ fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq;
+
+ tcp_sk(sk)->fastopen_rsk = NULL;
+ spin_lock_bh(&fastopenq->lock);
+ fastopenq->qlen--;
+ tcp_rsk(req)->tfo_listener = false;
+ if (req->sk) /* the child socket hasn't been accepted yet */
+ goto out;
+
+ if (!reset || lsk->sk_state != TCP_LISTEN) {
+ /* If the listener has been closed don't bother with the
+ * special RST handling below.
+ */
+ spin_unlock_bh(&fastopenq->lock);
+ reqsk_put(req);
+ return;
+ }
+ /* Wait for 60 seconds before removing a req that has triggered RST.
+ * This is a simple defense against TFO spoofing attack - by
+ * counting the req against fastopen.max_qlen, and disabling
+ * TFO when the qlen exceeds max_qlen.
+ *
+ * For more details see CoNext'11 "TCP Fast Open" paper.
+ */
+ req->rsk_timer.expires = jiffies + 60*HZ;
+ if (fastopenq->rskq_rst_head == NULL)
+ fastopenq->rskq_rst_head = req;
+ else
+ fastopenq->rskq_rst_tail->dl_next = req;
+
+ req->dl_next = NULL;
+ fastopenq->rskq_rst_tail = req;
+ fastopenq->qlen++;
+out:
+ spin_unlock_bh(&fastopenq->lock);
+}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
new file mode 100644
index 000000000..2837cc03f
--- /dev/null
+++ b/net/core/rtnetlink.c
@@ -0,0 +1,4911 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Routing netlink socket interface: protocol independent part.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Vitaly E. Lavrov RTA_OK arithmetics was wrong.
+ */
+
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/capability.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/security.h>
+#include <linux/mutex.h>
+#include <linux/if_addr.h>
+#include <linux/if_bridge.h>
+#include <linux/if_vlan.h>
+#include <linux/pci.h>
+#include <linux/etherdevice.h>
+#include <linux/bpf.h>
+
+#include <linux/uaccess.h>
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/switchdev.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/arp.h>
+#include <net/route.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+#include <net/fib_rules.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+
+#define RTNL_MAX_TYPE 48
+#define RTNL_SLAVE_MAX_TYPE 36
+
+struct rtnl_link {
+ rtnl_doit_func doit;
+ rtnl_dumpit_func dumpit;
+ struct module *owner;
+ unsigned int flags;
+ struct rcu_head rcu;
+};
+
+static DEFINE_MUTEX(rtnl_mutex);
+
+void rtnl_lock(void)
+{
+ mutex_lock(&rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_lock);
+
+int rtnl_lock_killable(void)
+{
+ return mutex_lock_killable(&rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_lock_killable);
+
+static struct sk_buff *defer_kfree_skb_list;
+void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail)
+{
+ if (head && tail) {
+ tail->next = defer_kfree_skb_list;
+ defer_kfree_skb_list = head;
+ }
+}
+EXPORT_SYMBOL(rtnl_kfree_skbs);
+
+void __rtnl_unlock(void)
+{
+ struct sk_buff *head = defer_kfree_skb_list;
+
+ defer_kfree_skb_list = NULL;
+
+ mutex_unlock(&rtnl_mutex);
+
+ while (head) {
+ struct sk_buff *next = head->next;
+
+ kfree_skb(head);
+ cond_resched();
+ head = next;
+ }
+}
+
+void rtnl_unlock(void)
+{
+ /* This fellow will unlock it for us. */
+ netdev_run_todo();
+}
+EXPORT_SYMBOL(rtnl_unlock);
+
+int rtnl_trylock(void)
+{
+ return mutex_trylock(&rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_trylock);
+
+int rtnl_is_locked(void)
+{
+ return mutex_is_locked(&rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_is_locked);
+
+bool refcount_dec_and_rtnl_lock(refcount_t *r)
+{
+ return refcount_dec_and_mutex_lock(r, &rtnl_mutex);
+}
+EXPORT_SYMBOL(refcount_dec_and_rtnl_lock);
+
+#ifdef CONFIG_PROVE_LOCKING
+bool lockdep_rtnl_is_held(void)
+{
+ return lockdep_is_held(&rtnl_mutex);
+}
+EXPORT_SYMBOL(lockdep_rtnl_is_held);
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+
+static struct rtnl_link *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
+
+static inline int rtm_msgindex(int msgtype)
+{
+ int msgindex = msgtype - RTM_BASE;
+
+ /*
+ * msgindex < 0 implies someone tried to register a netlink
+ * control code. msgindex >= RTM_NR_MSGTYPES may indicate that
+ * the message type has not been added to linux/rtnetlink.h
+ */
+ BUG_ON(msgindex < 0 || msgindex >= RTM_NR_MSGTYPES);
+
+ return msgindex;
+}
+
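+/* Look up the doit/dumpit handlers for protocol:msgtype, falling back to
+ * the PF_UNSPEC table when the protocol has none registered.
+ */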
+static struct rtnl_link *rtnl_get_link(int protocol, int msgtype)
+{
+ struct rtnl_link **tab;
+
+ if (protocol >= ARRAY_SIZE(rtnl_msg_handlers))
+ protocol = PF_UNSPEC;
+
+ tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]);
+ if (!tab)
+ tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]);
+
+ return tab[msgtype];
+}
+
+static int rtnl_register_internal(struct module *owner,
+ int protocol, int msgtype,
+ rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+ unsigned int flags)
+{
+ struct rtnl_link *link, *old;
+ struct rtnl_link __rcu **tab;
+ int msgindex;
+ int ret = -ENOBUFS;
+
+ BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
+ msgindex = rtm_msgindex(msgtype);
+
+ rtnl_lock();
+ tab = rtnl_msg_handlers[protocol];
+ if (tab == NULL) {
+ tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL);
+ if (!tab)
+ goto unlock;
+
+ /* ensures we see the 0 stores */
+ rcu_assign_pointer(rtnl_msg_handlers[protocol], tab);
+ }
+
+ old = rtnl_dereference(tab[msgindex]);
+ if (old) {
+ link = kmemdup(old, sizeof(*old), GFP_KERNEL);
+ if (!link)
+ goto unlock;
+ } else {
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
+ if (!link)
+ goto unlock;
+ }
+
+ WARN_ON(link->owner && link->owner != owner);
+ link->owner = owner;
+
+ WARN_ON(doit && link->doit && link->doit != doit);
+ if (doit)
+ link->doit = doit;
+ WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit);
+ if (dumpit)
+ link->dumpit = dumpit;
+
+ link->flags |= flags;
+
+ /* publish protocol:msgtype */
+ rcu_assign_pointer(tab[msgindex], link);
+ ret = 0;
+ if (old)
+ kfree_rcu(old, rcu);
+unlock:
+ rtnl_unlock();
+ return ret;
+}
+
+/**
+ * rtnl_register_module - Register a rtnetlink message type
+ *
+ * @owner: module registering the hook (THIS_MODULE)
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ * @doit: Function pointer called for each request message
+ * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
+ * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions
+ *
+ * Like rtnl_register, but for use by removable modules.
+ */
+int rtnl_register_module(struct module *owner,
+ int protocol, int msgtype,
+ rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+ unsigned int flags)
+{
+ return rtnl_register_internal(owner, protocol, msgtype,
+ doit, dumpit, flags);
+}
+EXPORT_SYMBOL_GPL(rtnl_register_module);
+
+/**
+ * rtnl_register - Register a rtnetlink message type
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ * @doit: Function pointer called for each request message
+ * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
+ * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions
+ *
+ * Registers the specified function pointers (at least one of them has
+ * to be non-NULL) to be called whenever a request message for the
+ * specified protocol family and message type is received.
+ *
+ * The special protocol family PF_UNSPEC may be used to define fallback
+ * function pointers for the case when no entry for the specific protocol
+ * family exists.
+ */
+void rtnl_register(int protocol, int msgtype,
+ rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+ unsigned int flags)
+{
+ int err;
+
+ err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit,
+ flags);
+ if (err)
+ pr_err("Unable to register rtnetlink message handler, "
+ "protocol = %d, message type = %d\n", protocol, msgtype);
+}
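+
+/* Illustrative sketch (not part of the original file): built-in users call
+ * rtnl_register() at init time and rely on the pr_err() above for failure
+ * reporting, e.g. with hypothetical handlers
+ *
+ *	rtnl_register(PF_UNSPEC, RTM_GETLINK, my_getlink, my_dump_ifinfo, 0);
+ */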
+
+/**
+ * rtnl_unregister - Unregister a rtnetlink message type
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int rtnl_unregister(int protocol, int msgtype)
+{
+ struct rtnl_link **tab, *link;
+ int msgindex;
+
+ BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
+ msgindex = rtm_msgindex(msgtype);
+
+ rtnl_lock();
+ tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
+ if (!tab) {
+ rtnl_unlock();
+ return -ENOENT;
+ }
+
+ link = tab[msgindex];
+ rcu_assign_pointer(tab[msgindex], NULL);
+ rtnl_unlock();
+
+ kfree_rcu(link, rcu);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rtnl_unregister);
+
+/**
+ * rtnl_unregister_all - Unregister all rtnetlink message types of a protocol
+ * @protocol: Protocol family or PF_UNSPEC
+ *
+ * Identical to calling rtnl_unregister() for all registered message types
+ * of a certain protocol family.
+ */
+void rtnl_unregister_all(int protocol)
+{
+ struct rtnl_link **tab, *link;
+ int msgindex;
+
+ BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
+
+ rtnl_lock();
+ tab = rtnl_msg_handlers[protocol];
+ if (!tab) {
+ rtnl_unlock();
+ return;
+ }
+ RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL);
+ for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) {
+ link = tab[msgindex];
+ if (!link)
+ continue;
+
+ rcu_assign_pointer(tab[msgindex], NULL);
+ kfree_rcu(link, rcu);
+ }
+ rtnl_unlock();
+
+ synchronize_net();
+
+ kfree(tab);
+}
+EXPORT_SYMBOL_GPL(rtnl_unregister_all);
+
+static LIST_HEAD(link_ops);
+
+static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind)
+{
+ const struct rtnl_link_ops *ops;
+
+ list_for_each_entry(ops, &link_ops, list) {
+ if (!strcmp(ops->kind, kind))
+ return ops;
+ }
+ return NULL;
+}
+
+/**
+ * __rtnl_link_register - Register rtnl_link_ops with rtnetlink.
+ * @ops: struct rtnl_link_ops * to register
+ *
+ * The caller must hold the rtnl_mutex. This function should be used
+ * by drivers that create devices during module initialization. It
+ * must be called before registering the devices.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int __rtnl_link_register(struct rtnl_link_ops *ops)
+{
+ if (rtnl_link_ops_get(ops->kind))
+ return -EEXIST;
+
+ /* ops without ->setup cannot be used to create a device, so do
+ * not provide a default ->dellink for them either; leaving it
+ * NULL disables rtnl_dellink for this kind.
+ */
+ if (ops->setup && !ops->dellink)
+ ops->dellink = unregister_netdevice_queue;
+
+ list_add_tail(&ops->list, &link_ops);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__rtnl_link_register);
+
+/**
+ * rtnl_link_register - Register rtnl_link_ops with rtnetlink.
+ * @ops: struct rtnl_link_ops * to register
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int rtnl_link_register(struct rtnl_link_ops *ops)
+{
+ int err;
+
+ /* Sanity-check max sizes to avoid stack buffer overflow. */
+ if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE ||
+ ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE))
+ return -EINVAL;
+
+ rtnl_lock();
+ err = __rtnl_link_register(ops);
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL_GPL(rtnl_link_register);
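+
+/* Illustrative sketch (not part of the original file): a hypothetical
+ * "foo" virtual device driver would register its link type roughly as
+ *
+ *	static struct rtnl_link_ops foo_link_ops __read_mostly = {
+ *		.kind	= "foo",
+ *		.setup	= foo_setup,
+ *	};
+ *
+ *	err = rtnl_link_register(&foo_link_ops);
+ *
+ * and tear it down again with rtnl_link_unregister(&foo_link_ops).
+ */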
+
+static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
+{
+ struct net_device *dev;
+ LIST_HEAD(list_kill);
+
+ for_each_netdev(net, dev) {
+ if (dev->rtnl_link_ops == ops)
+ ops->dellink(dev, &list_kill);
+ }
+ unregister_netdevice_many(&list_kill);
+}
+
+/**
+ * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
+ * @ops: struct rtnl_link_ops * to unregister
+ *
+ * The caller must hold the rtnl_mutex and guarantee net_namespace_list
+ * integrity (hold pernet_ops_rwsem for writing to close the race
+ * with setup_net() and cleanup_net()).
+ */
+void __rtnl_link_unregister(struct rtnl_link_ops *ops)
+{
+ struct net *net;
+
+ for_each_net(net) {
+ __rtnl_kill_links(net, ops);
+ }
+ list_del(&ops->list);
+}
+EXPORT_SYMBOL_GPL(__rtnl_link_unregister);
+
+/* Return with the rtnl_lock held when there are no network
+ * devices unregistering in any network namespace.
+ */
+static void rtnl_lock_unregistering_all(void)
+{
+ struct net *net;
+ bool unregistering;
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ add_wait_queue(&netdev_unregistering_wq, &wait);
+ for (;;) {
+ unregistering = false;
+ rtnl_lock();
+ /* pernet_ops_rwsem is held for writing, so parallel
+ * setup_net() and cleanup_net() are not possible.
+ */
+ for_each_net(net) {
+ if (net->dev_unreg_count > 0) {
+ unregistering = true;
+ break;
+ }
+ }
+ if (!unregistering)
+ break;
+ __rtnl_unlock();
+
+ wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+ }
+ remove_wait_queue(&netdev_unregistering_wq, &wait);
+}
+
+/**
+ * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
+ * @ops: struct rtnl_link_ops * to unregister
+ */
+void rtnl_link_unregister(struct rtnl_link_ops *ops)
+{
+ /* Close the race with setup_net() and cleanup_net() */
+ down_write(&pernet_ops_rwsem);
+ rtnl_lock_unregistering_all();
+ __rtnl_link_unregister(ops);
+ rtnl_unlock();
+ up_write(&pernet_ops_rwsem);
+}
+EXPORT_SYMBOL_GPL(rtnl_link_unregister);
+
+static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev)
+{
+ struct net_device *master_dev;
+ const struct rtnl_link_ops *ops;
+ size_t size = 0;
+
+ rcu_read_lock();
+
+ master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev);
+ if (!master_dev)
+ goto out;
+
+ ops = master_dev->rtnl_link_ops;
+ if (!ops || !ops->get_slave_size)
+ goto out;
+ /* IFLA_INFO_SLAVE_DATA + nested data */
+ size = nla_total_size(sizeof(struct nlattr)) +
+ ops->get_slave_size(master_dev, dev);
+
+out:
+ rcu_read_unlock();
+ return size;
+}
+
+static size_t rtnl_link_get_size(const struct net_device *dev)
+{
+ const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+ size_t size;
+
+ if (!ops)
+ return 0;
+
+ size = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */
+ nla_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */
+
+ if (ops->get_size)
+ /* IFLA_INFO_DATA + nested data */
+ size += nla_total_size(sizeof(struct nlattr)) +
+ ops->get_size(dev);
+
+ if (ops->get_xstats_size)
+ /* IFLA_INFO_XSTATS */
+ size += nla_total_size(ops->get_xstats_size(dev));
+
+ size += rtnl_link_get_slave_info_data_size(dev);
+
+ return size;
+}
+
+static LIST_HEAD(rtnl_af_ops);
+
+static const struct rtnl_af_ops *rtnl_af_lookup(const int family)
+{
+ const struct rtnl_af_ops *ops;
+
+ list_for_each_entry_rcu(ops, &rtnl_af_ops, list) {
+ if (ops->family == family)
+ return ops;
+ }
+
+ return NULL;
+}
+
+/**
+ * rtnl_af_register - Register rtnl_af_ops with rtnetlink.
+ * @ops: struct rtnl_af_ops * to register
+ */
+void rtnl_af_register(struct rtnl_af_ops *ops)
+{
+ rtnl_lock();
+ list_add_tail_rcu(&ops->list, &rtnl_af_ops);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(rtnl_af_register);
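+
+/* Illustrative sketch (not part of the original file): an address family
+ * hooks its per-link attributes into IFLA_AF_SPEC with a static
+ * rtnl_af_ops; AF_FOO and the foo_* callbacks are hypothetical.
+ *
+ *	static struct rtnl_af_ops foo_af_ops __read_mostly = {
+ *		.family		  = AF_FOO,
+ *		.fill_link_af	  = foo_fill_link_af,
+ *		.get_link_af_size = foo_get_link_af_size,
+ *		.set_link_af	  = foo_set_link_af,
+ *	};
+ *
+ *	rtnl_af_register(&foo_af_ops);
+ */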
+
+/**
+ * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink.
+ * @ops: struct rtnl_af_ops * to unregister
+ */
+void rtnl_af_unregister(struct rtnl_af_ops *ops)
+{
+ rtnl_lock();
+ list_del_rcu(&ops->list);
+ rtnl_unlock();
+
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(rtnl_af_unregister);
+
+static size_t rtnl_link_get_af_size(const struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ struct rtnl_af_ops *af_ops;
+ size_t size;
+
+ /* IFLA_AF_SPEC */
+ size = nla_total_size(sizeof(struct nlattr));
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) {
+ if (af_ops->get_link_af_size) {
+ /* AF_* + nested data */
+ size += nla_total_size(sizeof(struct nlattr)) +
+ af_ops->get_link_af_size(dev, ext_filter_mask);
+ }
+ }
+ rcu_read_unlock();
+
+ return size;
+}
+
+static bool rtnl_have_link_slave_info(const struct net_device *dev)
+{
+ struct net_device *master_dev;
+ bool ret = false;
+
+ rcu_read_lock();
+
+ master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev);
+ if (master_dev && master_dev->rtnl_link_ops)
+ ret = true;
+ rcu_read_unlock();
+ return ret;
+}
+
+static int rtnl_link_slave_info_fill(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct net_device *master_dev;
+ const struct rtnl_link_ops *ops;
+ struct nlattr *slave_data;
+ int err;
+
+ master_dev = netdev_master_upper_dev_get((struct net_device *) dev);
+ if (!master_dev)
+ return 0;
+ ops = master_dev->rtnl_link_ops;
+ if (!ops)
+ return 0;
+ if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, ops->kind) < 0)
+ return -EMSGSIZE;
+ if (ops->fill_slave_info) {
+ slave_data = nla_nest_start(skb, IFLA_INFO_SLAVE_DATA);
+ if (!slave_data)
+ return -EMSGSIZE;
+ err = ops->fill_slave_info(skb, master_dev, dev);
+ if (err < 0)
+ goto err_cancel_slave_data;
+ nla_nest_end(skb, slave_data);
+ }
+ return 0;
+
+err_cancel_slave_data:
+ nla_nest_cancel(skb, slave_data);
+ return err;
+}
+
+static int rtnl_link_info_fill(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+ struct nlattr *data;
+ int err;
+
+ if (!ops)
+ return 0;
+ if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0)
+ return -EMSGSIZE;
+ if (ops->fill_xstats) {
+ err = ops->fill_xstats(skb, dev);
+ if (err < 0)
+ return err;
+ }
+ if (ops->fill_info) {
+ data = nla_nest_start(skb, IFLA_INFO_DATA);
+ if (data == NULL)
+ return -EMSGSIZE;
+ err = ops->fill_info(skb, dev);
+ if (err < 0)
+ goto err_cancel_data;
+ nla_nest_end(skb, data);
+ }
+ return 0;
+
+err_cancel_data:
+ nla_nest_cancel(skb, data);
+ return err;
+}
+
+static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct nlattr *linkinfo;
+ int err = -EMSGSIZE;
+
+ linkinfo = nla_nest_start(skb, IFLA_LINKINFO);
+ if (linkinfo == NULL)
+ goto out;
+
+ err = rtnl_link_info_fill(skb, dev);
+ if (err < 0)
+ goto err_cancel_link;
+
+ err = rtnl_link_slave_info_fill(skb, dev);
+ if (err < 0)
+ goto err_cancel_link;
+
+ nla_nest_end(skb, linkinfo);
+ return 0;
+
+err_cancel_link:
+ nla_nest_cancel(skb, linkinfo);
+out:
+ return err;
+}
+
+int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo)
+{
+ struct sock *rtnl = net->rtnl;
+ int err = 0;
+
+ NETLINK_CB(skb).dst_group = group;
+ if (echo)
+ refcount_inc(&skb->users);
+ netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL);
+ if (echo)
+ err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
+ return err;
+}
+
+int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid)
+{
+ struct sock *rtnl = net->rtnl;
+
+ return nlmsg_unicast(rtnl, skb, pid);
+}
+EXPORT_SYMBOL(rtnl_unicast);
+
+void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
+ struct nlmsghdr *nlh, gfp_t flags)
+{
+ struct sock *rtnl = net->rtnl;
+ int report = 0;
+
+ if (nlh)
+ report = nlmsg_report(nlh);
+
+ nlmsg_notify(rtnl, skb, pid, group, report, flags);
+}
+EXPORT_SYMBOL(rtnl_notify);
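+
+/* Illustrative use only (not part of the original file): callers build a
+ * notification skb and hand it to rtnl_notify() together with the group
+ * matching the message type; nlh may be NULL when there is no triggering
+ * request.
+ *
+ *	rtnl_notify(skb, net, portid, RTNLGRP_LINK, nlh, GFP_KERNEL);
+ */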
+
+void rtnl_set_sk_err(struct net *net, u32 group, int error)
+{
+ struct sock *rtnl = net->rtnl;
+
+ netlink_set_err(rtnl, 0, group, error);
+}
+EXPORT_SYMBOL(rtnl_set_sk_err);
+
+int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
+{
+ struct nlattr *mx;
+ int i, valid = 0;
+
+ mx = nla_nest_start(skb, RTA_METRICS);
+ if (mx == NULL)
+ return -ENOBUFS;
+
+ for (i = 0; i < RTAX_MAX; i++) {
+ if (metrics[i]) {
+ if (i == RTAX_CC_ALGO - 1) {
+ char tmp[TCP_CA_NAME_MAX], *name;
+
+ name = tcp_ca_get_name_by_key(metrics[i], tmp);
+ if (!name)
+ continue;
+ if (nla_put_string(skb, i + 1, name))
+ goto nla_put_failure;
+ } else if (i == RTAX_FEATURES - 1) {
+ u32 user_features = metrics[i] & RTAX_FEATURE_MASK;
+
+ if (!user_features)
+ continue;
+ BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK);
+ if (nla_put_u32(skb, i + 1, user_features))
+ goto nla_put_failure;
+ } else {
+ if (nla_put_u32(skb, i + 1, metrics[i]))
+ goto nla_put_failure;
+ }
+ valid++;
+ }
+ }
+
+ if (!valid) {
+ nla_nest_cancel(skb, mx);
+ return 0;
+ }
+
+ return nla_nest_end(skb, mx);
+
+nla_put_failure:
+ nla_nest_cancel(skb, mx);
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL(rtnetlink_put_metrics);
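+
+/* Illustrative use only (not part of the original file): a route dump
+ * handler passes its u32 metrics array, indexed by RTAX_* values minus
+ * one, and bails out on error:
+ *
+ *	if (rtnetlink_put_metrics(skb, metrics) < 0)
+ *		goto nla_put_failure;
+ *
+ * where metrics is hypothetical caller state.
+ */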
+
+int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
+ long expires, u32 error)
+{
+ struct rta_cacheinfo ci = {
+ .rta_error = error,
+ .rta_id = id,
+ };
+
+ if (dst) {
+ ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse);
+ ci.rta_used = dst->__use;
+ ci.rta_clntref = atomic_read(&dst->__refcnt);
+ }
+ if (expires) {
+ unsigned long clock;
+
+ clock = jiffies_to_clock_t(abs(expires));
+ clock = min_t(unsigned long, clock, INT_MAX);
+ ci.rta_expires = (expires > 0) ? clock : -clock;
+ }
+ return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci);
+}
+EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo);
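+
+/* Illustrative use only (not part of the original file): route dumpers
+ * pass the remaining lifetime in jiffies (0 for entries that never
+ * expire); rt, timeout and error are hypothetical caller state.
+ *
+ *	long expires = timeout ? (long)(timeout - jiffies) : 0;
+ *
+ *	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
+ *		goto nla_put_failure;
+ */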
+
+static void set_operstate(struct net_device *dev, unsigned char transition)
+{
+ unsigned char operstate = dev->operstate;
+
+ switch (transition) {
+ case IF_OPER_UP:
+ if ((operstate == IF_OPER_DORMANT ||
+ operstate == IF_OPER_UNKNOWN) &&
+ !netif_dormant(dev))
+ operstate = IF_OPER_UP;
+ break;
+
+ case IF_OPER_DORMANT:
+ if (operstate == IF_OPER_UP ||
+ operstate == IF_OPER_UNKNOWN)
+ operstate = IF_OPER_DORMANT;
+ break;
+ }
+
+ if (dev->operstate != operstate) {
+ write_lock_bh(&dev_base_lock);
+ dev->operstate = operstate;
+ write_unlock_bh(&dev_base_lock);
+ netdev_state_change(dev);
+ }
+}
+
+static unsigned int rtnl_dev_get_flags(const struct net_device *dev)
+{
+ return (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI)) |
+ (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI));
+}
+
+static unsigned int rtnl_dev_combine_flags(const struct net_device *dev,
+ const struct ifinfomsg *ifm)
+{
+ unsigned int flags = ifm->ifi_flags;
+
+ /* bugwards compatibility: ifi_change == 0 is treated as ~0 */
+ if (ifm->ifi_change)
+ flags = (flags & ifm->ifi_change) |
+ (rtnl_dev_get_flags(dev) & ~ifm->ifi_change);
+
+ return flags;
+}
+
+static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
+ const struct rtnl_link_stats64 *b)
+{
+ a->rx_packets = b->rx_packets;
+ a->tx_packets = b->tx_packets;
+ a->rx_bytes = b->rx_bytes;
+ a->tx_bytes = b->tx_bytes;
+ a->rx_errors = b->rx_errors;
+ a->tx_errors = b->tx_errors;
+ a->rx_dropped = b->rx_dropped;
+ a->tx_dropped = b->tx_dropped;
+
+ a->multicast = b->multicast;
+ a->collisions = b->collisions;
+
+ a->rx_length_errors = b->rx_length_errors;
+ a->rx_over_errors = b->rx_over_errors;
+ a->rx_crc_errors = b->rx_crc_errors;
+ a->rx_frame_errors = b->rx_frame_errors;
+ a->rx_fifo_errors = b->rx_fifo_errors;
+ a->rx_missed_errors = b->rx_missed_errors;
+
+ a->tx_aborted_errors = b->tx_aborted_errors;
+ a->tx_carrier_errors = b->tx_carrier_errors;
+ a->tx_fifo_errors = b->tx_fifo_errors;
+ a->tx_heartbeat_errors = b->tx_heartbeat_errors;
+ a->tx_window_errors = b->tx_window_errors;
+
+ a->rx_compressed = b->rx_compressed;
+ a->tx_compressed = b->tx_compressed;
+
+ a->rx_nohandler = b->rx_nohandler;
+}
+
+/* All VF info */
+static inline int rtnl_vfinfo_size(const struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) {
+ int num_vfs = dev_num_vf(dev->dev.parent);
+ size_t size = nla_total_size(0);
+ size += num_vfs *
+ (nla_total_size(0) +
+ nla_total_size(sizeof(struct ifla_vf_mac)) +
+ nla_total_size(sizeof(struct ifla_vf_vlan)) +
+ nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */
+ nla_total_size(MAX_VLAN_LIST_LEN *
+ sizeof(struct ifla_vf_vlan_info)) +
+ nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
+ nla_total_size(sizeof(struct ifla_vf_tx_rate)) +
+ nla_total_size(sizeof(struct ifla_vf_rate)) +
+ nla_total_size(sizeof(struct ifla_vf_link_state)) +
+ nla_total_size(sizeof(struct ifla_vf_rss_query_en)) +
+ nla_total_size(0) + /* nest IFLA_VF_STATS */
+ /* IFLA_VF_STATS_RX_PACKETS */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_TX_PACKETS */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_RX_BYTES */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_TX_BYTES */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_BROADCAST */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_MULTICAST */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_RX_DROPPED */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_TX_DROPPED */
+ nla_total_size_64bit(sizeof(__u64)) +
+ nla_total_size(sizeof(struct ifla_vf_trust)));
+ return size;
+ } else
+ return 0;
+}
+
+static size_t rtnl_port_size(const struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ size_t port_size = nla_total_size(4) /* PORT_VF */
+ + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */
+ + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */
+ + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */
+ + nla_total_size(1) /* PORT_VDP_REQUEST */
+ + nla_total_size(2); /* PORT_VDP_RESPONSE */
+ size_t vf_ports_size = nla_total_size(sizeof(struct nlattr));
+ size_t vf_port_size = nla_total_size(sizeof(struct nlattr))
+ + port_size;
+ size_t port_self_size = nla_total_size(sizeof(struct nlattr))
+ + port_size;
+
+ if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent ||
+ !(ext_filter_mask & RTEXT_FILTER_VF))
+ return 0;
+ if (dev_num_vf(dev->dev.parent))
+ return port_self_size + vf_ports_size +
+ vf_port_size * dev_num_vf(dev->dev.parent);
+ else
+ return port_self_size;
+}
+
+static size_t rtnl_xdp_size(void)
+{
+ size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */
+ nla_total_size(1) + /* XDP_ATTACHED */
+ nla_total_size(4) + /* XDP_PROG_ID (or 1st mode) */
+ nla_total_size(4); /* XDP_<mode>_PROG_ID */
+
+ return xdp_size;
+}
+
+static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+ + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+ + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */
+ + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */
+ + nla_total_size_64bit(sizeof(struct rtnl_link_ifmap))
+ + nla_total_size(sizeof(struct rtnl_link_stats))
+ + nla_total_size_64bit(sizeof(struct rtnl_link_stats64))
+ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+ + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */
+ + nla_total_size(4) /* IFLA_TXQLEN */
+ + nla_total_size(4) /* IFLA_WEIGHT */
+ + nla_total_size(4) /* IFLA_MTU */
+ + nla_total_size(4) /* IFLA_LINK */
+ + nla_total_size(4) /* IFLA_MASTER */
+ + nla_total_size(1) /* IFLA_CARRIER */
+ + nla_total_size(4) /* IFLA_PROMISCUITY */
+ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */
+ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
+ + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */
+ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */
+ + nla_total_size(1) /* IFLA_OPERSTATE */
+ + nla_total_size(1) /* IFLA_LINKMODE */
+ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */
+ + nla_total_size(4) /* IFLA_LINK_NETNSID */
+ + nla_total_size(4) /* IFLA_GROUP */
+ + nla_total_size(ext_filter_mask
+ & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */
+ + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
+ + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
+ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */
+ + rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */
+ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */
+ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */
+ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */
+ + rtnl_xdp_size() /* IFLA_XDP */
+ + nla_total_size(4) /* IFLA_EVENT */
+ + nla_total_size(4) /* IFLA_NEW_NETNSID */
+ + nla_total_size(4) /* IFLA_NEW_IFINDEX */
+ + nla_total_size(1) /* IFLA_PROTO_DOWN */
+ + nla_total_size(4) /* IFLA_IF_NETNSID */
+ + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */
+ + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */
+ + nla_total_size(4) /* IFLA_MIN_MTU */
+ + nla_total_size(4) /* IFLA_MAX_MTU */
+ + 0;
+}
+
+static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ struct nlattr *vf_ports;
+ struct nlattr *vf_port;
+ int vf;
+ int err;
+
+ vf_ports = nla_nest_start(skb, IFLA_VF_PORTS);
+ if (!vf_ports)
+ return -EMSGSIZE;
+
+ for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) {
+ vf_port = nla_nest_start(skb, IFLA_VF_PORT);
+ if (!vf_port)
+ goto nla_put_failure;
+ if (nla_put_u32(skb, IFLA_PORT_VF, vf))
+ goto nla_put_failure;
+ err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb);
+ if (err == -EMSGSIZE)
+ goto nla_put_failure;
+ if (err) {
+ nla_nest_cancel(skb, vf_port);
+ continue;
+ }
+ nla_nest_end(skb, vf_port);
+ }
+
+ nla_nest_end(skb, vf_ports);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, vf_ports);
+ return -EMSGSIZE;
+}
+
+static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ struct nlattr *port_self;
+ int err;
+
+ port_self = nla_nest_start(skb, IFLA_PORT_SELF);
+ if (!port_self)
+ return -EMSGSIZE;
+
+ err = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb);
+ if (err) {
+ nla_nest_cancel(skb, port_self);
+ return (err == -EMSGSIZE) ? err : 0;
+ }
+
+ nla_nest_end(skb, port_self);
+
+ return 0;
+}
+
+static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ int err;
+
+ if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent ||
+ !(ext_filter_mask & RTEXT_FILTER_VF))
+ return 0;
+
+ err = rtnl_port_self_fill(skb, dev);
+ if (err)
+ return err;
+
+ if (dev_num_vf(dev->dev.parent)) {
+ err = rtnl_vf_ports_fill(skb, dev);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ int err;
+ struct netdev_phys_item_id ppid;
+
+ err = dev_get_phys_port_id(dev, &ppid);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ char name[IFNAMSIZ];
+ int err;
+
+ err = dev_get_phys_port_name(dev, name, sizeof(name));
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ if (nla_put_string(skb, IFLA_PHYS_PORT_NAME, name))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ int err;
+ struct switchdev_attr attr = {
+ .orig_dev = dev,
+ .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
+ .flags = SWITCHDEV_F_NO_RECURSE,
+ };
+
+ err = switchdev_port_attr_get(dev, &attr);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ if (nla_put(skb, IFLA_PHYS_SWITCH_ID, attr.u.ppid.id_len,
+ attr.u.ppid.id))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct rtnl_link_stats64 *sp;
+ struct nlattr *attr;
+
+ attr = nla_reserve_64bit(skb, IFLA_STATS64,
+ sizeof(struct rtnl_link_stats64), IFLA_PAD);
+ if (!attr)
+ return -EMSGSIZE;
+
+ sp = nla_data(attr);
+ dev_get_stats(dev, sp);
+
+ attr = nla_reserve(skb, IFLA_STATS,
+ sizeof(struct rtnl_link_stats));
+ if (!attr)
+ return -EMSGSIZE;
+
+ copy_rtnl_link_stats(nla_data(attr), sp);
+
+ return 0;
+}
+
+static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
+ struct net_device *dev,
+ int vfs_num,
+ struct nlattr *vfinfo)
+{
+ struct ifla_vf_rss_query_en vf_rss_query_en;
+ struct nlattr *vf, *vfstats, *vfvlanlist;
+ struct ifla_vf_link_state vf_linkstate;
+ struct ifla_vf_vlan_info vf_vlan_info;
+ struct ifla_vf_spoofchk vf_spoofchk;
+ struct ifla_vf_tx_rate vf_tx_rate;
+ struct ifla_vf_stats vf_stats;
+ struct ifla_vf_trust vf_trust;
+ struct ifla_vf_vlan vf_vlan;
+ struct ifla_vf_rate vf_rate;
+ struct ifla_vf_mac vf_mac;
+ struct ifla_vf_info ivi;
+
+ memset(&ivi, 0, sizeof(ivi));
+
+ /* Not all SR-IOV capable drivers support the
+ * spoofcheck and "RSS query enable" query. Preset to
+ * -1 so the user space tool can detect that the driver
+ * didn't report anything.
+ */
+ ivi.spoofchk = -1;
+ ivi.rss_query_en = -1;
+ ivi.trusted = -1;
+ /* The default value for VF link state is "auto"
+ * (IFLA_VF_LINK_STATE_AUTO), which equals zero.
+ */
+ ivi.linkstate = 0;
+ /* VLAN Protocol by default is 802.1Q */
+ ivi.vlan_proto = htons(ETH_P_8021Q);
+ if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi))
+ return 0;
+
+ memset(&vf_vlan_info, 0, sizeof(vf_vlan_info));
+
+ vf_mac.vf =
+ vf_vlan.vf =
+ vf_vlan_info.vf =
+ vf_rate.vf =
+ vf_tx_rate.vf =
+ vf_spoofchk.vf =
+ vf_linkstate.vf =
+ vf_rss_query_en.vf =
+ vf_trust.vf = ivi.vf;
+
+ memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
+ vf_vlan.vlan = ivi.vlan;
+ vf_vlan.qos = ivi.qos;
+ vf_vlan_info.vlan = ivi.vlan;
+ vf_vlan_info.qos = ivi.qos;
+ vf_vlan_info.vlan_proto = ivi.vlan_proto;
+ vf_tx_rate.rate = ivi.max_tx_rate;
+ vf_rate.min_tx_rate = ivi.min_tx_rate;
+ vf_rate.max_tx_rate = ivi.max_tx_rate;
+ vf_spoofchk.setting = ivi.spoofchk;
+ vf_linkstate.link_state = ivi.linkstate;
+ vf_rss_query_en.setting = ivi.rss_query_en;
+ vf_trust.setting = ivi.trusted;
+ vf = nla_nest_start(skb, IFLA_VF_INFO);
+ if (!vf)
+ goto nla_put_vfinfo_failure;
+ if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
+ nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
+ nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
+ &vf_rate) ||
+ nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate),
+ &vf_tx_rate) ||
+ nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk),
+ &vf_spoofchk) ||
+ nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate),
+ &vf_linkstate) ||
+ nla_put(skb, IFLA_VF_RSS_QUERY_EN,
+ sizeof(vf_rss_query_en),
+ &vf_rss_query_en) ||
+ nla_put(skb, IFLA_VF_TRUST,
+ sizeof(vf_trust), &vf_trust))
+ goto nla_put_vf_failure;
+ vfvlanlist = nla_nest_start(skb, IFLA_VF_VLAN_LIST);
+ if (!vfvlanlist)
+ goto nla_put_vf_failure;
+ if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info),
+ &vf_vlan_info)) {
+ nla_nest_cancel(skb, vfvlanlist);
+ goto nla_put_vf_failure;
+ }
+ nla_nest_end(skb, vfvlanlist);
+ memset(&vf_stats, 0, sizeof(vf_stats));
+ if (dev->netdev_ops->ndo_get_vf_stats)
+ dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num,
+ &vf_stats);
+ vfstats = nla_nest_start(skb, IFLA_VF_STATS);
+ if (!vfstats)
+ goto nla_put_vf_failure;
+ if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS,
+ vf_stats.rx_packets, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS,
+ vf_stats.tx_packets, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES,
+ vf_stats.rx_bytes, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES,
+ vf_stats.tx_bytes, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
+ vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
+ vf_stats.multicast, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED,
+ vf_stats.rx_dropped, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED,
+ vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) {
+ nla_nest_cancel(skb, vfstats);
+ goto nla_put_vf_failure;
+ }
+ nla_nest_end(skb, vfstats);
+ nla_nest_end(skb, vf);
+ return 0;
+
+nla_put_vf_failure:
+ nla_nest_cancel(skb, vf);
+nla_put_vfinfo_failure:
+ nla_nest_cancel(skb, vfinfo);
+ return -EMSGSIZE;
+}
+
+static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb,
+ struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ struct nlattr *vfinfo;
+ int i, num_vfs;
+
+ if (!dev->dev.parent || ((ext_filter_mask & RTEXT_FILTER_VF) == 0))
+ return 0;
+
+ num_vfs = dev_num_vf(dev->dev.parent);
+ if (nla_put_u32(skb, IFLA_NUM_VF, num_vfs))
+ return -EMSGSIZE;
+
+ if (!dev->netdev_ops->ndo_get_vf_config)
+ return 0;
+
+ vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST);
+ if (!vfinfo)
+ return -EMSGSIZE;
+
+ for (i = 0; i < num_vfs; i++) {
+ if (rtnl_fill_vfinfo(skb, dev, i, vfinfo))
+ return -EMSGSIZE;
+ }
+
+ nla_nest_end(skb, vfinfo);
+ return 0;
+}
+
+static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
+{
+ struct rtnl_link_ifmap map;
+
+ memset(&map, 0, sizeof(map));
+ map.mem_start = dev->mem_start;
+ map.mem_end = dev->mem_end;
+ map.base_addr = dev->base_addr;
+ map.irq = dev->irq;
+ map.dma = dev->dma;
+ map.port = dev->if_port;
+
+ if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static u32 rtnl_xdp_prog_skb(struct net_device *dev)
+{
+ const struct bpf_prog *generic_xdp_prog;
+
+ ASSERT_RTNL();
+
+ generic_xdp_prog = rtnl_dereference(dev->xdp_prog);
+ if (!generic_xdp_prog)
+ return 0;
+ return generic_xdp_prog->aux->id;
+}
+
+static u32 rtnl_xdp_prog_drv(struct net_device *dev)
+{
+ return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG);
+}
+
+static u32 rtnl_xdp_prog_hw(struct net_device *dev)
+{
+ return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf,
+ XDP_QUERY_PROG_HW);
+}
+
+static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev,
+ u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr,
+ u32 (*get_prog_id)(struct net_device *dev))
+{
+ u32 curr_id;
+ int err;
+
+ curr_id = get_prog_id(dev);
+ if (!curr_id)
+ return 0;
+
+ *prog_id = curr_id;
+ err = nla_put_u32(skb, attr, curr_id);
+ if (err)
+ return err;
+
+ if (*mode != XDP_ATTACHED_NONE)
+ *mode = XDP_ATTACHED_MULTI;
+ else
+ *mode = tgt_mode;
+
+ return 0;
+}
+
+static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ struct nlattr *xdp;
+ u32 prog_id;
+ int err;
+ u8 mode;
+
+ xdp = nla_nest_start(skb, IFLA_XDP);
+ if (!xdp)
+ return -EMSGSIZE;
+
+ prog_id = 0;
+ mode = XDP_ATTACHED_NONE;
+ err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB,
+ IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb);
+ if (err)
+ goto err_cancel;
+ err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV,
+ IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv);
+ if (err)
+ goto err_cancel;
+ err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW,
+ IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw);
+ if (err)
+ goto err_cancel;
+
+ err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode);
+ if (err)
+ goto err_cancel;
+
+ if (prog_id && mode != XDP_ATTACHED_MULTI) {
+ err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id);
+ if (err)
+ goto err_cancel;
+ }
+
+ nla_nest_end(skb, xdp);
+ return 0;
+
+err_cancel:
+ nla_nest_cancel(skb, xdp);
+ return err;
+}
+
+static u32 rtnl_get_event(unsigned long event)
+{
+ u32 rtnl_event_type = IFLA_EVENT_NONE;
+
+ switch (event) {
+ case NETDEV_REBOOT:
+ rtnl_event_type = IFLA_EVENT_REBOOT;
+ break;
+ case NETDEV_FEAT_CHANGE:
+ rtnl_event_type = IFLA_EVENT_FEATURES;
+ break;
+ case NETDEV_BONDING_FAILOVER:
+ rtnl_event_type = IFLA_EVENT_BONDING_FAILOVER;
+ break;
+ case NETDEV_NOTIFY_PEERS:
+ rtnl_event_type = IFLA_EVENT_NOTIFY_PEERS;
+ break;
+ case NETDEV_RESEND_IGMP:
+ rtnl_event_type = IFLA_EVENT_IGMP_RESEND;
+ break;
+ case NETDEV_CHANGEINFODATA:
+ rtnl_event_type = IFLA_EVENT_BONDING_OPTIONS;
+ break;
+ default:
+ break;
+ }
+
+ return rtnl_event_type;
+}
+
+static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev)
+{
+ const struct net_device *upper_dev;
+ int ret = 0;
+
+ rcu_read_lock();
+
+ upper_dev = netdev_master_upper_dev_get_rcu(dev);
+ if (upper_dev)
+ ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex);
+
+ rcu_read_unlock();
+ return ret;
+}
+
+static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev,
+ bool force)
+{
+ int ifindex = dev_get_iflink(dev);
+
+ if (force || dev->ifindex != ifindex)
+ return nla_put_u32(skb, IFLA_LINK, ifindex);
+
+ return 0;
+}
+
+static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ char buf[IFALIASZ];
+ int ret;
+
+ ret = dev_get_alias(dev, buf, sizeof(buf));
+ return ret > 0 ? nla_put_string(skb, IFLA_IFALIAS, buf) : 0;
+}
+
+static int rtnl_fill_link_netnsid(struct sk_buff *skb,
+ const struct net_device *dev,
+ struct net *src_net, gfp_t gfp)
+{
+ bool put_iflink = false;
+
+ if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) {
+ struct net *link_net = dev->rtnl_link_ops->get_link_net(dev);
+
+ if (!net_eq(dev_net(dev), link_net)) {
+ int id = peernet2id_alloc(src_net, link_net, gfp);
+
+ if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
+ return -EMSGSIZE;
+
+ put_iflink = true;
+ }
+ }
+
+ return nla_put_iflink(skb, dev, put_iflink);
+}
+
+static int rtnl_fill_link_af(struct sk_buff *skb,
+ const struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ const struct rtnl_af_ops *af_ops;
+ struct nlattr *af_spec;
+
+ af_spec = nla_nest_start(skb, IFLA_AF_SPEC);
+ if (!af_spec)
+ return -EMSGSIZE;
+
+ list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) {
+ struct nlattr *af;
+ int err;
+
+ if (!af_ops->fill_link_af)
+ continue;
+
+ af = nla_nest_start(skb, af_ops->family);
+ if (!af)
+ return -EMSGSIZE;
+
+ err = af_ops->fill_link_af(skb, dev, ext_filter_mask);
+ /*
+ * The callback may return -ENODATA to indicate that there
+ * was no data to be dumped. This is not an error; it
+ * means we should trim the attribute header and
+ * continue.
+ */
+ if (err == -ENODATA)
+ nla_nest_cancel(skb, af);
+ else if (err < 0)
+ return -EMSGSIZE;
+
+ nla_nest_end(skb, af);
+ }
+
+ nla_nest_end(skb, af_spec);
+ return 0;
+}
+
+static int rtnl_fill_ifinfo(struct sk_buff *skb,
+ struct net_device *dev, struct net *src_net,
+ int type, u32 pid, u32 seq, u32 change,
+ unsigned int flags, u32 ext_filter_mask,
+ u32 event, int *new_nsid, int new_ifindex,
+ int tgt_netnsid, gfp_t gfp)
+{
+ struct ifinfomsg *ifm;
+ struct nlmsghdr *nlh;
+
+ ASSERT_RTNL();
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ifm = nlmsg_data(nlh);
+ ifm->ifi_family = AF_UNSPEC;
+ ifm->__ifi_pad = 0;
+ ifm->ifi_type = dev->type;
+ ifm->ifi_index = dev->ifindex;
+ ifm->ifi_flags = dev_get_flags(dev);
+ ifm->ifi_change = change;
+
+ if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_IF_NETNSID, tgt_netnsid))
+ goto nla_put_failure;
+
+ if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
+ nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) ||
+ nla_put_u8(skb, IFLA_OPERSTATE,
+ netif_running(dev) ? dev->operstate : IF_OPER_DOWN) ||
+ nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) ||
+ nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
+ nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) ||
+ nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) ||
+ nla_put_u32(skb, IFLA_GROUP, dev->group) ||
+ nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) ||
+ nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) ||
+ nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) ||
+ nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) ||
+#ifdef CONFIG_RPS
+ nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) ||
+#endif
+ put_master_ifindex(skb, dev) ||
+ nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) ||
+ (dev->qdisc &&
+ nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) ||
+ nla_put_ifalias(skb, dev) ||
+ nla_put_u32(skb, IFLA_CARRIER_CHANGES,
+ atomic_read(&dev->carrier_up_count) +
+ atomic_read(&dev->carrier_down_count)) ||
+ nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down) ||
+ nla_put_u32(skb, IFLA_CARRIER_UP_COUNT,
+ atomic_read(&dev->carrier_up_count)) ||
+ nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT,
+ atomic_read(&dev->carrier_down_count)))
+ goto nla_put_failure;
+
+ if (event != IFLA_EVENT_NONE) {
+ if (nla_put_u32(skb, IFLA_EVENT, event))
+ goto nla_put_failure;
+ }
+
+ if (rtnl_fill_link_ifmap(skb, dev))
+ goto nla_put_failure;
+
+ if (dev->addr_len) {
+ if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) ||
+ nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast))
+ goto nla_put_failure;
+ }
+
+ if (rtnl_phys_port_id_fill(skb, dev))
+ goto nla_put_failure;
+
+ if (rtnl_phys_port_name_fill(skb, dev))
+ goto nla_put_failure;
+
+ if (rtnl_phys_switch_id_fill(skb, dev))
+ goto nla_put_failure;
+
+ if (rtnl_fill_stats(skb, dev))
+ goto nla_put_failure;
+
+ if (rtnl_fill_vf(skb, dev, ext_filter_mask))
+ goto nla_put_failure;
+
+ if (rtnl_port_fill(skb, dev, ext_filter_mask))
+ goto nla_put_failure;
+
+ if (rtnl_xdp_fill(skb, dev))
+ goto nla_put_failure;
+
+ if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) {
+ if (rtnl_link_fill(skb, dev) < 0)
+ goto nla_put_failure;
+ }
+
+ if (rtnl_fill_link_netnsid(skb, dev, src_net, gfp))
+ goto nla_put_failure;
+
+ if (new_nsid &&
+ nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0)
+ goto nla_put_failure;
+ if (new_ifindex &&
+ nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0)
+ goto nla_put_failure;
+
+
+ rcu_read_lock();
+ if (rtnl_fill_link_af(skb, dev, ext_filter_mask))
+ goto nla_put_failure_rcu;
+ rcu_read_unlock();
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure_rcu:
+ rcu_read_unlock();
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
+ [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 },
+ [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
+ [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
+ [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) },
+ [IFLA_MTU] = { .type = NLA_U32 },
+ [IFLA_LINK] = { .type = NLA_U32 },
+ [IFLA_MASTER] = { .type = NLA_U32 },
+ [IFLA_CARRIER] = { .type = NLA_U8 },
+ [IFLA_TXQLEN] = { .type = NLA_U32 },
+ [IFLA_WEIGHT] = { .type = NLA_U32 },
+ [IFLA_OPERSTATE] = { .type = NLA_U8 },
+ [IFLA_LINKMODE] = { .type = NLA_U8 },
+ [IFLA_LINKINFO] = { .type = NLA_NESTED },
+ [IFLA_NET_NS_PID] = { .type = NLA_U32 },
+ [IFLA_NET_NS_FD] = { .type = NLA_U32 },
+ /* IFLA_IFALIAS is a string, but policy is set to NLA_BINARY to
+ * allow 0-length string (needed to remove an alias).
+ */
+ [IFLA_IFALIAS] = { .type = NLA_BINARY, .len = IFALIASZ - 1 },
+ [IFLA_VFINFO_LIST] = { .type = NLA_NESTED },
+ [IFLA_VF_PORTS] = { .type = NLA_NESTED },
+ [IFLA_PORT_SELF] = { .type = NLA_NESTED },
+ [IFLA_AF_SPEC] = { .type = NLA_NESTED },
+ [IFLA_EXT_MASK] = { .type = NLA_U32 },
+ [IFLA_PROMISCUITY] = { .type = NLA_U32 },
+ [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 },
+ [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 },
+ [IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 },
+ [IFLA_GSO_MAX_SIZE] = { .type = NLA_U32 },
+ [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
+ [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */
+ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
+ [IFLA_LINK_NETNSID] = { .type = NLA_S32 },
+ [IFLA_PROTO_DOWN] = { .type = NLA_U8 },
+ [IFLA_XDP] = { .type = NLA_NESTED },
+ [IFLA_EVENT] = { .type = NLA_U32 },
+ [IFLA_GROUP] = { .type = NLA_U32 },
+ [IFLA_IF_NETNSID] = { .type = NLA_S32 },
+ [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 },
+ [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 },
+ [IFLA_MIN_MTU] = { .type = NLA_U32 },
+ [IFLA_MAX_MTU] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
+ [IFLA_INFO_KIND] = { .type = NLA_STRING },
+ [IFLA_INFO_DATA] = { .type = NLA_NESTED },
+ [IFLA_INFO_SLAVE_KIND] = { .type = NLA_STRING },
+ [IFLA_INFO_SLAVE_DATA] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
+ [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) },
+ [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) },
+ [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED },
+ [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) },
+ [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) },
+ [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) },
+ [IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) },
+ [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) },
+ [IFLA_VF_STATS] = { .type = NLA_NESTED },
+ [IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) },
+ [IFLA_VF_IB_NODE_GUID] = { .len = sizeof(struct ifla_vf_guid) },
+ [IFLA_VF_IB_PORT_GUID] = { .len = sizeof(struct ifla_vf_guid) },
+};
+
+static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
+ [IFLA_PORT_VF] = { .type = NLA_U32 },
+ [IFLA_PORT_PROFILE] = { .type = NLA_STRING,
+ .len = PORT_PROFILE_MAX },
+ [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY,
+ .len = PORT_UUID_MAX },
+ [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING,
+ .len = PORT_UUID_MAX },
+ [IFLA_PORT_REQUEST] = { .type = NLA_U8, },
+ [IFLA_PORT_RESPONSE] = { .type = NLA_U16, },
+
+ /* Unused, but we need to keep it here since user space could
+ * fill it. It's also broken with regard to NLA_BINARY use in
+ * combination with structs.
+ */
+ [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_port_vsi) },
+};
+
+static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
+ [IFLA_XDP_FD] = { .type = NLA_S32 },
+ [IFLA_XDP_ATTACHED] = { .type = NLA_U8 },
+ [IFLA_XDP_FLAGS] = { .type = NLA_U32 },
+ [IFLA_XDP_PROG_ID] = { .type = NLA_U32 },
+};
+
+static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
+{
+ const struct rtnl_link_ops *ops = NULL;
+ struct nlattr *linfo[IFLA_INFO_MAX + 1];
+
+ if (nla_parse_nested(linfo, IFLA_INFO_MAX, nla,
+ ifla_info_policy, NULL) < 0)
+ return NULL;
+
+ if (linfo[IFLA_INFO_KIND]) {
+ char kind[MODULE_NAME_LEN];
+
+ nla_strlcpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind));
+ ops = rtnl_link_ops_get(kind);
+ }
+
+ return ops;
+}
+
+static bool link_master_filtered(struct net_device *dev, int master_idx)
+{
+ struct net_device *master;
+
+ if (!master_idx)
+ return false;
+
+ master = netdev_master_upper_dev_get(dev);
+ if (!master || master->ifindex != master_idx)
+ return true;
+
+ return false;
+}
+
+static bool link_kind_filtered(const struct net_device *dev,
+ const struct rtnl_link_ops *kind_ops)
+{
+ if (kind_ops && dev->rtnl_link_ops != kind_ops)
+ return true;
+
+ return false;
+}
+
+static bool link_dump_filtered(struct net_device *dev,
+ int master_idx,
+ const struct rtnl_link_ops *kind_ops)
+{
+ if (link_master_filtered(dev, master_idx) ||
+ link_kind_filtered(dev, kind_ops))
+ return true;
+
+ return false;
+}
+
+static struct net *get_target_net(struct sock *sk, int netnsid)
+{
+ struct net *net;
+
+ net = get_net_ns_by_id(sock_net(sk), netnsid);
+ if (!net)
+ return ERR_PTR(-EINVAL);
+
+ /* For now, the caller is required to have CAP_NET_ADMIN in
+ * the user namespace owning the target net ns.
+ */
+ if (!sk_ns_capable(sk, net->user_ns, CAP_NET_ADMIN)) {
+ put_net(net);
+ return ERR_PTR(-EACCES);
+ }
+ return net;
+}
+
+static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct net *tgt_net = net;
+ int h, s_h;
+ int idx = 0, s_idx;
+ struct net_device *dev;
+ struct hlist_head *head;
+ struct nlattr *tb[IFLA_MAX+1];
+ u32 ext_filter_mask = 0;
+ const struct rtnl_link_ops *kind_ops = NULL;
+ unsigned int flags = NLM_F_MULTI;
+ int master_idx = 0;
+ int netnsid = -1;
+ int err;
+ int hdrlen;
+
+ s_h = cb->args[0];
+ s_idx = cb->args[1];
+
+ /* A hack to preserve kernel<->userspace interface.
+ * The correct header is ifinfomsg. It is consistent with rtnl_getlink.
+ * However, before Linux v3.9 the code here assumed rtgenmsg and that's
+ * what iproute2 < v3.9.0 used.
+ * We can detect the old iproute2. Even including the IFLA_EXT_MASK
+ * attribute, its netlink message is shorter than struct ifinfomsg.
+ */
+ hdrlen = nlmsg_len(cb->nlh) < sizeof(struct ifinfomsg) ?
+ sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
+
+ if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX,
+ ifla_policy, NULL) >= 0) {
+ if (tb[IFLA_IF_NETNSID]) {
+ netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
+ tgt_net = get_target_net(skb->sk, netnsid);
+ if (IS_ERR(tgt_net))
+ return PTR_ERR(tgt_net);
+ }
+
+ if (tb[IFLA_EXT_MASK])
+ ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+
+ if (tb[IFLA_MASTER])
+ master_idx = nla_get_u32(tb[IFLA_MASTER]);
+
+ if (tb[IFLA_LINKINFO])
+ kind_ops = linkinfo_to_kind_ops(tb[IFLA_LINKINFO]);
+
+ if (master_idx || kind_ops)
+ flags |= NLM_F_DUMP_FILTERED;
+ }
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &tgt_net->dev_index_head[h];
+ hlist_for_each_entry(dev, head, index_hlist) {
+ if (link_dump_filtered(dev, master_idx, kind_ops))
+ goto cont;
+ if (idx < s_idx)
+ goto cont;
+ err = rtnl_fill_ifinfo(skb, dev, net,
+ RTM_NEWLINK,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, 0,
+ flags,
+ ext_filter_mask, 0, NULL, 0,
+ netnsid, GFP_KERNEL);
+
+ if (err < 0) {
+ if (likely(skb->len))
+ goto out;
+
+ goto out_err;
+ }
+cont:
+ idx++;
+ }
+ }
+out:
+ err = skb->len;
+out_err:
+ cb->args[1] = idx;
+ cb->args[0] = h;
+ cb->seq = net->dev_base_seq;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ if (netnsid >= 0)
+ put_net(tgt_net);
+
+ return err;
+}
+
+int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len,
+ struct netlink_ext_ack *exterr)
+{
+ return nla_parse(tb, IFLA_MAX, head, len, ifla_policy, exterr);
+}
+EXPORT_SYMBOL(rtnl_nla_parse_ifla);
+
+struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
+{
+ struct net *net;
+ /* Examine the link attributes and figure out which
+ * network namespace we are talking about.
+ */
+ if (tb[IFLA_NET_NS_PID])
+ net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
+ else if (tb[IFLA_NET_NS_FD])
+ net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD]));
+ else
+ net = get_net(src_net);
+ return net;
+}
+EXPORT_SYMBOL(rtnl_link_get_net);
+
+/* Figure out which network namespace we are talking about by
+ * examining the link attributes in the following order:
+ *
+ * 1. IFLA_NET_NS_PID
+ * 2. IFLA_NET_NS_FD
+ * 3. IFLA_IF_NETNSID
+ */
+static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net,
+ struct nlattr *tb[])
+{
+ struct net *net;
+
+ if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])
+ return rtnl_link_get_net(src_net, tb);
+
+ if (!tb[IFLA_IF_NETNSID])
+ return get_net(src_net);
+
+ net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_IF_NETNSID]));
+ if (!net)
+ return ERR_PTR(-EINVAL);
+
+ return net;
+}
+
+static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb,
+ struct net *src_net,
+ struct nlattr *tb[], int cap)
+{
+ struct net *net;
+
+ net = rtnl_link_get_net_by_nlattr(src_net, tb);
+ if (IS_ERR(net))
+ return net;
+
+ if (!netlink_ns_capable(skb, net->user_ns, cap)) {
+ put_net(net);
+ return ERR_PTR(-EPERM);
+ }
+
+ return net;
+}
+
+/* Verify that rtnetlink requests do not pass additional properties
+ * potentially referring to different network namespaces.
+ */
+static int rtnl_ensure_unique_netns(struct nlattr *tb[],
+ struct netlink_ext_ack *extack,
+ bool netns_id_only)
+{
+
+ if (netns_id_only) {
+ if (!tb[IFLA_NET_NS_PID] && !tb[IFLA_NET_NS_FD])
+ return 0;
+
+ NL_SET_ERR_MSG(extack, "specified netns attribute not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (tb[IFLA_IF_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]))
+ goto invalid_attr;
+
+ if (tb[IFLA_NET_NS_PID] && (tb[IFLA_IF_NETNSID] || tb[IFLA_NET_NS_FD]))
+ goto invalid_attr;
+
+ if (tb[IFLA_NET_NS_FD] && (tb[IFLA_IF_NETNSID] || tb[IFLA_NET_NS_PID]))
+ goto invalid_attr;
+
+ return 0;
+
+invalid_attr:
+ NL_SET_ERR_MSG(extack, "multiple netns identifying attributes specified");
+ return -EINVAL;
+}
+
+static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
+{
+ if (dev) {
+ if (tb[IFLA_ADDRESS] &&
+ nla_len(tb[IFLA_ADDRESS]) < dev->addr_len)
+ return -EINVAL;
+
+ if (tb[IFLA_BROADCAST] &&
+ nla_len(tb[IFLA_BROADCAST]) < dev->addr_len)
+ return -EINVAL;
+ }
+
+ if (tb[IFLA_AF_SPEC]) {
+ struct nlattr *af;
+ int rem, err;
+
+ nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
+ const struct rtnl_af_ops *af_ops;
+
+ rcu_read_lock();
+ af_ops = rtnl_af_lookup(nla_type(af));
+ if (!af_ops) {
+ rcu_read_unlock();
+ return -EAFNOSUPPORT;
+ }
+
+ if (!af_ops->set_link_af) {
+ rcu_read_unlock();
+ return -EOPNOTSUPP;
+ }
+
+ if (af_ops->validate_link_af) {
+ err = af_ops->validate_link_af(dev, af);
+ if (err < 0) {
+ rcu_read_unlock();
+ return err;
+ }
+ }
+
+ rcu_read_unlock();
+ }
+ }
+
+ return 0;
+}
+
+static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt,
+ int guid_type)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ return ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid, guid_type);
+}
+
+static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type)
+{
+ if (dev->type != ARPHRD_INFINIBAND)
+ return -EOPNOTSUPP;
+
+ return handle_infiniband_guid(dev, ivt, guid_type);
+}
+
+static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int err = -EINVAL;
+
+ if (tb[IFLA_VF_MAC]) {
+ struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]);
+
+ if (ivm->vf >= INT_MAX)
+ return -EINVAL;
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_mac)
+ err = ops->ndo_set_vf_mac(dev, ivm->vf,
+ ivm->mac);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_VLAN]) {
+ struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]);
+
+ if (ivv->vf >= INT_MAX)
+ return -EINVAL;
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_vlan)
+ err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan,
+ ivv->qos,
+ htons(ETH_P_8021Q));
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_VLAN_LIST]) {
+ struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN];
+ struct nlattr *attr;
+ int rem, len = 0;
+
+ err = -EOPNOTSUPP;
+ if (!ops->ndo_set_vf_vlan)
+ return err;
+
+ nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) {
+ if (nla_type(attr) != IFLA_VF_VLAN_INFO ||
+ nla_len(attr) < NLA_HDRLEN) {
+ return -EINVAL;
+ }
+ if (len >= MAX_VLAN_LIST_LEN)
+ return -EOPNOTSUPP;
+ ivvl[len] = nla_data(attr);
+
+ len++;
+ }
+ if (len == 0)
+ return -EINVAL;
+
+ if (ivvl[0]->vf >= INT_MAX)
+ return -EINVAL;
+ err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan,
+ ivvl[0]->qos, ivvl[0]->vlan_proto);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_TX_RATE]) {
+ struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]);
+ struct ifla_vf_info ivf;
+
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
+ err = -EOPNOTSUPP;
+ if (ops->ndo_get_vf_config)
+ err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf);
+ if (err < 0)
+ return err;
+
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_rate)
+ err = ops->ndo_set_vf_rate(dev, ivt->vf,
+ ivf.min_tx_rate,
+ ivt->rate);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_RATE]) {
+ struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]);
+
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_rate)
+ err = ops->ndo_set_vf_rate(dev, ivt->vf,
+ ivt->min_tx_rate,
+ ivt->max_tx_rate);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_SPOOFCHK]) {
+ struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]);
+
+ if (ivs->vf >= INT_MAX)
+ return -EINVAL;
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_spoofchk)
+ err = ops->ndo_set_vf_spoofchk(dev, ivs->vf,
+ ivs->setting);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_LINK_STATE]) {
+ struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]);
+
+ if (ivl->vf >= INT_MAX)
+ return -EINVAL;
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_link_state)
+ err = ops->ndo_set_vf_link_state(dev, ivl->vf,
+ ivl->link_state);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_RSS_QUERY_EN]) {
+ struct ifla_vf_rss_query_en *ivrssq_en;
+
+ err = -EOPNOTSUPP;
+ ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]);
+ if (ivrssq_en->vf >= INT_MAX)
+ return -EINVAL;
+ if (ops->ndo_set_vf_rss_query_en)
+ err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf,
+ ivrssq_en->setting);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_TRUST]) {
+ struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]);
+
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_trust)
+ err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_IB_NODE_GUID]) {
+ struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]);
+
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
+ if (!ops->ndo_set_vf_guid)
+ return -EOPNOTSUPP;
+ return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID);
+ }
+
+ if (tb[IFLA_VF_IB_PORT_GUID]) {
+ struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]);
+
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
+ if (!ops->ndo_set_vf_guid)
+ return -EOPNOTSUPP;
+
+ return handle_vf_guid(dev, ivt, IFLA_VF_IB_PORT_GUID);
+ }
+
+ return err;
+}
+
+static int do_set_master(struct net_device *dev, int ifindex,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *upper_dev = netdev_master_upper_dev_get(dev);
+ const struct net_device_ops *ops;
+ int err;
+
+ if (upper_dev) {
+ if (upper_dev->ifindex == ifindex)
+ return 0;
+ ops = upper_dev->netdev_ops;
+ if (ops->ndo_del_slave) {
+ err = ops->ndo_del_slave(upper_dev, dev);
+ if (err)
+ return err;
+ } else {
+ return -EOPNOTSUPP;
+ }
+ }
+
+ if (ifindex) {
+ upper_dev = __dev_get_by_index(dev_net(dev), ifindex);
+ if (!upper_dev)
+ return -EINVAL;
+ ops = upper_dev->netdev_ops;
+ if (ops->ndo_add_slave) {
+ err = ops->ndo_add_slave(upper_dev, dev, extack);
+ if (err)
+ return err;
+ } else {
+ return -EOPNOTSUPP;
+ }
+ }
+ return 0;
+}
+
+#define DO_SETLINK_MODIFIED 0x01
+/* notify flag means notify + modified. */
+#define DO_SETLINK_NOTIFY 0x03
+static int do_setlink(const struct sk_buff *skb,
+ struct net_device *dev, struct ifinfomsg *ifm,
+ struct netlink_ext_ack *extack,
+ struct nlattr **tb, char *ifname, int status)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int err;
+
+ err = validate_linkmsg(dev, tb);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) {
+ struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev),
+ tb, CAP_NET_ADMIN);
+ if (IS_ERR(net)) {
+ err = PTR_ERR(net);
+ goto errout;
+ }
+
+ err = dev_change_net_namespace(dev, net, ifname);
+ put_net(net);
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_MODIFIED;
+ }
+
+ if (tb[IFLA_MAP]) {
+ struct rtnl_link_ifmap *u_map;
+ struct ifmap k_map;
+
+ if (!ops->ndo_set_config) {
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
+ if (!netif_device_present(dev)) {
+ err = -ENODEV;
+ goto errout;
+ }
+
+ u_map = nla_data(tb[IFLA_MAP]);
+ k_map.mem_start = (unsigned long) u_map->mem_start;
+ k_map.mem_end = (unsigned long) u_map->mem_end;
+ k_map.base_addr = (unsigned short) u_map->base_addr;
+ k_map.irq = (unsigned char) u_map->irq;
+ k_map.dma = (unsigned char) u_map->dma;
+ k_map.port = (unsigned char) u_map->port;
+
+ err = ops->ndo_set_config(dev, &k_map);
+ if (err < 0)
+ goto errout;
+
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ if (tb[IFLA_ADDRESS]) {
+ struct sockaddr *sa;
+ int len;
+
+ len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len,
+ sizeof(*sa));
+ sa = kmalloc(len, GFP_KERNEL);
+ if (!sa) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ sa->sa_family = dev->type;
+ memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]),
+ dev->addr_len);
+ err = dev_set_mac_address(dev, sa);
+ kfree(sa);
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_MODIFIED;
+ }
+
+ if (tb[IFLA_MTU]) {
+ err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack);
+ if (err < 0)
+ goto errout;
+ status |= DO_SETLINK_MODIFIED;
+ }
+
+ if (tb[IFLA_GROUP]) {
+ dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ /*
+ * An interface selected by ifindex with an interface
+ * name also provided implies that a name change has
+ * been requested.
+ */
+ if (ifm->ifi_index > 0 && ifname[0]) {
+ err = dev_change_name(dev, ifname);
+ if (err < 0)
+ goto errout;
+ status |= DO_SETLINK_MODIFIED;
+ }
+
+ if (tb[IFLA_IFALIAS]) {
+ err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]),
+ nla_len(tb[IFLA_IFALIAS]));
+ if (err < 0)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ if (tb[IFLA_BROADCAST]) {
+ nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ }
+
+ if (ifm->ifi_flags || ifm->ifi_change) {
+ err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm));
+ if (err < 0)
+ goto errout;
+ }
+
+ if (tb[IFLA_MASTER]) {
+ err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack);
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_MODIFIED;
+ }
+
+ if (tb[IFLA_CARRIER]) {
+ err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER]));
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_MODIFIED;
+ }
+
+ if (tb[IFLA_TXQLEN]) {
+ unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]);
+
+ err = dev_change_tx_queue_len(dev, value);
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_MODIFIED;
+ }
+
+ if (tb[IFLA_GSO_MAX_SIZE]) {
+ u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]);
+
+ if (max_size > GSO_MAX_SIZE) {
+ err = -EINVAL;
+ goto errout;
+ }
+
+ if (dev->gso_max_size ^ max_size) {
+ netif_set_gso_max_size(dev, max_size);
+ status |= DO_SETLINK_MODIFIED;
+ }
+ }
+
+ if (tb[IFLA_GSO_MAX_SEGS]) {
+ u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
+
+ if (max_segs > GSO_MAX_SEGS) {
+ err = -EINVAL;
+ goto errout;
+ }
+
+ if (dev->gso_max_segs ^ max_segs) {
+ dev->gso_max_segs = max_segs;
+ status |= DO_SETLINK_MODIFIED;
+ }
+ }
+
+ if (tb[IFLA_OPERSTATE])
+ set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
+
+ if (tb[IFLA_LINKMODE]) {
+ unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]);
+
+ write_lock_bh(&dev_base_lock);
+ if (dev->link_mode ^ value)
+ status |= DO_SETLINK_NOTIFY;
+ dev->link_mode = value;
+ write_unlock_bh(&dev_base_lock);
+ }
+
+ if (tb[IFLA_VFINFO_LIST]) {
+ struct nlattr *vfinfo[IFLA_VF_MAX + 1];
+ struct nlattr *attr;
+ int rem;
+
+ nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) {
+ if (nla_type(attr) != IFLA_VF_INFO ||
+ nla_len(attr) < NLA_HDRLEN) {
+ err = -EINVAL;
+ goto errout;
+ }
+ err = nla_parse_nested(vfinfo, IFLA_VF_MAX, attr,
+ ifla_vf_policy, NULL);
+ if (err < 0)
+ goto errout;
+ err = do_setvfinfo(dev, vfinfo);
+ if (err < 0)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
+ }
+ err = 0;
+
+ if (tb[IFLA_VF_PORTS]) {
+ struct nlattr *port[IFLA_PORT_MAX+1];
+ struct nlattr *attr;
+ int vf;
+ int rem;
+
+ err = -EOPNOTSUPP;
+ if (!ops->ndo_set_vf_port)
+ goto errout;
+
+ nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) {
+ if (nla_type(attr) != IFLA_VF_PORT ||
+ nla_len(attr) < NLA_HDRLEN) {
+ err = -EINVAL;
+ goto errout;
+ }
+ err = nla_parse_nested(port, IFLA_PORT_MAX, attr,
+ ifla_port_policy, NULL);
+ if (err < 0)
+ goto errout;
+ if (!port[IFLA_PORT_VF]) {
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+ vf = nla_get_u32(port[IFLA_PORT_VF]);
+ err = ops->ndo_set_vf_port(dev, vf, port);
+ if (err < 0)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
+ }
+ err = 0;
+
+ if (tb[IFLA_PORT_SELF]) {
+ struct nlattr *port[IFLA_PORT_MAX+1];
+
+ err = nla_parse_nested(port, IFLA_PORT_MAX,
+ tb[IFLA_PORT_SELF], ifla_port_policy,
+ NULL);
+ if (err < 0)
+ goto errout;
+
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_port)
+ err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port);
+ if (err < 0)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ if (tb[IFLA_AF_SPEC]) {
+ struct nlattr *af;
+ int rem;
+
+ nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
+ const struct rtnl_af_ops *af_ops;
+
+ rcu_read_lock();
+
+ BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af))));
+
+ err = af_ops->set_link_af(dev, af);
+ if (err < 0) {
+ rcu_read_unlock();
+ goto errout;
+ }
+
+ rcu_read_unlock();
+ status |= DO_SETLINK_NOTIFY;
+ }
+ }
+ err = 0;
+
+ if (tb[IFLA_PROTO_DOWN]) {
+ err = dev_change_proto_down(dev,
+ nla_get_u8(tb[IFLA_PROTO_DOWN]));
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ if (tb[IFLA_XDP]) {
+ struct nlattr *xdp[IFLA_XDP_MAX + 1];
+ u32 xdp_flags = 0;
+
+ err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP],
+ ifla_xdp_policy, NULL);
+ if (err < 0)
+ goto errout;
+
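+		/* IFLA_XDP_ATTACHED and IFLA_XDP_PROG_ID are reported by the
+		 * kernel and may not be set from userspace.
+		 */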
+ if (xdp[IFLA_XDP_ATTACHED] || xdp[IFLA_XDP_PROG_ID]) {
+ err = -EINVAL;
+ goto errout;
+ }
+
+ if (xdp[IFLA_XDP_FLAGS]) {
+ xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]);
+ if (xdp_flags & ~XDP_FLAGS_MASK) {
+ err = -EINVAL;
+ goto errout;
+ }
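+			/* At most one XDP attach mode (SKB, DRV or HW) may be
+			 * requested at a time.
+			 */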
+ if (hweight32(xdp_flags & XDP_FLAGS_MODES) > 1) {
+ err = -EINVAL;
+ goto errout;
+ }
+ }
+
+ if (xdp[IFLA_XDP_FD]) {
+ err = dev_change_xdp_fd(dev, extack,
+ nla_get_s32(xdp[IFLA_XDP_FD]),
+ xdp_flags);
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
+ }
+
+errout:
+ if (status & DO_SETLINK_MODIFIED) {
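+		/* Only attributes flagged DO_SETLINK_NOTIFY trigger a
+		 * notification here; changes that are merely MODIFIED are
+		 * expected to generate their own change events.
+		 */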
+ if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY)
+ netdev_state_change(dev);
+
+ if (err < 0)
+ net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n",
+ dev->name);
+ }
+
+ return err;
+}
+
+static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ifinfomsg *ifm;
+ struct net_device *dev;
+ int err;
+ struct nlattr *tb[IFLA_MAX+1];
+ char ifname[IFNAMSIZ];
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy,
+ extack);
+ if (err < 0)
+ goto errout;
+
+ err = rtnl_ensure_unique_netns(tb, extack, false);
+ if (err < 0)
+ goto errout;
+
+ if (tb[IFLA_IFNAME])
+ nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+ else
+ ifname[0] = '\0';
+
+ err = -EINVAL;
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_index > 0)
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ else if (tb[IFLA_IFNAME])
+ dev = __dev_get_by_name(net, ifname);
+ else
+ goto errout;
+
+ if (dev == NULL) {
+ err = -ENODEV;
+ goto errout;
+ }
+
+ err = do_setlink(skb, dev, ifm, extack, tb, ifname, 0);
+errout:
+ return err;
+}
+
+static int rtnl_group_dellink(const struct net *net, int group)
+{
+ struct net_device *dev, *aux;
+ LIST_HEAD(list_kill);
+ bool found = false;
+
+ if (!group)
+ return -EPERM;
+
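+	/* First pass: verify that every member of the group supports
+	 * ->dellink before any of them is torn down.
+	 */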
+ for_each_netdev(net, dev) {
+ if (dev->group == group) {
+ const struct rtnl_link_ops *ops;
+
+ found = true;
+ ops = dev->rtnl_link_ops;
+ if (!ops || !ops->dellink)
+ return -EOPNOTSUPP;
+ }
+ }
+
+ if (!found)
+ return -ENODEV;
+
+ for_each_netdev_safe(net, dev, aux) {
+ if (dev->group == group) {
+ const struct rtnl_link_ops *ops;
+
+ ops = dev->rtnl_link_ops;
+ ops->dellink(dev, &list_kill);
+ }
+ }
+ unregister_netdevice_many(&list_kill);
+
+ return 0;
+}
+
+int rtnl_delete_link(struct net_device *dev)
+{
+ const struct rtnl_link_ops *ops;
+ LIST_HEAD(list_kill);
+
+ ops = dev->rtnl_link_ops;
+ if (!ops || !ops->dellink)
+ return -EOPNOTSUPP;
+
+ ops->dellink(dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rtnl_delete_link);
+
+static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct net *tgt_net = net;
+ struct net_device *dev = NULL;
+ struct ifinfomsg *ifm;
+ char ifname[IFNAMSIZ];
+ struct nlattr *tb[IFLA_MAX+1];
+ int err;
+ int netnsid = -1;
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack);
+ if (err < 0)
+ return err;
+
+ err = rtnl_ensure_unique_netns(tb, extack, true);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_IFNAME])
+ nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+
+ if (tb[IFLA_IF_NETNSID]) {
+ netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
+ tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid);
+ if (IS_ERR(tgt_net))
+ return PTR_ERR(tgt_net);
+ }
+
+ err = -EINVAL;
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_index > 0)
+ dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
+ else if (tb[IFLA_IFNAME])
+ dev = __dev_get_by_name(tgt_net, ifname);
+ else if (tb[IFLA_GROUP])
+ err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP]));
+ else
+ goto out;
+
+ if (!dev) {
+ if (tb[IFLA_IFNAME] || ifm->ifi_index > 0)
+ err = -ENODEV;
+
+ goto out;
+ }
+
+ err = rtnl_delete_link(dev);
+
+out:
+ if (netnsid >= 0)
+ put_net(tgt_net);
+
+ return err;
+}
+
+int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
+{
+ unsigned int old_flags;
+ int err;
+
+ old_flags = dev->flags;
+ if (ifm && (ifm->ifi_flags || ifm->ifi_change)) {
+ err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm));
+ if (err < 0)
+ return err;
+ }
+
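+	/* The first notification for a freshly created link reports every
+	 * flag (~0U change mask); once the link is marked initialized, only
+	 * the flags that actually changed are reported.
+	 */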
+ if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) {
+ __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags));
+ } else {
+ dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
+ __dev_notify_flags(dev, old_flags, ~0U);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(rtnl_configure_link);
+
+struct net_device *rtnl_create_link(struct net *net,
+ const char *ifname, unsigned char name_assign_type,
+ const struct rtnl_link_ops *ops, struct nlattr *tb[])
+{
+ struct net_device *dev;
+ unsigned int num_tx_queues = 1;
+ unsigned int num_rx_queues = 1;
+
+ if (tb[IFLA_NUM_TX_QUEUES])
+ num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]);
+ else if (ops->get_num_tx_queues)
+ num_tx_queues = ops->get_num_tx_queues();
+
+ if (tb[IFLA_NUM_RX_QUEUES])
+ num_rx_queues = nla_get_u32(tb[IFLA_NUM_RX_QUEUES]);
+ else if (ops->get_num_rx_queues)
+ num_rx_queues = ops->get_num_rx_queues();
+
+ if (num_tx_queues < 1 || num_tx_queues > 4096)
+ return ERR_PTR(-EINVAL);
+
+ if (num_rx_queues < 1 || num_rx_queues > 4096)
+ return ERR_PTR(-EINVAL);
+
+ dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type,
+ ops->setup, num_tx_queues, num_rx_queues);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ dev_net_set(dev, net);
+ dev->rtnl_link_ops = ops;
+ dev->rtnl_link_state = RTNL_LINK_INITIALIZING;
+
+ if (tb[IFLA_MTU]) {
+ u32 mtu = nla_get_u32(tb[IFLA_MTU]);
+ int err;
+
+ err = dev_validate_mtu(dev, mtu, NULL);
+ if (err) {
+ free_netdev(dev);
+ return ERR_PTR(err);
+ }
+ dev->mtu = mtu;
+ }
+ if (tb[IFLA_ADDRESS]) {
+ memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]),
+ nla_len(tb[IFLA_ADDRESS]));
+ dev->addr_assign_type = NET_ADDR_SET;
+ }
+ if (tb[IFLA_BROADCAST])
+ memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]),
+ nla_len(tb[IFLA_BROADCAST]));
+ if (tb[IFLA_TXQLEN])
+ dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
+ if (tb[IFLA_OPERSTATE])
+ set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
+ if (tb[IFLA_LINKMODE])
+ dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
+ if (tb[IFLA_GROUP])
+ dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+ if (tb[IFLA_GSO_MAX_SIZE])
+ netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE]));
+ if (tb[IFLA_GSO_MAX_SEGS])
+ dev->gso_max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
+
+ return dev;
+}
+EXPORT_SYMBOL(rtnl_create_link);
+
+static int rtnl_group_changelink(const struct sk_buff *skb,
+ struct net *net, int group,
+ struct ifinfomsg *ifm,
+ struct netlink_ext_ack *extack,
+ struct nlattr **tb)
+{
+ struct net_device *dev, *aux;
+ int err;
+
+ for_each_netdev_safe(net, dev, aux) {
+ if (dev->group == group) {
+ err = do_setlink(skb, dev, ifm, extack, tb, NULL, 0);
+ if (err < 0)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ const struct rtnl_link_ops *ops;
+ const struct rtnl_link_ops *m_ops;
+ struct net_device *dev;
+ struct net_device *master_dev;
+ struct ifinfomsg *ifm;
+ char kind[MODULE_NAME_LEN];
+ char ifname[IFNAMSIZ];
+ struct nlattr *tb[IFLA_MAX+1];
+ struct nlattr *linkinfo[IFLA_INFO_MAX+1];
+ unsigned char name_assign_type = NET_NAME_USER;
+ int err;
+
+#ifdef CONFIG_MODULES
+replay:
+#endif
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack);
+ if (err < 0)
+ return err;
+
+ err = rtnl_ensure_unique_netns(tb, extack, false);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_IFNAME])
+ nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+ else
+ ifname[0] = '\0';
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_index > 0)
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ else {
+ if (ifname[0])
+ dev = __dev_get_by_name(net, ifname);
+ else
+ dev = NULL;
+ }
+
+ master_dev = NULL;
+ m_ops = NULL;
+ if (dev) {
+ master_dev = netdev_master_upper_dev_get(dev);
+ if (master_dev)
+ m_ops = master_dev->rtnl_link_ops;
+ }
+
+ err = validate_linkmsg(dev, tb);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_LINKINFO]) {
+ err = nla_parse_nested(linkinfo, IFLA_INFO_MAX,
+ tb[IFLA_LINKINFO], ifla_info_policy,
+ NULL);
+ if (err < 0)
+ return err;
+ } else
+ memset(linkinfo, 0, sizeof(linkinfo));
+
+ if (linkinfo[IFLA_INFO_KIND]) {
+ nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind));
+ ops = rtnl_link_ops_get(kind);
+ } else {
+ kind[0] = '\0';
+ ops = NULL;
+ }
+
+ if (1) {
+ struct nlattr *attr[RTNL_MAX_TYPE + 1];
+ struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
+ struct nlattr **data = NULL;
+ struct nlattr **slave_data = NULL;
+ struct net *dest_net, *link_net = NULL;
+
+ if (ops) {
+ if (ops->maxtype > RTNL_MAX_TYPE)
+ return -EINVAL;
+
+ if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
+ err = nla_parse_nested(attr, ops->maxtype,
+ linkinfo[IFLA_INFO_DATA],
+ ops->policy, NULL);
+ if (err < 0)
+ return err;
+ data = attr;
+ }
+ if (ops->validate) {
+ err = ops->validate(tb, data, extack);
+ if (err < 0)
+ return err;
+ }
+ }
+
+ if (m_ops) {
+ if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)
+ return -EINVAL;
+
+ if (m_ops->slave_maxtype &&
+ linkinfo[IFLA_INFO_SLAVE_DATA]) {
+ err = nla_parse_nested(slave_attr,
+ m_ops->slave_maxtype,
+ linkinfo[IFLA_INFO_SLAVE_DATA],
+ m_ops->slave_policy,
+ NULL);
+ if (err < 0)
+ return err;
+ slave_data = slave_attr;
+ }
+ }
+
+ if (dev) {
+ int status = 0;
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+ if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ return -EOPNOTSUPP;
+
+ if (linkinfo[IFLA_INFO_DATA]) {
+ if (!ops || ops != dev->rtnl_link_ops ||
+ !ops->changelink)
+ return -EOPNOTSUPP;
+
+ err = ops->changelink(dev, tb, data, extack);
+ if (err < 0)
+ return err;
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ if (linkinfo[IFLA_INFO_SLAVE_DATA]) {
+ if (!m_ops || !m_ops->slave_changelink)
+ return -EOPNOTSUPP;
+
+ err = m_ops->slave_changelink(master_dev, dev,
+ tb, slave_data,
+ extack);
+ if (err < 0)
+ return err;
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ return do_setlink(skb, dev, ifm, extack, tb, ifname,
+ status);
+ }
+
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+ if (ifm->ifi_index == 0 && tb[IFLA_GROUP])
+ return rtnl_group_changelink(skb, net,
+ nla_get_u32(tb[IFLA_GROUP]),
+ ifm, extack, tb);
+ return -ENODEV;
+ }
+
+ if (tb[IFLA_MAP] || tb[IFLA_PROTINFO])
+ return -EOPNOTSUPP;
+
+ if (!ops) {
+#ifdef CONFIG_MODULES
+ if (kind[0]) {
+ __rtnl_unlock();
+ request_module("rtnl-link-%s", kind);
+ rtnl_lock();
+ ops = rtnl_link_ops_get(kind);
+ if (ops)
+ goto replay;
+ }
+#endif
+ return -EOPNOTSUPP;
+ }
+
+ if (!ops->setup)
+ return -EOPNOTSUPP;
+
+ if (!ifname[0]) {
+ snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
+ name_assign_type = NET_NAME_ENUM;
+ }
+
+ dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN);
+ if (IS_ERR(dest_net))
+ return PTR_ERR(dest_net);
+
+ if (tb[IFLA_LINK_NETNSID]) {
+ int id = nla_get_s32(tb[IFLA_LINK_NETNSID]);
+
+ link_net = get_net_ns_by_id(dest_net, id);
+ if (!link_net) {
+ err = -EINVAL;
+ goto out;
+ }
+ err = -EPERM;
+ if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN))
+ goto out;
+ }
+
+ dev = rtnl_create_link(link_net ? : dest_net, ifname,
+ name_assign_type, ops, tb);
+ if (IS_ERR(dev)) {
+ err = PTR_ERR(dev);
+ goto out;
+ }
+
+ dev->ifindex = ifm->ifi_index;
+
+ if (ops->newlink) {
+ err = ops->newlink(link_net ? : net, dev, tb, data,
+ extack);
+ /* Drivers should call free_netdev() in ->destructor
+		 * and unregister the device on failure after registration,
+		 * so that it can be finally freed in rtnl_unlock().
+ */
+ if (err < 0) {
+ /* If device is not registered at all, free it now */
+ if (dev->reg_state == NETREG_UNINITIALIZED ||
+ dev->reg_state == NETREG_UNREGISTERED)
+ free_netdev(dev);
+ goto out;
+ }
+ } else {
+ err = register_netdevice(dev);
+ if (err < 0) {
+ free_netdev(dev);
+ goto out;
+ }
+ }
+ err = rtnl_configure_link(dev, ifm);
+ if (err < 0)
+ goto out_unregister;
+ if (link_net) {
+ err = dev_change_net_namespace(dev, dest_net, ifname);
+ if (err < 0)
+ goto out_unregister;
+ }
+ if (tb[IFLA_MASTER]) {
+ err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]),
+ extack);
+ if (err)
+ goto out_unregister;
+ }
+out:
+ if (link_net)
+ put_net(link_net);
+ put_net(dest_net);
+ return err;
+out_unregister:
+ if (ops->newlink) {
+ LIST_HEAD(list_kill);
+
+ ops->dellink(dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
+ } else {
+ unregister_netdevice(dev);
+ }
+ goto out;
+ }
+}
+
+static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct net *tgt_net = net;
+ struct ifinfomsg *ifm;
+ char ifname[IFNAMSIZ];
+ struct nlattr *tb[IFLA_MAX+1];
+ struct net_device *dev = NULL;
+ struct sk_buff *nskb;
+ int netnsid = -1;
+ int err;
+ u32 ext_filter_mask = 0;
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack);
+ if (err < 0)
+ return err;
+
+ err = rtnl_ensure_unique_netns(tb, extack, true);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_IF_NETNSID]) {
+ netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
+ tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid);
+ if (IS_ERR(tgt_net))
+ return PTR_ERR(tgt_net);
+ }
+
+ if (tb[IFLA_IFNAME])
+ nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+
+ if (tb[IFLA_EXT_MASK])
+ ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+
+ err = -EINVAL;
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_index > 0)
+ dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
+ else if (tb[IFLA_IFNAME])
+ dev = __dev_get_by_name(tgt_net, ifname);
+ else
+ goto out;
+
+ err = -ENODEV;
+ if (dev == NULL)
+ goto out;
+
+ err = -ENOBUFS;
+ nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL);
+ if (nskb == NULL)
+ goto out;
+
+ err = rtnl_fill_ifinfo(nskb, dev, net,
+ RTM_NEWLINK, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, 0, 0, ext_filter_mask,
+ 0, NULL, 0, netnsid, GFP_KERNEL);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in if_nlmsg_size */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(nskb);
+ } else
+ err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid);
+out:
+ if (netnsid >= 0)
+ put_net(tgt_net);
+
+ return err;
+}
+
+static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ struct nlattr *tb[IFLA_MAX+1];
+ u32 ext_filter_mask = 0;
+ u16 min_ifinfo_dump_size = 0;
+ int hdrlen;
+
+ /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */
+ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ?
+ sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
+
+ if (nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, NULL) >= 0) {
+ if (tb[IFLA_EXT_MASK])
+ ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+ }
+
+ if (!ext_filter_mask)
+ return NLMSG_GOODSIZE;
+ /*
+ * traverse the list of net devices and compute the minimum
+ * buffer size based upon the filter mask.
+ */
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size,
+ if_nlmsg_size(dev,
+ ext_filter_mask));
+ }
+ rcu_read_unlock();
+
+ return nlmsg_total_size(min_ifinfo_dump_size);
+}
+
+static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int idx;
+ int s_idx = cb->family;
+ int type = cb->nlh->nlmsg_type - RTM_BASE;
+
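+	/* cb->family records how far the previous dump pass got, so a
+	 * resumed dump continues from that protocol family instead of
+	 * starting over.
+	 */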
+ if (s_idx == 0)
+ s_idx = 1;
+
+ for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) {
+ struct rtnl_link **tab;
+ struct rtnl_link *link;
+ rtnl_dumpit_func dumpit;
+
+ if (idx < s_idx || idx == PF_PACKET)
+ continue;
+
+ if (type < 0 || type >= RTM_NR_MSGTYPES)
+ continue;
+
+ tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]);
+ if (!tab)
+ continue;
+
+ link = tab[type];
+ if (!link)
+ continue;
+
+ dumpit = link->dumpit;
+ if (!dumpit)
+ continue;
+
+ if (idx > s_idx) {
+ memset(&cb->args[0], 0, sizeof(cb->args));
+ cb->prev_seq = 0;
+ cb->seq = 0;
+ }
+ if (dumpit(skb, cb))
+ break;
+ }
+ cb->family = idx;
+
+ return skb->len;
+}
+
+struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
+ unsigned int change,
+ u32 event, gfp_t flags, int *new_nsid,
+ int new_ifindex)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+ size_t if_info_size;
+
+ skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), flags);
+ if (skb == NULL)
+ goto errout;
+
+ err = rtnl_fill_ifinfo(skb, dev, dev_net(dev),
+ type, 0, 0, change, 0, 0, event,
+ new_nsid, new_ifindex, -1, flags);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in if_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ return skb;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+ return NULL;
+}
+
+void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags)
+{
+ struct net *net = dev_net(dev);
+
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags);
+}
+
+static void rtmsg_ifinfo_event(int type, struct net_device *dev,
+ unsigned int change, u32 event,
+ gfp_t flags, int *new_nsid, int new_ifindex)
+{
+ struct sk_buff *skb;
+
+ if (dev->reg_state != NETREG_REGISTERED)
+ return;
+
+ skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid,
+ new_ifindex);
+ if (skb)
+ rtmsg_ifinfo_send(skb, dev, flags);
+}
+
+void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
+ gfp_t flags)
+{
+ rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags,
+ NULL, 0);
+}
+
+void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
+ gfp_t flags, int *new_nsid, int new_ifindex)
+{
+ rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags,
+ new_nsid, new_ifindex);
+}
+
+static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
+ struct net_device *dev,
+ u8 *addr, u16 vid, u32 pid, u32 seq,
+ int type, unsigned int flags,
+ int nlflags, u16 ndm_state)
+{
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), nlflags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ndm = nlmsg_data(nlh);
+ ndm->ndm_family = AF_BRIDGE;
+ ndm->ndm_pad1 = 0;
+ ndm->ndm_pad2 = 0;
+ ndm->ndm_flags = flags;
+ ndm->ndm_type = 0;
+ ndm->ndm_ifindex = dev->ifindex;
+ ndm->ndm_state = ndm_state;
+
+ if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr))
+ goto nla_put_failure;
+ if (vid)
+ if (nla_put(skb, NDA_VLAN, sizeof(u16), &vid))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static inline size_t rtnl_fdb_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ndmsg)) +
+ nla_total_size(ETH_ALEN) + /* NDA_LLADDR */
+ nla_total_size(sizeof(u16)) + /* NDA_VLAN */
+ 0;
+}
+
+static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type,
+ u16 ndm_state)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(rtnl_fdb_nlmsg_size(), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = nlmsg_populate_fdb_fill(skb, dev, addr, vid,
+ 0, 0, type, NTF_SELF, 0, ndm_state);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+ return;
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+}
+
+/**
+ * ndo_dflt_fdb_add - default netdevice operation to add an FDB entry
+ * @ndm: neighbour discovery message from the netlink request
+ * @tb: parsed netlink attributes of the request
+ * @dev: netdevice whose forwarding table is updated
+ * @addr: link-layer address to add
+ * @vid: VLAN id (0 if none was given)
+ * @flags: netlink message flags (NLM_F_*)
+ */
+int ndo_dflt_fdb_add(struct ndmsg *ndm,
+ struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr, u16 vid,
+ u16 flags)
+{
+ int err = -EINVAL;
+
+	/* If aging addresses are supported, the device will need to
+ * implement its own handler for this.
+ */
+ if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) {
+ pr_info("%s: FDB only supports static addresses\n", dev->name);
+ return err;
+ }
+
+ if (vid) {
+ pr_info("%s: vlans aren't supported yet for dev_uc|mc_add()\n", dev->name);
+ return err;
+ }
+
+ if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr))
+ err = dev_uc_add_excl(dev, addr);
+ else if (is_multicast_ether_addr(addr))
+ err = dev_mc_add_excl(dev, addr);
+
+ /* Only return duplicate errors if NLM_F_EXCL is set */
+ if (err == -EEXIST && !(flags & NLM_F_EXCL))
+ err = 0;
+
+ return err;
+}
+EXPORT_SYMBOL(ndo_dflt_fdb_add);
+
+static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid,
+ struct netlink_ext_ack *extack)
+{
+ u16 vid = 0;
+
+ if (vlan_attr) {
+ if (nla_len(vlan_attr) != sizeof(u16)) {
+ NL_SET_ERR_MSG(extack, "invalid vlan attribute size");
+ return -EINVAL;
+ }
+
+ vid = nla_get_u16(vlan_attr);
+
+ if (!vid || vid >= VLAN_VID_MASK) {
+ NL_SET_ERR_MSG(extack, "invalid vlan id");
+ return -EINVAL;
+ }
+ }
+ *p_vid = vid;
+ return 0;
+}
+
+static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ndmsg *ndm;
+ struct nlattr *tb[NDA_MAX+1];
+ struct net_device *dev;
+ u8 *addr;
+ u16 vid;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack);
+ if (err < 0)
+ return err;
+
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_ifindex == 0) {
+ NL_SET_ERR_MSG(extack, "invalid ifindex");
+ return -EINVAL;
+ }
+
+ dev = __dev_get_by_index(net, ndm->ndm_ifindex);
+ if (dev == NULL) {
+ NL_SET_ERR_MSG(extack, "unknown ifindex");
+ return -ENODEV;
+ }
+
+ if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
+ NL_SET_ERR_MSG(extack, "invalid address");
+ return -EINVAL;
+ }
+
+ if (dev->type != ARPHRD_ETHER) {
+		NL_SET_ERR_MSG(extack, "FDB add only supported for Ethernet devices");
+ return -EINVAL;
+ }
+
+ addr = nla_data(tb[NDA_LLADDR]);
+
+ err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
+ if (err)
+ return err;
+
+ err = -EOPNOTSUPP;
+
+	/* Support fdb on the master device, the net/bridge default case */
+ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
+ (dev->priv_flags & IFF_BRIDGE_PORT)) {
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+ const struct net_device_ops *ops = br_dev->netdev_ops;
+
+ err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid,
+ nlh->nlmsg_flags);
+ if (err)
+ goto out;
+ else
+ ndm->ndm_flags &= ~NTF_MASTER;
+ }
+
+ /* Embedded bridge, macvlan, and any other device support */
+ if ((ndm->ndm_flags & NTF_SELF)) {
+ if (dev->netdev_ops->ndo_fdb_add)
+ err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr,
+ vid,
+ nlh->nlmsg_flags);
+ else
+ err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid,
+ nlh->nlmsg_flags);
+
+ if (!err) {
+ rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH,
+ ndm->ndm_state);
+ ndm->ndm_flags &= ~NTF_SELF;
+ }
+ }
+out:
+ return err;
+}
+
+/**
+ * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry
+ * @ndm: neighbour discovery message from the netlink request
+ * @tb: parsed netlink attributes of the request
+ * @dev: netdevice whose forwarding table is updated
+ * @addr: link-layer address to delete
+ * @vid: VLAN id (0 if none was given)
+ */
+int ndo_dflt_fdb_del(struct ndmsg *ndm,
+ struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr, u16 vid)
+{
+ int err = -EINVAL;
+
+	/* If aging addresses are supported, the device will need to
+ * implement its own handler for this.
+ */
+ if (!(ndm->ndm_state & NUD_PERMANENT)) {
+ pr_info("%s: FDB only supports static addresses\n", dev->name);
+ return err;
+ }
+
+ if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr))
+ err = dev_uc_del(dev, addr);
+ else if (is_multicast_ether_addr(addr))
+ err = dev_mc_del(dev, addr);
+
+ return err;
+}
+EXPORT_SYMBOL(ndo_dflt_fdb_del);
+
+static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ndmsg *ndm;
+ struct nlattr *tb[NDA_MAX+1];
+ struct net_device *dev;
+ int err = -EINVAL;
+ __u8 *addr;
+ u16 vid;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack);
+ if (err < 0)
+ return err;
+
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_ifindex == 0) {
+ NL_SET_ERR_MSG(extack, "invalid ifindex");
+ return -EINVAL;
+ }
+
+ dev = __dev_get_by_index(net, ndm->ndm_ifindex);
+ if (dev == NULL) {
+ NL_SET_ERR_MSG(extack, "unknown ifindex");
+ return -ENODEV;
+ }
+
+ if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
+ NL_SET_ERR_MSG(extack, "invalid address");
+ return -EINVAL;
+ }
+
+ if (dev->type != ARPHRD_ETHER) {
+		NL_SET_ERR_MSG(extack, "FDB delete only supported for Ethernet devices");
+ return -EINVAL;
+ }
+
+ addr = nla_data(tb[NDA_LLADDR]);
+
+ err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
+ if (err)
+ return err;
+
+ err = -EOPNOTSUPP;
+
+	/* Support fdb on the master device, the net/bridge default case */
+ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
+ (dev->priv_flags & IFF_BRIDGE_PORT)) {
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+ const struct net_device_ops *ops = br_dev->netdev_ops;
+
+ if (ops->ndo_fdb_del)
+ err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid);
+
+ if (err)
+ goto out;
+ else
+ ndm->ndm_flags &= ~NTF_MASTER;
+ }
+
+ /* Embedded bridge, macvlan, and any other device support */
+ if (ndm->ndm_flags & NTF_SELF) {
+ if (dev->netdev_ops->ndo_fdb_del)
+ err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr,
+ vid);
+ else
+ err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
+
+ if (!err) {
+ rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH,
+ ndm->ndm_state);
+ ndm->ndm_flags &= ~NTF_SELF;
+ }
+ }
+out:
+ return err;
+}
+
+static int nlmsg_populate_fdb(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct net_device *dev,
+ int *idx,
+ struct netdev_hw_addr_list *list)
+{
+ struct netdev_hw_addr *ha;
+ int err;
+ u32 portid, seq;
+
+ portid = NETLINK_CB(cb->skb).portid;
+ seq = cb->nlh->nlmsg_seq;
+
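+	/* cb->args[2] holds the number of entries already dumped for this
+	 * device; skip them when the dump is resumed.
+	 */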
+ list_for_each_entry(ha, &list->list, list) {
+ if (*idx < cb->args[2])
+ goto skip;
+
+ err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0,
+ portid, seq,
+ RTM_NEWNEIGH, NTF_SELF,
+ NLM_F_MULTI, NUD_PERMANENT);
+ if (err < 0)
+ return err;
+skip:
+ *idx += 1;
+ }
+ return 0;
+}
+
+/**
+ * ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table.
+ * @skb: socket buffer the FDB entries are filled into
+ * @cb: netlink callback holding the dump state
+ * @dev: netdevice whose address lists are dumped
+ * @filter_dev: unused by the default implementation
+ * @idx: index of the next entry to dump, advanced for every entry added
+ *
+ * Default netdevice operation to dump the existing unicast and multicast
+ * address lists.  Returns 0 on success or a negative error code.
+ */
+int ndo_dflt_fdb_dump(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct net_device *dev,
+ struct net_device *filter_dev,
+ int *idx)
+{
+ int err;
+
+ if (dev->type != ARPHRD_ETHER)
+ return -EINVAL;
+
+ netif_addr_lock_bh(dev);
+ err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc);
+ if (err)
+ goto out;
+ err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc);
+out:
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(ndo_dflt_fdb_dump);
+
+static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net_device *dev;
+ struct nlattr *tb[IFLA_MAX+1];
+ struct net_device *br_dev = NULL;
+ const struct net_device_ops *ops = NULL;
+ const struct net_device_ops *cops = NULL;
+ struct ifinfomsg *ifm = nlmsg_data(cb->nlh);
+ struct net *net = sock_net(skb->sk);
+ struct hlist_head *head;
+ int brport_idx = 0;
+ int br_idx = 0;
+ int h, s_h;
+ int idx = 0, s_idx;
+ int err = 0;
+ int fidx = 0;
+
+ /* A hack to preserve kernel<->userspace interface.
+ * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0.
+ * However, ndmsg is shorter than ifinfomsg thus nlmsg_parse() bails.
+ * So, check for ndmsg with an optional u32 attribute (not used here).
+ * Fortunately these sizes don't conflict with the size of ifinfomsg
+ * with an optional attribute.
+ */
+ if (nlmsg_len(cb->nlh) != sizeof(struct ndmsg) &&
+ (nlmsg_len(cb->nlh) != sizeof(struct ndmsg) +
+ nla_attr_size(sizeof(u32)))) {
+ err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb,
+ IFLA_MAX, ifla_policy, NULL);
+ if (err < 0) {
+ return -EINVAL;
+ } else if (err == 0) {
+ if (tb[IFLA_MASTER])
+ br_idx = nla_get_u32(tb[IFLA_MASTER]);
+ }
+
+ brport_idx = ifm->ifi_index;
+ }
+
+ if (br_idx) {
+ br_dev = __dev_get_by_index(net, br_idx);
+ if (!br_dev)
+ return -ENODEV;
+
+ ops = br_dev->netdev_ops;
+ }
+
+ s_h = cb->args[0];
+ s_idx = cb->args[1];
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ hlist_for_each_entry(dev, head, index_hlist) {
+
+ if (brport_idx && (dev->ifindex != brport_idx))
+ continue;
+
+ if (!br_idx) { /* user did not specify a specific bridge */
+ if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ br_dev = netdev_master_upper_dev_get(dev);
+ cops = br_dev->netdev_ops;
+ }
+ } else {
+ if (dev != br_dev &&
+ !(dev->priv_flags & IFF_BRIDGE_PORT))
+ continue;
+
+ if (br_dev != netdev_master_upper_dev_get(dev) &&
+ !(dev->priv_flags & IFF_EBRIDGE))
+ continue;
+ cops = ops;
+ }
+
+ if (idx < s_idx)
+ goto cont;
+
+ if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ if (cops && cops->ndo_fdb_dump) {
+ err = cops->ndo_fdb_dump(skb, cb,
+ br_dev, dev,
+ &fidx);
+ if (err == -EMSGSIZE)
+ goto out;
+ }
+ }
+
+ if (dev->netdev_ops->ndo_fdb_dump)
+ err = dev->netdev_ops->ndo_fdb_dump(skb, cb,
+ dev, NULL,
+ &fidx);
+ else
+ err = ndo_dflt_fdb_dump(skb, cb, dev, NULL,
+ &fidx);
+ if (err == -EMSGSIZE)
+ goto out;
+
+ cops = NULL;
+
+ /* reset fdb offset to 0 for rest of the interfaces */
+ cb->args[2] = 0;
+ fidx = 0;
+cont:
+ idx++;
+ }
+ }
+
+out:
+ cb->args[0] = h;
+ cb->args[1] = idx;
+ cb->args[2] = fidx;
+
+ return skb->len;
+}
+
+static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask,
+ unsigned int attrnum, unsigned int flag)
+{
+ if (mask & flag)
+ return nla_put_u8(skb, attrnum, !!(flags & flag));
+ return 0;
+}
+
+int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+ struct net_device *dev, u16 mode,
+ u32 flags, u32 mask, int nlflags,
+ u32 filter_mask,
+ int (*vlan_fill)(struct sk_buff *skb,
+ struct net_device *dev,
+ u32 filter_mask))
+{
+ struct nlmsghdr *nlh;
+ struct ifinfomsg *ifm;
+ struct nlattr *br_afspec;
+ struct nlattr *protinfo;
+ u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+ int err = 0;
+
+ nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ifm = nlmsg_data(nlh);
+ ifm->ifi_family = AF_BRIDGE;
+ ifm->__ifi_pad = 0;
+ ifm->ifi_type = dev->type;
+ ifm->ifi_index = dev->ifindex;
+ ifm->ifi_flags = dev_get_flags(dev);
+ ifm->ifi_change = 0;
+
+ if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
+ nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
+ nla_put_u8(skb, IFLA_OPERSTATE, operstate) ||
+ (br_dev &&
+ nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) ||
+ (dev->addr_len &&
+ nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
+ (dev->ifindex != dev_get_iflink(dev) &&
+ nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))))
+ goto nla_put_failure;
+
+ br_afspec = nla_nest_start(skb, IFLA_AF_SPEC);
+ if (!br_afspec)
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF)) {
+ nla_nest_cancel(skb, br_afspec);
+ goto nla_put_failure;
+ }
+
+ if (mode != BRIDGE_MODE_UNDEF) {
+ if (nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) {
+ nla_nest_cancel(skb, br_afspec);
+ goto nla_put_failure;
+ }
+ }
+ if (vlan_fill) {
+ err = vlan_fill(skb, dev, filter_mask);
+ if (err) {
+ nla_nest_cancel(skb, br_afspec);
+ goto nla_put_failure;
+ }
+ }
+ nla_nest_end(skb, br_afspec);
+
+ protinfo = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED);
+ if (!protinfo)
+ goto nla_put_failure;
+
+ if (brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_MODE, BR_HAIRPIN_MODE) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_GUARD, BR_BPDU_GUARD) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_FAST_LEAVE,
+ BR_MULTICAST_FAST_LEAVE) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_LEARNING, BR_LEARNING) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_PROXYARP, BR_PROXYARP)) {
+ nla_nest_cancel(skb, protinfo);
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, protinfo);
+
+ nlmsg_end(skb, nlh);
+ return 0;
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return err ? err : -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink);
+
+static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ int idx = 0;
+ u32 portid = NETLINK_CB(cb->skb).portid;
+ u32 seq = cb->nlh->nlmsg_seq;
+ u32 filter_mask = 0;
+ int err;
+
+ if (nlmsg_len(cb->nlh) > sizeof(struct ifinfomsg)) {
+ struct nlattr *extfilt;
+
+ extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg),
+ IFLA_EXT_MASK);
+ if (extfilt) {
+ if (nla_len(extfilt) < sizeof(filter_mask))
+ return -EINVAL;
+
+ filter_mask = nla_get_u32(extfilt);
+ }
+ }
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+
+ if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) {
+ if (idx >= cb->args[0]) {
+ err = br_dev->netdev_ops->ndo_bridge_getlink(
+ skb, portid, seq, dev,
+ filter_mask, NLM_F_MULTI);
+ if (err < 0 && err != -EOPNOTSUPP) {
+ if (likely(skb->len))
+ break;
+
+ goto out_err;
+ }
+ }
+ idx++;
+ }
+
+ if (ops->ndo_bridge_getlink) {
+ if (idx >= cb->args[0]) {
+ err = ops->ndo_bridge_getlink(skb, portid,
+ seq, dev,
+ filter_mask,
+ NLM_F_MULTI);
+ if (err < 0 && err != -EOPNOTSUPP) {
+ if (likely(skb->len))
+ break;
+
+ goto out_err;
+ }
+ }
+ idx++;
+ }
+ }
+ err = skb->len;
+out_err:
+ rcu_read_unlock();
+ cb->args[0] = idx;
+
+ return err;
+}
+
+static inline size_t bridge_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+ + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+ + nla_total_size(sizeof(u32)) /* IFLA_MASTER */
+ + nla_total_size(sizeof(u32)) /* IFLA_MTU */
+ + nla_total_size(sizeof(u32)) /* IFLA_LINK */
+ + nla_total_size(sizeof(u32)) /* IFLA_OPERSTATE */
+ + nla_total_size(sizeof(u8)) /* IFLA_PROTINFO */
+ + nla_total_size(sizeof(struct nlattr)) /* IFLA_AF_SPEC */
+ + nla_total_size(sizeof(u16)) /* IFLA_BRIDGE_FLAGS */
+ + nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_MODE */
+}
+
+static int rtnl_bridge_notify(struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -EOPNOTSUPP;
+
+ if (!dev->netdev_ops->ndo_bridge_getlink)
+ return 0;
+
+ skb = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC);
+ if (!skb) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0, 0);
+ if (err < 0)
+ goto errout;
+
+ /* Notification info is only filled for bridge ports, not the bridge
+ * device itself. Therefore, a zero notification length is valid and
+ * should not result in an error.
+ */
+ if (!skb->len)
+ goto errout;
+
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
+ return 0;
+errout:
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ if (err)
+ rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+ return err;
+}
+
+static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ifinfomsg *ifm;
+ struct net_device *dev;
+ struct nlattr *br_spec, *attr = NULL;
+ int rem, err = -EOPNOTSUPP;
+ u16 flags = 0;
+ bool have_flags = false;
+
+ if (nlmsg_len(nlh) < sizeof(*ifm))
+ return -EINVAL;
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_family != AF_BRIDGE)
+ return -EPFNOSUPPORT;
+
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "unknown ifindex");
+ return -ENODEV;
+ }
+
+ br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+ if (br_spec) {
+ nla_for_each_nested(attr, br_spec, rem) {
+ if (nla_type(attr) == IFLA_BRIDGE_FLAGS) {
+ if (nla_len(attr) < sizeof(flags))
+ return -EINVAL;
+
+ have_flags = true;
+ flags = nla_get_u16(attr);
+ break;
+ }
+ }
+ }
+
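+	/* With no flags supplied, the request is applied to the bridge
+	 * master by default.
+	 */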
+ if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+
+ if (!br_dev || !br_dev->netdev_ops->ndo_bridge_setlink) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags);
+ if (err)
+ goto out;
+
+ flags &= ~BRIDGE_FLAGS_MASTER;
+ }
+
+ if ((flags & BRIDGE_FLAGS_SELF)) {
+ if (!dev->netdev_ops->ndo_bridge_setlink)
+ err = -EOPNOTSUPP;
+ else
+ err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh,
+ flags);
+ if (!err) {
+ flags &= ~BRIDGE_FLAGS_SELF;
+
+ /* Generate event to notify upper layer of bridge
+ * change
+ */
+ err = rtnl_bridge_notify(dev);
+ }
+ }
+
+ if (have_flags)
+ memcpy(nla_data(attr), &flags, sizeof(flags));
+out:
+ return err;
+}
+
+static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ifinfomsg *ifm;
+ struct net_device *dev;
+ struct nlattr *br_spec, *attr = NULL;
+ int rem, err = -EOPNOTSUPP;
+ u16 flags = 0;
+ bool have_flags = false;
+
+ if (nlmsg_len(nlh) < sizeof(*ifm))
+ return -EINVAL;
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_family != AF_BRIDGE)
+ return -EPFNOSUPPORT;
+
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "unknown ifindex");
+ return -ENODEV;
+ }
+
+ br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+ if (br_spec) {
+ nla_for_each_nested(attr, br_spec, rem) {
+ if (nla_type(attr) == IFLA_BRIDGE_FLAGS) {
+ if (nla_len(attr) < sizeof(flags))
+ return -EINVAL;
+
+ have_flags = true;
+ flags = nla_get_u16(attr);
+ break;
+ }
+ }
+ }
+
+ if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+
+ if (!br_dev || !br_dev->netdev_ops->ndo_bridge_dellink) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags);
+ if (err)
+ goto out;
+
+ flags &= ~BRIDGE_FLAGS_MASTER;
+ }
+
+ if ((flags & BRIDGE_FLAGS_SELF)) {
+ if (!dev->netdev_ops->ndo_bridge_dellink)
+ err = -EOPNOTSUPP;
+ else
+ err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh,
+ flags);
+
+ if (!err) {
+ flags &= ~BRIDGE_FLAGS_SELF;
+
+ /* Generate event to notify upper layer of bridge
+ * change
+ */
+ err = rtnl_bridge_notify(dev);
+ }
+ }
+
+ if (have_flags)
+ memcpy(nla_data(attr), &flags, sizeof(flags));
+out:
+ return err;
+}
+
+static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr)
+{
+ return (mask & IFLA_STATS_FILTER_BIT(attrid)) &&
+ (!idxattr || idxattr == attrid);
+}
+
+#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1)
+static int rtnl_get_offload_stats_attr_size(int attr_id)
+{
+ switch (attr_id) {
+ case IFLA_OFFLOAD_XSTATS_CPU_HIT:
+ return sizeof(struct rtnl_link_stats64);
+ }
+
+ return 0;
+}
+
+static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev,
+ int *prividx)
+{
+ struct nlattr *attr = NULL;
+ int attr_id, size;
+ void *attr_data;
+ int err;
+
+ if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
+ dev->netdev_ops->ndo_get_offload_stats))
+ return -ENODATA;
+
+ for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
+ attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
+ if (attr_id < *prividx)
+ continue;
+
+ size = rtnl_get_offload_stats_attr_size(attr_id);
+ if (!size)
+ continue;
+
+ if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
+ continue;
+
+ attr = nla_reserve_64bit(skb, attr_id, size,
+ IFLA_OFFLOAD_XSTATS_UNSPEC);
+ if (!attr)
+ goto nla_put_failure;
+
+ attr_data = nla_data(attr);
+ memset(attr_data, 0, size);
+ err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev,
+ attr_data);
+ if (err)
+ goto get_offload_stats_failure;
+ }
+
+ if (!attr)
+ return -ENODATA;
+
+ *prividx = 0;
+ return 0;
+
+nla_put_failure:
+ err = -EMSGSIZE;
+get_offload_stats_failure:
+ *prividx = attr_id;
+ return err;
+}
+
+static int rtnl_get_offload_stats_size(const struct net_device *dev)
+{
+ int nla_size = 0;
+ int attr_id;
+ int size;
+
+ if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
+ dev->netdev_ops->ndo_get_offload_stats))
+ return 0;
+
+ for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
+ attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
+ if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
+ continue;
+ size = rtnl_get_offload_stats_attr_size(attr_id);
+ nla_size += nla_total_size_64bit(size);
+ }
+
+ if (nla_size != 0)
+ nla_size += nla_total_size(0);
+
+ return nla_size;
+}
+
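+/* Fill one RTM_NEWSTATS message for @dev.  @idxattr and @prividx carry the
+ * position of a partially filled attribute, so a dump that ran out of room
+ * can resume from the same place on the next pass.
+ */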
+static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
+ int type, u32 pid, u32 seq, u32 change,
+ unsigned int flags, unsigned int filter_mask,
+ int *idxattr, int *prividx)
+{
+ struct if_stats_msg *ifsm;
+ struct nlmsghdr *nlh;
+ struct nlattr *attr;
+ int s_prividx = *prividx;
+ int err;
+
+ ASSERT_RTNL();
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifsm), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ifsm = nlmsg_data(nlh);
+ ifsm->family = PF_UNSPEC;
+ ifsm->pad1 = 0;
+ ifsm->pad2 = 0;
+ ifsm->ifindex = dev->ifindex;
+ ifsm->filter_mask = filter_mask;
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, *idxattr)) {
+ struct rtnl_link_stats64 *sp;
+
+ attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64,
+ sizeof(struct rtnl_link_stats64),
+ IFLA_STATS_UNSPEC);
+ if (!attr)
+ goto nla_put_failure;
+
+ sp = nla_data(attr);
+ dev_get_stats(dev, sp);
+ }
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, *idxattr)) {
+ const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+
+ if (ops && ops->fill_linkxstats) {
+ *idxattr = IFLA_STATS_LINK_XSTATS;
+ attr = nla_nest_start(skb,
+ IFLA_STATS_LINK_XSTATS);
+ if (!attr)
+ goto nla_put_failure;
+
+ err = ops->fill_linkxstats(skb, dev, prividx, *idxattr);
+ nla_nest_end(skb, attr);
+ if (err)
+ goto nla_put_failure;
+ *idxattr = 0;
+ }
+ }
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE,
+ *idxattr)) {
+ const struct rtnl_link_ops *ops = NULL;
+ const struct net_device *master;
+
+ master = netdev_master_upper_dev_get(dev);
+ if (master)
+ ops = master->rtnl_link_ops;
+ if (ops && ops->fill_linkxstats) {
+ *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE;
+ attr = nla_nest_start(skb,
+ IFLA_STATS_LINK_XSTATS_SLAVE);
+ if (!attr)
+ goto nla_put_failure;
+
+ err = ops->fill_linkxstats(skb, dev, prividx, *idxattr);
+ nla_nest_end(skb, attr);
+ if (err)
+ goto nla_put_failure;
+ *idxattr = 0;
+ }
+ }
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS,
+ *idxattr)) {
+ *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS;
+ attr = nla_nest_start(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS);
+ if (!attr)
+ goto nla_put_failure;
+
+ err = rtnl_get_offload_stats(skb, dev, prividx);
+ if (err == -ENODATA)
+ nla_nest_cancel(skb, attr);
+ else
+ nla_nest_end(skb, attr);
+
+ if (err && err != -ENODATA)
+ goto nla_put_failure;
+ *idxattr = 0;
+ }
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) {
+ struct rtnl_af_ops *af_ops;
+
+ *idxattr = IFLA_STATS_AF_SPEC;
+ attr = nla_nest_start(skb, IFLA_STATS_AF_SPEC);
+ if (!attr)
+ goto nla_put_failure;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) {
+ if (af_ops->fill_stats_af) {
+ struct nlattr *af;
+ int err;
+
+ af = nla_nest_start(skb, af_ops->family);
+ if (!af) {
+ rcu_read_unlock();
+ goto nla_put_failure;
+ }
+ err = af_ops->fill_stats_af(skb, dev);
+
+ if (err == -ENODATA) {
+ nla_nest_cancel(skb, af);
+ } else if (err < 0) {
+ rcu_read_unlock();
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, af);
+ }
+ }
+ rcu_read_unlock();
+
+ nla_nest_end(skb, attr);
+
+ *idxattr = 0;
+ }
+
+ nlmsg_end(skb, nlh);
+
+ return 0;
+
+nla_put_failure:
+	/* not a multi message or no progress means a real error */
+ if (!(flags & NLM_F_MULTI) || s_prividx == *prividx)
+ nlmsg_cancel(skb, nlh);
+ else
+ nlmsg_end(skb, nlh);
+
+ return -EMSGSIZE;
+}
+
+static size_t if_nlmsg_stats_size(const struct net_device *dev,
+ u32 filter_mask)
+{
+ size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg));
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0))
+ size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64));
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) {
+ const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+ int attr = IFLA_STATS_LINK_XSTATS;
+
+ if (ops && ops->get_linkxstats_size) {
+ size += nla_total_size(ops->get_linkxstats_size(dev,
+ attr));
+ /* for IFLA_STATS_LINK_XSTATS */
+ size += nla_total_size(0);
+ }
+ }
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, 0)) {
+ struct net_device *_dev = (struct net_device *)dev;
+ const struct rtnl_link_ops *ops = NULL;
+ const struct net_device *master;
+
+ /* netdev_master_upper_dev_get can't take const */
+ master = netdev_master_upper_dev_get(_dev);
+ if (master)
+ ops = master->rtnl_link_ops;
+ if (ops && ops->get_linkxstats_size) {
+ int attr = IFLA_STATS_LINK_XSTATS_SLAVE;
+
+ size += nla_total_size(ops->get_linkxstats_size(dev,
+ attr));
+ /* for IFLA_STATS_LINK_XSTATS_SLAVE */
+ size += nla_total_size(0);
+ }
+ }
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
+ size += rtnl_get_offload_stats_size(dev);
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) {
+ struct rtnl_af_ops *af_ops;
+
+ /* for IFLA_STATS_AF_SPEC */
+ size += nla_total_size(0);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) {
+ if (af_ops->get_stats_af_size) {
+ size += nla_total_size(
+ af_ops->get_stats_af_size(dev));
+
+ /* for AF_* */
+ size += nla_total_size(0);
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ return size;
+}
+
+static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev = NULL;
+ int idxattr = 0, prividx = 0;
+ struct if_stats_msg *ifsm;
+ struct sk_buff *nskb;
+ u32 filter_mask;
+ int err;
+
+ if (nlmsg_len(nlh) < sizeof(*ifsm))
+ return -EINVAL;
+
+ ifsm = nlmsg_data(nlh);
+ if (ifsm->ifindex > 0)
+ dev = __dev_get_by_index(net, ifsm->ifindex);
+ else
+ return -EINVAL;
+
+ if (!dev)
+ return -ENODEV;
+
+ filter_mask = ifsm->filter_mask;
+ if (!filter_mask)
+ return -EINVAL;
+
+ nskb = nlmsg_new(if_nlmsg_stats_size(dev, filter_mask), GFP_KERNEL);
+ if (!nskb)
+ return -ENOBUFS;
+
+ err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS,
+ NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ 0, filter_mask, &idxattr, &prividx);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(nskb);
+ } else {
+ err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid);
+ }
+
+ return err;
+}
+
+static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int h, s_h, err, s_idx, s_idxattr, s_prividx;
+ struct net *net = sock_net(skb->sk);
+ unsigned int flags = NLM_F_MULTI;
+ struct if_stats_msg *ifsm;
+ struct hlist_head *head;
+ struct net_device *dev;
+ u32 filter_mask = 0;
+ int idx = 0;
+
+ s_h = cb->args[0];
+ s_idx = cb->args[1];
+ s_idxattr = cb->args[2];
+ s_prividx = cb->args[3];
+
+ cb->seq = net->dev_base_seq;
+
+ if (nlmsg_len(cb->nlh) < sizeof(*ifsm))
+ return -EINVAL;
+
+ ifsm = nlmsg_data(cb->nlh);
+ filter_mask = ifsm->filter_mask;
+ if (!filter_mask)
+ return -EINVAL;
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ hlist_for_each_entry(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, 0,
+ flags, filter_mask,
+ &s_idxattr, &s_prividx);
+ /* If we ran out of room on the first message,
+ * we're in trouble
+ */
+ WARN_ON((err == -EMSGSIZE) && (skb->len == 0));
+
+ if (err < 0)
+ goto out;
+ s_prividx = 0;
+ s_idxattr = 0;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ }
+out:
+ cb->args[3] = s_prividx;
+ cb->args[2] = s_idxattr;
+ cb->args[1] = idx;
+ cb->args[0] = h;
+
+ return skb->len;
+}
+
+/* Process one rtnetlink message. */
+
+static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct rtnl_link *link;
+ struct module *owner;
+ int err = -EOPNOTSUPP;
+ rtnl_doit_func doit;
+ unsigned int flags;
+ int kind;
+ int family;
+ int type;
+
+ type = nlh->nlmsg_type;
+ if (type > RTM_MAX)
+ return -EOPNOTSUPP;
+
+ type -= RTM_BASE;
+
+ /* All the messages must have at least 1 byte length */
+ if (nlmsg_len(nlh) < sizeof(struct rtgenmsg))
+ return 0;
+
+ family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
+ kind = type&3;
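+	/* The low two bits encode the request kind: 0 = NEW, 1 = DEL,
+	 * 2 = GET, 3 = SET.  Only GET requests are allowed without
+	 * CAP_NET_ADMIN.
+	 */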
+
+ if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ rcu_read_lock();
+ if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
+ struct sock *rtnl;
+ rtnl_dumpit_func dumpit;
+ u16 min_dump_alloc = 0;
+
+ link = rtnl_get_link(family, type);
+ if (!link || !link->dumpit) {
+ family = PF_UNSPEC;
+ link = rtnl_get_link(family, type);
+ if (!link || !link->dumpit)
+ goto err_unlock;
+ }
+ owner = link->owner;
+ dumpit = link->dumpit;
+
+ if (type == RTM_GETLINK - RTM_BASE)
+ min_dump_alloc = rtnl_calcit(skb, nlh);
+
+ err = 0;
+ /* need to do this before rcu_read_unlock() */
+ if (!try_module_get(owner))
+ err = -EPROTONOSUPPORT;
+
+ rcu_read_unlock();
+
+ rtnl = net->rtnl;
+ if (err == 0) {
+ struct netlink_dump_control c = {
+ .dump = dumpit,
+ .min_dump_alloc = min_dump_alloc,
+ .module = owner,
+ };
+ err = netlink_dump_start(rtnl, skb, nlh, &c);
+ /* netlink_dump_start() will keep a reference on
+ * module if dump is still in progress.
+ */
+ module_put(owner);
+ }
+ return err;
+ }
+
+ link = rtnl_get_link(family, type);
+ if (!link || !link->doit) {
+ family = PF_UNSPEC;
+ link = rtnl_get_link(PF_UNSPEC, type);
+ if (!link || !link->doit)
+ goto out_unlock;
+ }
+
+ owner = link->owner;
+ if (!try_module_get(owner)) {
+ err = -EPROTONOSUPPORT;
+ goto out_unlock;
+ }
+
+ flags = link->flags;
+ if (flags & RTNL_FLAG_DOIT_UNLOCKED) {
+ doit = link->doit;
+ rcu_read_unlock();
+ if (doit)
+ err = doit(skb, nlh, extack);
+ module_put(owner);
+ return err;
+ }
+ rcu_read_unlock();
+
+ rtnl_lock();
+ link = rtnl_get_link(family, type);
+ if (link && link->doit)
+ err = link->doit(skb, nlh, extack);
+ rtnl_unlock();
+
+ module_put(owner);
+
+ return err;
+
+out_unlock:
+ rcu_read_unlock();
+ return err;
+
+err_unlock:
+ rcu_read_unlock();
+ return -EOPNOTSUPP;
+}
+
+static void rtnetlink_rcv(struct sk_buff *skb)
+{
+ netlink_rcv_skb(skb, &rtnetlink_rcv_msg);
+}
+
+static int rtnetlink_bind(struct net *net, int group)
+{
+ switch (group) {
+ case RTNLGRP_IPV4_MROUTE_R:
+ case RTNLGRP_IPV6_MROUTE_R:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ break;
+ }
+ return 0;
+}
+
+static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ switch (event) {
+ case NETDEV_REBOOT:
+ case NETDEV_CHANGEMTU:
+ case NETDEV_CHANGEADDR:
+ case NETDEV_CHANGENAME:
+ case NETDEV_FEAT_CHANGE:
+ case NETDEV_BONDING_FAILOVER:
+ case NETDEV_POST_TYPE_CHANGE:
+ case NETDEV_NOTIFY_PEERS:
+ case NETDEV_CHANGEUPPER:
+ case NETDEV_RESEND_IGMP:
+ case NETDEV_CHANGEINFODATA:
+ case NETDEV_CHANGELOWERSTATE:
+ case NETDEV_CHANGE_TX_QUEUE_LEN:
+ rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event),
+ GFP_KERNEL, NULL, 0);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block rtnetlink_dev_notifier = {
+ .notifier_call = rtnetlink_event,
+};
+
+static int __net_init rtnetlink_net_init(struct net *net)
+{
+ struct sock *sk;
+ struct netlink_kernel_cfg cfg = {
+ .groups = RTNLGRP_MAX,
+ .input = rtnetlink_rcv,
+ .cb_mutex = &rtnl_mutex,
+ .flags = NL_CFG_F_NONROOT_RECV,
+ .bind = rtnetlink_bind,
+ };
+
+ sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg);
+ if (!sk)
+ return -ENOMEM;
+ net->rtnl = sk;
+ return 0;
+}
+
+static void __net_exit rtnetlink_net_exit(struct net *net)
+{
+ netlink_kernel_release(net->rtnl);
+ net->rtnl = NULL;
+}
+
+static struct pernet_operations rtnetlink_net_ops = {
+ .init = rtnetlink_net_init,
+ .exit = rtnetlink_net_exit,
+};
+
+void __init rtnetlink_init(void)
+{
+ if (register_pernet_subsys(&rtnetlink_net_ops))
+ panic("rtnetlink_init: cannot initialize rtnetlink\n");
+
+ register_netdevice_notifier(&rtnetlink_dev_notifier);
+
+ rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink,
+ rtnl_dump_ifinfo, 0);
+ rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0);
+
+ rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, 0);
+ rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0);
+ rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0);
+
+ rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0);
+ rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0);
+ rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, 0);
+
+ rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0);
+ rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0);
+ rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, 0);
+
+ rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump,
+ 0);
+}
diff --git a/net/core/scm.c b/net/core/scm.c
new file mode 100644
index 000000000..b1ff8a441
--- /dev/null
+++ b/net/core/scm.c
@@ -0,0 +1,350 @@
+/* scm.c - Socket level control messages processing.
+ *
+ * Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Alignment and value checking mods by Craig Metz
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/signal.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/sched/user.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/file.h>
+#include <linux/fcntl.h>
+#include <linux/net.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/security.h>
+#include <linux/pid_namespace.h>
+#include <linux/pid.h>
+#include <linux/nsproxy.h>
+#include <linux/slab.h>
+
+#include <linux/uaccess.h>
+
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/compat.h>
+#include <net/scm.h>
+#include <net/cls_cgroup.h>
+
+
+/*
+ * Only allow a user to send credentials that they could set with
+ * setu(g)id.
+ */
+
+static __inline__ int scm_check_creds(struct ucred *creds)
+{
+ const struct cred *cred = current_cred();
+ kuid_t uid = make_kuid(cred->user_ns, creds->uid);
+ kgid_t gid = make_kgid(cred->user_ns, creds->gid);
+
+ if (!uid_valid(uid) || !gid_valid(gid))
+ return -EINVAL;
+
+ if ((creds->pid == task_tgid_vnr(current) ||
+ ns_capable(task_active_pid_ns(current)->user_ns, CAP_SYS_ADMIN)) &&
+ ((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) ||
+ uid_eq(uid, cred->suid)) || ns_capable(cred->user_ns, CAP_SETUID)) &&
+ ((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) ||
+ gid_eq(gid, cred->sgid)) || ns_capable(cred->user_ns, CAP_SETGID))) {
+ return 0;
+ }
+ return -EPERM;
+}
+
+static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
+{
+ int *fdp = (int*)CMSG_DATA(cmsg);
+ struct scm_fp_list *fpl = *fplp;
+ struct file **fpp;
+ int i, num;
+
+ num = (cmsg->cmsg_len - sizeof(struct cmsghdr))/sizeof(int);
+
+ if (num <= 0)
+ return 0;
+
+ if (num > SCM_MAX_FD)
+ return -EINVAL;
+
+ if (!fpl) {
+ fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
+ if (!fpl)
+ return -ENOMEM;
+ *fplp = fpl;
+ fpl->count = 0;
+ fpl->max = SCM_MAX_FD;
+ fpl->user = NULL;
+ }
+ fpp = &fpl->fp[fpl->count];
+
+ if (fpl->count + num > fpl->max)
+ return -EINVAL;
+
+ /*
+ * Verify the descriptors and increment the usage count.
+ */
+
+ for (i = 0; i < num; i++) {
+ int fd = fdp[i];
+ struct file *file;
+
+ if (fd < 0 || !(file = fget_raw(fd)))
+ return -EBADF;
+ *fpp++ = file;
+ fpl->count++;
+ }
+
+ if (!fpl->user)
+ fpl->user = get_uid(current_user());
+
+ return num;
+}
+
+void __scm_destroy(struct scm_cookie *scm)
+{
+ struct scm_fp_list *fpl = scm->fp;
+ int i;
+
+ if (fpl) {
+ scm->fp = NULL;
+ for (i = fpl->count - 1; i >= 0; i--)
+ fput(fpl->fp[i]);
+ free_uid(fpl->user);
+ kfree(fpl);
+ }
+}
+EXPORT_SYMBOL(__scm_destroy);
+
+int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
+{
+ struct cmsghdr *cmsg;
+ int err;
+
+ for_each_cmsghdr(cmsg, msg) {
+ err = -EINVAL;
+
+ /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */
+ /* The first check was omitted in <= 2.2.5. The reasoning was
+ that the parser checks cmsg_len in any case, so the
+ additional check would be duplicated work.
+ But if cmsg_level is not SOL_SOCKET, we would not check
+ for a too-short ancillary data object at all! Oops.
+ OK, let's add it...
+ */
+ if (!CMSG_OK(msg, cmsg))
+ goto error;
+
+ if (cmsg->cmsg_level != SOL_SOCKET)
+ continue;
+
+ switch (cmsg->cmsg_type) {
+ case SCM_RIGHTS:
+ if (!sock->ops || sock->ops->family != PF_UNIX)
+ goto error;
+ err = scm_fp_copy(cmsg, &p->fp);
+ if (err < 0)
+ goto error;
+ break;
+ case SCM_CREDENTIALS:
+ {
+ struct ucred creds;
+ kuid_t uid;
+ kgid_t gid;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred)))
+ goto error;
+ memcpy(&creds, CMSG_DATA(cmsg), sizeof(struct ucred));
+ err = scm_check_creds(&creds);
+ if (err)
+ goto error;
+
+ p->creds.pid = creds.pid;
+ if (!p->pid || pid_vnr(p->pid) != creds.pid) {
+ struct pid *pid;
+ err = -ESRCH;
+ pid = find_get_pid(creds.pid);
+ if (!pid)
+ goto error;
+ put_pid(p->pid);
+ p->pid = pid;
+ }
+
+ err = -EINVAL;
+ uid = make_kuid(current_user_ns(), creds.uid);
+ gid = make_kgid(current_user_ns(), creds.gid);
+ if (!uid_valid(uid) || !gid_valid(gid))
+ goto error;
+
+ p->creds.uid = uid;
+ p->creds.gid = gid;
+ break;
+ }
+ default:
+ goto error;
+ }
+ }
+
+ if (p->fp && !p->fp->count) {
+ kfree(p->fp);
+ p->fp = NULL;
+ }
+ return 0;
+
+error:
+ scm_destroy(p);
+ return err;
+}
+EXPORT_SYMBOL(__scm_send);
+
+int put_cmsg(struct msghdr *msg, int level, int type, int len, void *data)
+{
+ struct cmsghdr __user *cm
+ = (__force struct cmsghdr __user *)msg->msg_control;
+ struct cmsghdr cmhdr;
+ int cmlen = CMSG_LEN(len);
+ int err;
+
+ if (MSG_CMSG_COMPAT & msg->msg_flags)
+ return put_cmsg_compat(msg, level, type, len, data);
+
+ if (cm == NULL || msg->msg_controllen < sizeof(*cm)) {
+ msg->msg_flags |= MSG_CTRUNC;
+ return 0; /* XXX: return error? check spec. */
+ }
+ if (msg->msg_controllen < cmlen) {
+ msg->msg_flags |= MSG_CTRUNC;
+ cmlen = msg->msg_controllen;
+ }
+ cmhdr.cmsg_level = level;
+ cmhdr.cmsg_type = type;
+ cmhdr.cmsg_len = cmlen;
+
+ err = -EFAULT;
+ if (copy_to_user(cm, &cmhdr, sizeof(cmhdr)))
+ goto out;
+ if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr)))
+ goto out;
+ cmlen = CMSG_SPACE(len);
+ if (msg->msg_controllen < cmlen)
+ cmlen = msg->msg_controllen;
+ msg->msg_control += cmlen;
+ msg->msg_controllen -= cmlen;
+ err = 0;
+out:
+ return err;
+}
+EXPORT_SYMBOL(put_cmsg);
+
+void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
+{
+ struct cmsghdr __user *cm
+ = (__force struct cmsghdr __user*)msg->msg_control;
+
+ int fdmax = 0;
+ int fdnum = scm->fp->count;
+ struct file **fp = scm->fp->fp;
+ int __user *cmfptr;
+ int err = 0, i;
+
+ if (MSG_CMSG_COMPAT & msg->msg_flags) {
+ scm_detach_fds_compat(msg, scm);
+ return;
+ }
+
+ if (msg->msg_controllen > sizeof(struct cmsghdr))
+ fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr))
+ / sizeof(int));
+
+ if (fdnum < fdmax)
+ fdmax = fdnum;
+
+ for (i = 0, cmfptr = (__force int __user *)CMSG_DATA(cm); i < fdmax;
+      i++, cmfptr++) {
+ struct socket *sock;
+ int new_fd;
+ err = security_file_receive(fp[i]);
+ if (err)
+ break;
+ err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags
+ ? O_CLOEXEC : 0);
+ if (err < 0)
+ break;
+ new_fd = err;
+ err = put_user(new_fd, cmfptr);
+ if (err) {
+ put_unused_fd(new_fd);
+ break;
+ }
+ /* Bump the usage count and install the file. */
+ sock = sock_from_file(fp[i], &err);
+ if (sock) {
+ sock_update_netprioidx(&sock->sk->sk_cgrp_data);
+ sock_update_classid(&sock->sk->sk_cgrp_data);
+ }
+ fd_install(new_fd, get_file(fp[i]));
+ }
+
+ if (i > 0) {
+ int cmlen = CMSG_LEN(i * sizeof(int));
+ err = put_user(SOL_SOCKET, &cm->cmsg_level);
+ if (!err)
+ err = put_user(SCM_RIGHTS, &cm->cmsg_type);
+ if (!err)
+ err = put_user(cmlen, &cm->cmsg_len);
+ if (!err) {
+ cmlen = CMSG_SPACE(i * sizeof(int));
+ if (msg->msg_controllen < cmlen)
+ cmlen = msg->msg_controllen;
+ msg->msg_control += cmlen;
+ msg->msg_controllen -= cmlen;
+ }
+ }
+ if (i < fdnum || (fdnum && fdmax <= 0))
+ msg->msg_flags |= MSG_CTRUNC;
+
+ /*
+ * All of the files that fit in the message have had their
+ * usage counts incremented, so we just free the list.
+ */
+ __scm_destroy(scm);
+}
+EXPORT_SYMBOL(scm_detach_fds);
+
+struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
+{
+ struct scm_fp_list *new_fpl;
+ int i;
+
+ if (!fpl)
+ return NULL;
+
+ new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
+ GFP_KERNEL);
+ if (new_fpl) {
+ for (i = 0; i < fpl->count; i++)
+ get_file(fpl->fp[i]);
+ new_fpl->max = new_fpl->count;
+ new_fpl->user = get_uid(fpl->user);
+ }
+ return new_fpl;
+}
+EXPORT_SYMBOL(scm_fp_dup);
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
new file mode 100644
index 000000000..6fd25279b
--- /dev/null
+++ b/net/core/secure_seq.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/cryptohash.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+#include <linux/random.h>
+#include <linux/hrtimer.h>
+#include <linux/ktime.h>
+#include <linux/string.h>
+#include <linux/net.h>
+#include <linux/siphash.h>
+#include <net/secure_seq.h>
+
+#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
+#include <linux/in6.h>
+#include <net/tcp.h>
+
+static siphash_key_t net_secret __read_mostly;
+static siphash_key_t ts_secret __read_mostly;
+
+#define EPHEMERAL_PORT_SHUFFLE_PERIOD (10 * HZ)
+
+static __always_inline void net_secret_init(void)
+{
+ net_get_random_once(&net_secret, sizeof(net_secret));
+}
+
+static __always_inline void ts_secret_init(void)
+{
+ net_get_random_once(&ts_secret, sizeof(ts_secret));
+}
+#endif
+
+#ifdef CONFIG_INET
+static u32 seq_scale(u32 seq)
+{
+ /*
+ * As close as possible to RFC 793, which
+ * suggests using a 250 kHz clock.
+ * Further reading shows this assumes 2 Mb/s networks.
+ * For 10 Mb/s Ethernet, a 1 MHz clock is appropriate.
+ * For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but
+ * we also need to limit the resolution so that the u32 seq
+ * overlaps less than one time per MSL (2 minutes).
+ * Choosing a clock of 64 ns period is OK. (period of 274 s)
+ */
+ return seq + (ktime_get_real_ns() >> 6);
+}
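+/* Worked out: 2^32 * 64 ns ~= 274.9 s, which is the "period of 274 s"
+ * mentioned above; it comfortably exceeds the 2 minute MSL, so the u32
+ * sequence offset cannot wrap within one MSL.
+ */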
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+u32 secure_tcpv6_ts_off(const struct net *net,
+ const __be32 *saddr, const __be32 *daddr)
+{
+ const struct {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
+ .saddr = *(struct in6_addr *)saddr,
+ .daddr = *(struct in6_addr *)daddr,
+ };
+
+ if (net->ipv4.sysctl_tcp_timestamps != 1)
+ return 0;
+
+ ts_secret_init();
+ return siphash(&combined, offsetofend(typeof(combined), daddr),
+ &ts_secret);
+}
+EXPORT_SYMBOL(secure_tcpv6_ts_off);
+
+u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
+ __be16 sport, __be16 dport)
+{
+ const struct {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ __be16 sport;
+ __be16 dport;
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
+ .saddr = *(struct in6_addr *)saddr,
+ .daddr = *(struct in6_addr *)daddr,
+ .sport = sport,
+ .dport = dport
+ };
+ u32 hash;
+
+ net_secret_init();
+ hash = siphash(&combined, offsetofend(typeof(combined), dport),
+ &net_secret);
+ return seq_scale(hash);
+}
+EXPORT_SYMBOL(secure_tcpv6_seq);
+
+u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
+ __be16 dport)
+{
+ const struct {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ unsigned int timeseed;
+ __be16 dport;
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
+ .saddr = *(struct in6_addr *)saddr,
+ .daddr = *(struct in6_addr *)daddr,
+ .timeseed = jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD,
+ .dport = dport,
+ };
+ net_secret_init();
+ return siphash(&combined, offsetofend(typeof(combined), dport),
+ &net_secret);
+}
+EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
+#endif
+
+#ifdef CONFIG_INET
+u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr)
+{
+ if (net->ipv4.sysctl_tcp_timestamps != 1)
+ return 0;
+
+ ts_secret_init();
+ return siphash_2u32((__force u32)saddr, (__force u32)daddr,
+ &ts_secret);
+}
+
+/* secure_tcp_seq_and_tsoff(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d),
+ * but fortunately, `sport' cannot be 0 in any circumstances. If this changes,
+ * it would be easy enough to have the former function use siphash_4u32, passing
+ * the arguments as separate u32.
+ */
+u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport)
+{
+ u32 hash;
+
+ net_secret_init();
+ hash = siphash_3u32((__force u32)saddr, (__force u32)daddr,
+ (__force u32)sport << 16 | (__force u32)dport,
+ &net_secret);
+ return seq_scale(hash);
+}
+EXPORT_SYMBOL_GPL(secure_tcp_seq);
+
+u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
+{
+ net_secret_init();
+ return siphash_4u32((__force u32)saddr, (__force u32)daddr,
+ (__force u16)dport,
+ jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD,
+ &net_secret);
+}
+EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
+#endif
+
+#if IS_ENABLED(CONFIG_IP_DCCP)
+u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport)
+{
+ u64 seq;
+ net_secret_init();
+ seq = siphash_3u32((__force u32)saddr, (__force u32)daddr,
+ (__force u32)sport << 16 | (__force u32)dport,
+ &net_secret);
+ seq += ktime_get_real_ns();
+ seq &= (1ull << 48) - 1;
+ return seq;
+}
+EXPORT_SYMBOL(secure_dccp_sequence_number);
+
+#if IS_ENABLED(CONFIG_IPV6)
+u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
+ __be16 sport, __be16 dport)
+{
+ const struct {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ __be16 sport;
+ __be16 dport;
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
+ .saddr = *(struct in6_addr *)saddr,
+ .daddr = *(struct in6_addr *)daddr,
+ .sport = sport,
+ .dport = dport
+ };
+ u64 seq;
+ net_secret_init();
+ seq = siphash(&combined, offsetofend(typeof(combined), dport),
+ &net_secret);
+ seq += ktime_get_real_ns();
+ seq &= (1ull << 48) - 1;
+ return seq;
+}
+EXPORT_SYMBOL(secure_dccpv6_sequence_number);
+#endif
+#endif
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
new file mode 100644
index 000000000..c623c129d
--- /dev/null
+++ b/net/core/skbuff.c
@@ -0,0 +1,5644 @@
+/*
+ * Routines having to do with the 'struct sk_buff' memory handlers.
+ *
+ * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>
+ * Florian La Roche <rzsfl@rz.uni-sb.de>
+ *
+ * Fixes:
+ * Alan Cox : Fixed the worst of the load
+ * balancer bugs.
+ * Dave Platt : Interrupt stacking fix.
+ * Richard Kooijman : Timestamp fixes.
+ * Alan Cox : Changed buffer format.
+ * Alan Cox : destructor hook for AF_UNIX etc.
+ * Linus Torvalds : Better skb_clone.
+ * Alan Cox : Added skb_copy.
+ * Alan Cox : Added all the changed routines Linus
+ * only put in the headers
+ * Ray VanTassle : Fixed --skb->lock in free
+ * Alan Cox : skb_copy copy arp field
+ * Andi Kleen : slabified it.
+ * Robert Olsson : Removed skb_head_pool
+ *
+ * NOTE:
+ * The __skb_ routines should be called with interrupts
+ * disabled, or you better be *real* sure that the operation is atomic
+ * with respect to whatever list is being frobbed (e.g. via lock_sock()
+ * or via disabling bottom half handlers, etc).
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * The functions in this file will not compile correctly with gcc 2.4.x
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/slab.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
+#include <linux/netdevice.h>
+#ifdef CONFIG_NET_CLS_ACT
+#include <net/pkt_sched.h>
+#endif
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/splice.h>
+#include <linux/cache.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/scatterlist.h>
+#include <linux/errqueue.h>
+#include <linux/prefetch.h>
+#include <linux/if_vlan.h>
+
+#include <net/protocol.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/checksum.h>
+#include <net/ip6_checksum.h>
+#include <net/xfrm.h>
+
+#include <linux/uaccess.h>
+#include <trace/events/skb.h>
+#include <linux/highmem.h>
+#include <linux/capability.h>
+#include <linux/user_namespace.h>
+
+struct kmem_cache *skbuff_head_cache __ro_after_init;
+static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
+int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
+EXPORT_SYMBOL(sysctl_max_skb_frags);
+
+/**
+ * skb_panic - private function for out-of-line support
+ * @skb: buffer
+ * @sz: size
+ * @addr: address
+ * @msg: skb_over_panic or skb_under_panic
+ *
+ * Out-of-line support for skb_put() and skb_push().
+ * Called via the wrapper skb_over_panic() or skb_under_panic().
+ * Keep out of line to prevent kernel bloat.
+ * __builtin_return_address is not used because it is not always reliable.
+ */
+static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
+ const char msg[])
+{
+ pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
+ msg, addr, skb->len, sz, skb->head, skb->data,
+ (unsigned long)skb->tail, (unsigned long)skb->end,
+ skb->dev ? skb->dev->name : "<NULL>");
+ BUG();
+}
+
+static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
+{
+ skb_panic(skb, sz, addr, __func__);
+}
+
+static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
+{
+ skb_panic(skb, sz, addr, __func__);
+}
+
+/*
+ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
+ * the caller if emergency pfmemalloc reserves are being used. If it is and
+ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
+ * may be used. Otherwise, the packet data may be discarded until enough
+ * memory is free
+ */
+#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
+ __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
+
+static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
+ unsigned long ip, bool *pfmemalloc)
+{
+ void *obj;
+ bool ret_pfmemalloc = false;
+
+ /*
+ * Try a regular allocation, when that fails and we're not entitled
+ * to the reserves, fail.
+ */
+ obj = kmalloc_node_track_caller(size,
+ flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ node);
+ if (obj || !(gfp_pfmemalloc_allowed(flags)))
+ goto out;
+
+ /* Try again but now we are using pfmemalloc reserves */
+ ret_pfmemalloc = true;
+ obj = kmalloc_node_track_caller(size, flags, node);
+
+out:
+ if (pfmemalloc)
+ *pfmemalloc = ret_pfmemalloc;
+
+ return obj;
+}
+
+/* Allocate a new skbuff. We do this ourselves so we can fill in a few
+ * 'private' fields and also do memory statistics to find all the
+ * [BEEP] leaks.
+ *
+ */
+
+/**
+ * __alloc_skb - allocate a network buffer
+ * @size: size to allocate
+ * @gfp_mask: allocation mask
+ * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
+ * instead of head cache and allocate a cloned (child) skb.
+ * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ * allocations in case the data is required for writeback
+ * @node: numa node to allocate memory on
+ *
+ * Allocate a new &sk_buff. The returned buffer has no headroom and a
+ * tail room of at least size bytes. The object has a reference count
+ * of one. The return is the buffer. On a failure the return is %NULL.
+ *
+ * Buffers may only be allocated from interrupts using a @gfp_mask of
+ * %GFP_ATOMIC.
+ */
+struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
+ int flags, int node)
+{
+ struct kmem_cache *cache;
+ struct skb_shared_info *shinfo;
+ struct sk_buff *skb;
+ u8 *data;
+ bool pfmemalloc;
+
+ cache = (flags & SKB_ALLOC_FCLONE)
+ ? skbuff_fclone_cache : skbuff_head_cache;
+
+ if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
+ gfp_mask |= __GFP_MEMALLOC;
+
+ /* Get the HEAD */
+ skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
+ if (!skb)
+ goto out;
+ prefetchw(skb);
+
+ /* We do our best to align skb_shared_info on a separate cache
+ * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
+ * aligned memory blocks, unless SLUB/SLAB debug is enabled.
+ * Both skb->head and skb_shared_info are cache line aligned.
+ */
+ size = SKB_DATA_ALIGN(size);
+ size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
+ if (!data)
+ goto nodata;
+ /* kmalloc(size) might give us more room than requested.
+ * Put skb_shared_info exactly at the end of allocated zone,
+ * to allow max possible filling before reallocation.
+ */
+ size = SKB_WITH_OVERHEAD(ksize(data));
+ prefetchw(data + size);
+
+ /*
+ * Only clear those fields we need to clear, not those that we will
+ * actually initialise below. Hence, don't put any more fields after
+ * the tail pointer in struct sk_buff!
+ */
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ /* Account for allocated memory : skb + skb->head */
+ skb->truesize = SKB_TRUESIZE(size);
+ skb->pfmemalloc = pfmemalloc;
+ refcount_set(&skb->users, 1);
+ skb->head = data;
+ skb->data = data;
+ skb_reset_tail_pointer(skb);
+ skb->end = skb->tail + size;
+ skb->mac_header = (typeof(skb->mac_header))~0U;
+ skb->transport_header = (typeof(skb->transport_header))~0U;
+
+ /* make sure we initialize shinfo sequentially */
+ shinfo = skb_shinfo(skb);
+ memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
+ atomic_set(&shinfo->dataref, 1);
+
+ if (flags & SKB_ALLOC_FCLONE) {
+ struct sk_buff_fclones *fclones;
+
+ fclones = container_of(skb, struct sk_buff_fclones, skb1);
+
+ skb->fclone = SKB_FCLONE_ORIG;
+ refcount_set(&fclones->fclone_ref, 1);
+
+ fclones->skb2.fclone = SKB_FCLONE_CLONE;
+ }
+out:
+ return skb;
+nodata:
+ kmem_cache_free(cache, skb);
+ skb = NULL;
+ goto out;
+}
+EXPORT_SYMBOL(__alloc_skb);
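+/* Usage sketch (illustrative only, not part of the original file): a
+ * typical caller goes through the alloc_skb() wrapper, reserves headroom
+ * for headers it will push later and then fills the linear area. The
+ * payload/payload_len names below are hypothetical.
+ *
+ *	struct sk_buff *skb = alloc_skb(ETH_HLEN + payload_len, GFP_ATOMIC);
+ *
+ *	if (!skb)
+ *		return -ENOMEM;
+ *	skb_reserve(skb, ETH_HLEN);		// headroom for skb_push() later
+ *	memcpy(skb_put(skb, payload_len), payload, payload_len);
+ */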
+
+/**
+ * __build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ * @frag_size: size of data, or 0 if head was kmalloced
+ *
+ * Allocate a new &sk_buff. Caller provides space holding head and
+ * skb_shared_info. @data must have been allocated by kmalloc() only if
+ * @frag_size is 0, otherwise data should come from the page allocator
+ * or vmalloc().
+ * The return is the new skb buffer.
+ * On a failure the return is %NULL, and @data is not freed.
+ * Notes:
+ * Before IO, the driver allocates only the data buffer where the NIC
+ * puts the incoming frame. The driver should add room at the head
+ * (NET_SKB_PAD) and MUST add room at the tail
+ * (SKB_DATA_ALIGN(skb_shared_info)).
+ * After IO, the driver calls build_skb() to allocate the sk_buff and
+ * populate it before giving the packet to the stack.
+ * RX rings only contain data buffers, not full skbs.
+ */
+struct sk_buff *__build_skb(void *data, unsigned int frag_size)
+{
+ struct skb_shared_info *shinfo;
+ struct sk_buff *skb;
+ unsigned int size = frag_size ? : ksize(data);
+
+ skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ skb->truesize = SKB_TRUESIZE(size);
+ refcount_set(&skb->users, 1);
+ skb->head = data;
+ skb->data = data;
+ skb_reset_tail_pointer(skb);
+ skb->end = skb->tail + size;
+ skb->mac_header = (typeof(skb->mac_header))~0U;
+ skb->transport_header = (typeof(skb->transport_header))~0U;
+
+ /* make sure we initialize shinfo sequentially */
+ shinfo = skb_shinfo(skb);
+ memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
+ atomic_set(&shinfo->dataref, 1);
+
+ return skb;
+}
+
+/* build_skb() is wrapper over __build_skb(), that specifically
+ * takes care of skb->head and skb->pfmemalloc
+ * This means that if @frag_size is not zero, then @data must be backed
+ * by a page fragment, not kmalloc() or vmalloc()
+ */
+struct sk_buff *build_skb(void *data, unsigned int frag_size)
+{
+ struct sk_buff *skb = __build_skb(data, frag_size);
+
+ if (skb && frag_size) {
+ skb->head_frag = 1;
+ if (page_is_pfmemalloc(virt_to_head_page(data)))
+ skb->pfmemalloc = 1;
+ }
+ return skb;
+}
+EXPORT_SYMBOL(build_skb);
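+/* Usage sketch (illustrative only): the receive pattern described in the
+ * notes above, with a hypothetical rx_size chosen by the driver. The
+ * fragment must leave room for NET_SKB_PAD at the head and for the
+ * aligned skb_shared_info at the tail before it is fed to build_skb().
+ *
+ *	unsigned int buf_len = SKB_DATA_ALIGN(NET_SKB_PAD + rx_size) +
+ *			       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ *	void *data = netdev_alloc_frag(buf_len);
+ *	struct sk_buff *skb;
+ *
+ *	if (!data)
+ *		return NULL;
+ *	skb = build_skb(data, buf_len);		// sets skb->head_frag
+ *	if (!skb) {
+ *		skb_free_frag(data);
+ *		return NULL;
+ *	}
+ *	skb_reserve(skb, NET_SKB_PAD);
+ */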
+
+#define NAPI_SKB_CACHE_SIZE 64
+
+struct napi_alloc_cache {
+ struct page_frag_cache page;
+ unsigned int skb_count;
+ void *skb_cache[NAPI_SKB_CACHE_SIZE];
+};
+
+static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
+static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
+
+static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+{
+ struct page_frag_cache *nc;
+ unsigned long flags;
+ void *data;
+
+ local_irq_save(flags);
+ nc = this_cpu_ptr(&netdev_alloc_cache);
+ data = page_frag_alloc(nc, fragsz, gfp_mask);
+ local_irq_restore(flags);
+ return data;
+}
+
+/**
+ * netdev_alloc_frag - allocate a page fragment
+ * @fragsz: fragment size
+ *
+ * Allocates a frag from a page for receive buffer.
+ * Uses GFP_ATOMIC allocations.
+ */
+void *netdev_alloc_frag(unsigned int fragsz)
+{
+ fragsz = SKB_DATA_ALIGN(fragsz);
+
+ return __netdev_alloc_frag(fragsz, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(netdev_alloc_frag);
+
+static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+
+ return page_frag_alloc(&nc->page, fragsz, gfp_mask);
+}
+
+void *napi_alloc_frag(unsigned int fragsz)
+{
+ fragsz = SKB_DATA_ALIGN(fragsz);
+
+ return __napi_alloc_frag(fragsz, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(napi_alloc_frag);
+
+/**
+ * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
+ * @dev: network device to receive on
+ * @len: length to allocate
+ * @gfp_mask: get_free_pages mask, passed to alloc_skb
+ *
+ * Allocate a new &sk_buff and assign it a usage count of one. The
+ * buffer has NET_SKB_PAD headroom built in. Users should allocate
+ * the headroom they think they need without accounting for the
+ * built in space. The built in space is used for optimisations.
+ *
+ * %NULL is returned if there is no free memory.
+ */
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
+ gfp_t gfp_mask)
+{
+ struct page_frag_cache *nc;
+ unsigned long flags;
+ struct sk_buff *skb;
+ bool pfmemalloc;
+ void *data;
+
+ len += NET_SKB_PAD;
+
+ /* If requested length is either too small or too big,
+ * we use kmalloc() for skb->head allocation.
+ */
+ if (len <= SKB_WITH_OVERHEAD(1024) ||
+ len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
+ (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
+ skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
+ if (!skb)
+ goto skb_fail;
+ goto skb_success;
+ }
+
+ len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ len = SKB_DATA_ALIGN(len);
+
+ if (sk_memalloc_socks())
+ gfp_mask |= __GFP_MEMALLOC;
+
+ local_irq_save(flags);
+
+ nc = this_cpu_ptr(&netdev_alloc_cache);
+ data = page_frag_alloc(nc, len, gfp_mask);
+ pfmemalloc = nc->pfmemalloc;
+
+ local_irq_restore(flags);
+
+ if (unlikely(!data))
+ return NULL;
+
+ skb = __build_skb(data, len);
+ if (unlikely(!skb)) {
+ skb_free_frag(data);
+ return NULL;
+ }
+
+ /* use OR instead of assignment to avoid clearing of bits in mask */
+ if (pfmemalloc)
+ skb->pfmemalloc = 1;
+ skb->head_frag = 1;
+
+skb_success:
+ skb_reserve(skb, NET_SKB_PAD);
+ skb->dev = dev;
+
+skb_fail:
+ return skb;
+}
+EXPORT_SYMBOL(__netdev_alloc_skb);
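+/* Usage sketch (illustrative only): a hypothetical driver RX path would
+ * normally call the netdev_alloc_skb() wrapper with just the frame
+ * length, since the NET_SKB_PAD headroom is added by this function.
+ *
+ *	skb = netdev_alloc_skb(dev, frame_len);
+ *	if (!skb)
+ *		return;				// drop accounted by the caller
+ *	memcpy(skb_put(skb, frame_len), rx_buf, frame_len);
+ *	skb->protocol = eth_type_trans(skb, dev);
+ *	netif_rx(skb);
+ */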
+
+/**
+ * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
+ * @napi: napi instance this buffer was allocated for
+ * @len: length to allocate
+ * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
+ *
+ * Allocate a new sk_buff for use in NAPI receive. This buffer will
+ * attempt to allocate the head from a special reserved region used
+ * only for NAPI Rx allocation. By doing this we can save several
+ * CPU cycles by avoiding having to disable and re-enable IRQs.
+ *
+ * %NULL is returned if there is no free memory.
+ */
+struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
+ gfp_t gfp_mask)
+{
+ struct napi_alloc_cache *nc;
+ struct sk_buff *skb;
+ void *data;
+
+ len += NET_SKB_PAD + NET_IP_ALIGN;
+
+ /* If requested length is either too small or too big,
+ * we use kmalloc() for skb->head allocation.
+ */
+ if (len <= SKB_WITH_OVERHEAD(1024) ||
+ len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
+ (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
+ skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
+ if (!skb)
+ goto skb_fail;
+ goto skb_success;
+ }
+
+ nc = this_cpu_ptr(&napi_alloc_cache);
+ len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ len = SKB_DATA_ALIGN(len);
+
+ if (sk_memalloc_socks())
+ gfp_mask |= __GFP_MEMALLOC;
+
+ data = page_frag_alloc(&nc->page, len, gfp_mask);
+ if (unlikely(!data))
+ return NULL;
+
+ skb = __build_skb(data, len);
+ if (unlikely(!skb)) {
+ skb_free_frag(data);
+ return NULL;
+ }
+
+ /* use OR instead of assignment to avoid clearing of bits in mask */
+ if (nc->page.pfmemalloc)
+ skb->pfmemalloc = 1;
+ skb->head_frag = 1;
+
+skb_success:
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+ skb->dev = napi->dev;
+
+skb_fail:
+ return skb;
+}
+EXPORT_SYMBOL(__napi_alloc_skb);
+
+void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
+ int size, unsigned int truesize)
+{
+ skb_fill_page_desc(skb, i, page, off, size);
+ skb->len += size;
+ skb->data_len += size;
+ skb->truesize += truesize;
+}
+EXPORT_SYMBOL(skb_add_rx_frag);
+
+void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
+ unsigned int truesize)
+{
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ skb_frag_size_add(frag, size);
+ skb->len += size;
+ skb->data_len += size;
+ skb->truesize += truesize;
+}
+EXPORT_SYMBOL(skb_coalesce_rx_frag);
+
+static void skb_drop_list(struct sk_buff **listp)
+{
+ kfree_skb_list(*listp);
+ *listp = NULL;
+}
+
+static inline void skb_drop_fraglist(struct sk_buff *skb)
+{
+ skb_drop_list(&skb_shinfo(skb)->frag_list);
+}
+
+static void skb_clone_fraglist(struct sk_buff *skb)
+{
+ struct sk_buff *list;
+
+ skb_walk_frags(skb, list)
+ skb_get(list);
+}
+
+static void skb_free_head(struct sk_buff *skb)
+{
+ unsigned char *head = skb->head;
+
+ if (skb->head_frag)
+ skb_free_frag(head);
+ else
+ kfree(head);
+}
+
+static void skb_release_data(struct sk_buff *skb)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ int i;
+
+ if (skb->cloned &&
+ atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
+ &shinfo->dataref))
+ return;
+
+ for (i = 0; i < shinfo->nr_frags; i++)
+ __skb_frag_unref(&shinfo->frags[i]);
+
+ if (shinfo->frag_list)
+ kfree_skb_list(shinfo->frag_list);
+
+ skb_zcopy_clear(skb, true);
+ skb_free_head(skb);
+}
+
+/*
+ * Free the memory of an skbuff without cleaning its state.
+ */
+static void kfree_skbmem(struct sk_buff *skb)
+{
+ struct sk_buff_fclones *fclones;
+
+ switch (skb->fclone) {
+ case SKB_FCLONE_UNAVAILABLE:
+ kmem_cache_free(skbuff_head_cache, skb);
+ return;
+
+ case SKB_FCLONE_ORIG:
+ fclones = container_of(skb, struct sk_buff_fclones, skb1);
+
+ /* We usually free the clone (TX completion) before original skb
+ * This test would have no chance to be true for the clone,
+ * while here, branch prediction will be good.
+ */
+ if (refcount_read(&fclones->fclone_ref) == 1)
+ goto fastpath;
+ break;
+
+ default: /* SKB_FCLONE_CLONE */
+ fclones = container_of(skb, struct sk_buff_fclones, skb2);
+ break;
+ }
+ if (!refcount_dec_and_test(&fclones->fclone_ref))
+ return;
+fastpath:
+ kmem_cache_free(skbuff_fclone_cache, fclones);
+}
+
+void skb_release_head_state(struct sk_buff *skb)
+{
+ skb_dst_drop(skb);
+ secpath_reset(skb);
+ if (skb->destructor) {
+ WARN_ON(in_irq());
+ skb->destructor(skb);
+ }
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ nf_conntrack_put(skb_nfct(skb));
+#endif
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ nf_bridge_put(skb->nf_bridge);
+#endif
+}
+
+/* Free everything but the sk_buff shell. */
+static void skb_release_all(struct sk_buff *skb)
+{
+ skb_release_head_state(skb);
+ if (likely(skb->head))
+ skb_release_data(skb);
+}
+
+/**
+ * __kfree_skb - private function
+ * @skb: buffer
+ *
+ * Free an sk_buff. Release anything attached to the buffer.
+ * Clean the state. This is an internal helper function. Users should
+ * always call kfree_skb
+ */
+
+void __kfree_skb(struct sk_buff *skb)
+{
+ skb_release_all(skb);
+ kfree_skbmem(skb);
+}
+EXPORT_SYMBOL(__kfree_skb);
+
+/**
+ * kfree_skb - free an sk_buff
+ * @skb: buffer to free
+ *
+ * Drop a reference to the buffer and free it if the usage count has
+ * hit zero.
+ */
+void kfree_skb(struct sk_buff *skb)
+{
+ if (!skb_unref(skb))
+ return;
+
+ trace_kfree_skb(skb, __builtin_return_address(0));
+ __kfree_skb(skb);
+}
+EXPORT_SYMBOL(kfree_skb);
+
+void kfree_skb_list(struct sk_buff *segs)
+{
+ while (segs) {
+ struct sk_buff *next = segs->next;
+
+ kfree_skb(segs);
+ segs = next;
+ }
+}
+EXPORT_SYMBOL(kfree_skb_list);
+
+/**
+ * skb_tx_error - report an sk_buff xmit error
+ * @skb: buffer that triggered an error
+ *
+ * Report xmit error if a device callback is tracking this skb.
+ * skb must be freed afterwards.
+ */
+void skb_tx_error(struct sk_buff *skb)
+{
+ skb_zcopy_clear(skb, true);
+}
+EXPORT_SYMBOL(skb_tx_error);
+
+/**
+ * consume_skb - free an skbuff
+ * @skb: buffer to free
+ *
+ * Drop a ref to the buffer and free it if the usage count has hit zero.
+ * Functions identically to kfree_skb, but kfree_skb assumes that the frame
+ * is being dropped after a failure and notes that in its tracepoint.
+ */
+void consume_skb(struct sk_buff *skb)
+{
+ if (!skb_unref(skb))
+ return;
+
+ trace_consume_skb(skb);
+ __kfree_skb(skb);
+}
+EXPORT_SYMBOL(consume_skb);
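+/* Usage sketch (illustrative only): the difference between the two
+ * helpers is the tracepoint that fires (trace_kfree_skb() vs
+ * trace_consume_skb()), so callers pick the variant matching the reason
+ * for freeing:
+ *
+ *	if (error)
+ *		kfree_skb(skb);		// a drop: visible to drop monitors
+ *	else
+ *		consume_skb(skb);	// consumed normally: not a drop
+ */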
+
+/**
+ * __consume_stateless_skb - free an skbuff, assuming it is stateless
+ * @skb: buffer to free
+ *
+ * Like consume_skb(), but this variant assumes that this is the last
+ * skb reference and all the head states have already been dropped.
+ */
+void __consume_stateless_skb(struct sk_buff *skb)
+{
+ trace_consume_skb(skb);
+ skb_release_data(skb);
+ kfree_skbmem(skb);
+}
+
+void __kfree_skb_flush(void)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+
+ /* flush skb_cache if containing objects */
+ if (nc->skb_count) {
+ kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
+ nc->skb_cache);
+ nc->skb_count = 0;
+ }
+}
+
+static inline void _kfree_skb_defer(struct sk_buff *skb)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+
+ /* drop skb->head and call any destructors for packet */
+ skb_release_all(skb);
+
+ /* record skb to CPU local list */
+ nc->skb_cache[nc->skb_count++] = skb;
+
+#ifdef CONFIG_SLUB
+ /* SLUB writes into objects when freeing */
+ prefetchw(skb);
+#endif
+
+ /* flush skb_cache if it is filled */
+ if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
+ kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
+ nc->skb_cache);
+ nc->skb_count = 0;
+ }
+}
+void __kfree_skb_defer(struct sk_buff *skb)
+{
+ _kfree_skb_defer(skb);
+}
+
+void napi_consume_skb(struct sk_buff *skb, int budget)
+{
+ if (unlikely(!skb))
+ return;
+
+ /* Zero budget indicates a non-NAPI context called us, like netpoll */
+ if (unlikely(!budget)) {
+ dev_consume_skb_any(skb);
+ return;
+ }
+
+ if (!skb_unref(skb))
+ return;
+
+ /* if reaching here SKB is ready to free */
+ trace_consume_skb(skb);
+
+ /* if SKB is a clone, don't handle this case */
+ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
+ __kfree_skb(skb);
+ return;
+ }
+
+ _kfree_skb_defer(skb);
+}
+EXPORT_SYMBOL(napi_consume_skb);
+
+/* Make sure a field is enclosed inside headers_start/headers_end section */
+#define CHECK_SKB_FIELD(field) \
+ BUILD_BUG_ON(offsetof(struct sk_buff, field) < \
+ offsetof(struct sk_buff, headers_start)); \
+ BUILD_BUG_ON(offsetof(struct sk_buff, field) > \
+ offsetof(struct sk_buff, headers_end)); \
+
+static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+{
+ new->tstamp = old->tstamp;
+ /* We do not copy old->sk */
+ new->dev = old->dev;
+ memcpy(new->cb, old->cb, sizeof(old->cb));
+ skb_dst_copy(new, old);
+#ifdef CONFIG_XFRM
+ new->sp = secpath_get(old->sp);
+#endif
+ __nf_copy(new, old, false);
+
+ /* Note : this field could be in headers_start/headers_end section
+ * It is not yet because we do not want to have a 16 bit hole
+ */
+ new->queue_mapping = old->queue_mapping;
+
+ memcpy(&new->headers_start, &old->headers_start,
+ offsetof(struct sk_buff, headers_end) -
+ offsetof(struct sk_buff, headers_start));
+ CHECK_SKB_FIELD(protocol);
+ CHECK_SKB_FIELD(csum);
+ CHECK_SKB_FIELD(hash);
+ CHECK_SKB_FIELD(priority);
+ CHECK_SKB_FIELD(skb_iif);
+ CHECK_SKB_FIELD(vlan_proto);
+ CHECK_SKB_FIELD(vlan_tci);
+ CHECK_SKB_FIELD(transport_header);
+ CHECK_SKB_FIELD(network_header);
+ CHECK_SKB_FIELD(mac_header);
+ CHECK_SKB_FIELD(inner_protocol);
+ CHECK_SKB_FIELD(inner_transport_header);
+ CHECK_SKB_FIELD(inner_network_header);
+ CHECK_SKB_FIELD(inner_mac_header);
+ CHECK_SKB_FIELD(mark);
+#ifdef CONFIG_NETWORK_SECMARK
+ CHECK_SKB_FIELD(secmark);
+#endif
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ CHECK_SKB_FIELD(napi_id);
+#endif
+#ifdef CONFIG_XPS
+ CHECK_SKB_FIELD(sender_cpu);
+#endif
+#ifdef CONFIG_NET_SCHED
+ CHECK_SKB_FIELD(tc_index);
+#endif
+
+}
+
+/*
+ * You should not add any new code to this function. Add it to
+ * __copy_skb_header above instead.
+ */
+static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
+{
+#define C(x) n->x = skb->x
+
+ n->next = n->prev = NULL;
+ n->sk = NULL;
+ __copy_skb_header(n, skb);
+
+ C(len);
+ C(data_len);
+ C(mac_len);
+ n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
+ n->cloned = 1;
+ n->nohdr = 0;
+ n->peeked = 0;
+ C(pfmemalloc);
+ n->destructor = NULL;
+ C(tail);
+ C(end);
+ C(head);
+ C(head_frag);
+ C(data);
+ C(truesize);
+ refcount_set(&n->users, 1);
+
+ atomic_inc(&(skb_shinfo(skb)->dataref));
+ skb->cloned = 1;
+
+ return n;
+#undef C
+}
+
+/**
+ * skb_morph - morph one skb into another
+ * @dst: the skb to receive the contents
+ * @src: the skb to supply the contents
+ *
+ * This is identical to skb_clone except that the target skb is
+ * supplied by the user.
+ *
+ * The target skb is returned upon exit.
+ */
+struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
+{
+ skb_release_all(dst);
+ return __skb_clone(dst, src);
+}
+EXPORT_SYMBOL_GPL(skb_morph);
+
+int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
+{
+ unsigned long max_pg, num_pg, new_pg, old_pg;
+ struct user_struct *user;
+
+ if (capable(CAP_IPC_LOCK) || !size)
+ return 0;
+
+ num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */
+ max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ user = mmp->user ? : current_user();
+
+ do {
+ old_pg = atomic_long_read(&user->locked_vm);
+ new_pg = old_pg + num_pg;
+ if (new_pg > max_pg)
+ return -ENOBUFS;
+ } while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) !=
+ old_pg);
+
+ if (!mmp->user) {
+ mmp->user = get_uid(user);
+ mmp->num_pg = num_pg;
+ } else {
+ mmp->num_pg += num_pg;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mm_account_pinned_pages);
+
+void mm_unaccount_pinned_pages(struct mmpin *mmp)
+{
+ if (mmp->user) {
+ atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
+ free_uid(mmp->user);
+ }
+}
+EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
+
+struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
+{
+ struct ubuf_info *uarg;
+ struct sk_buff *skb;
+
+ WARN_ON_ONCE(!in_task());
+
+ skb = sock_omalloc(sk, 0, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+
+ BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
+ uarg = (void *)skb->cb;
+ uarg->mmp.user = NULL;
+
+ if (mm_account_pinned_pages(&uarg->mmp, size)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ uarg->callback = sock_zerocopy_callback;
+ uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
+ uarg->len = 1;
+ uarg->bytelen = size;
+ uarg->zerocopy = 1;
+ refcount_set(&uarg->refcnt, 1);
+ sock_hold(sk);
+
+ return uarg;
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_alloc);
+
+static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
+{
+ return container_of((void *)uarg, struct sk_buff, cb);
+}
+
+struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
+ struct ubuf_info *uarg)
+{
+ if (uarg) {
+ const u32 byte_limit = 1 << 19; /* limit to a few TSO */
+ u32 bytelen, next;
+
+ /* realloc only when socket is locked (TCP, UDP cork),
+ * so uarg->len and sk_zckey access is serialized
+ */
+ if (!sock_owned_by_user(sk)) {
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+
+ bytelen = uarg->bytelen + size;
+ if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) {
+ /* TCP can create new skb to attach new uarg */
+ if (sk->sk_type == SOCK_STREAM)
+ goto new_alloc;
+ return NULL;
+ }
+
+ next = (u32)atomic_read(&sk->sk_zckey);
+ if ((u32)(uarg->id + uarg->len) == next) {
+ if (mm_account_pinned_pages(&uarg->mmp, size))
+ return NULL;
+ uarg->len++;
+ uarg->bytelen = bytelen;
+ atomic_set(&sk->sk_zckey, ++next);
+ sock_zerocopy_get(uarg);
+ return uarg;
+ }
+ }
+
+new_alloc:
+ return sock_zerocopy_alloc(sk, size);
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_realloc);
+
+static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
+{
+ struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+ u32 old_lo, old_hi;
+ u64 sum_len;
+
+ old_lo = serr->ee.ee_info;
+ old_hi = serr->ee.ee_data;
+ sum_len = old_hi - old_lo + 1ULL + len;
+
+ if (sum_len >= (1ULL << 32))
+ return false;
+
+ if (lo != old_hi + 1)
+ return false;
+
+ serr->ee.ee_data += len;
+ return true;
+}
+
+void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
+{
+ struct sk_buff *tail, *skb = skb_from_uarg(uarg);
+ struct sock_exterr_skb *serr;
+ struct sock *sk = skb->sk;
+ struct sk_buff_head *q;
+ unsigned long flags;
+ u32 lo, hi;
+ u16 len;
+
+ mm_unaccount_pinned_pages(&uarg->mmp);
+
+ /* if !len, there was only 1 call, and it was aborted
+ * so do not queue a completion notification
+ */
+ if (!uarg->len || sock_flag(sk, SOCK_DEAD))
+ goto release;
+
+ len = uarg->len;
+ lo = uarg->id;
+ hi = uarg->id + len - 1;
+
+ serr = SKB_EXT_ERR(skb);
+ memset(serr, 0, sizeof(*serr));
+ serr->ee.ee_errno = 0;
+ serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
+ serr->ee.ee_data = hi;
+ serr->ee.ee_info = lo;
+ if (!success)
+ serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;
+
+ q = &sk->sk_error_queue;
+ spin_lock_irqsave(&q->lock, flags);
+ tail = skb_peek_tail(q);
+ if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
+ !skb_zerocopy_notify_extend(tail, lo, len)) {
+ __skb_queue_tail(q, skb);
+ skb = NULL;
+ }
+ spin_unlock_irqrestore(&q->lock, flags);
+
+ sk->sk_error_report(sk);
+
+release:
+ consume_skb(skb);
+ sock_put(sk);
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
+
+void sock_zerocopy_put(struct ubuf_info *uarg)
+{
+ if (uarg && refcount_dec_and_test(&uarg->refcnt)) {
+ if (uarg->callback)
+ uarg->callback(uarg, uarg->zerocopy);
+ else
+ consume_skb(skb_from_uarg(uarg));
+ }
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_put);
+
+void sock_zerocopy_put_abort(struct ubuf_info *uarg)
+{
+ if (uarg) {
+ struct sock *sk = skb_from_uarg(uarg)->sk;
+
+ atomic_dec(&sk->sk_zckey);
+ uarg->len--;
+
+ sock_zerocopy_put(uarg);
+ }
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
+
+extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
+ struct iov_iter *from, size_t length);
+
+int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
+ struct msghdr *msg, int len,
+ struct ubuf_info *uarg)
+{
+ struct ubuf_info *orig_uarg = skb_zcopy(skb);
+ struct iov_iter orig_iter = msg->msg_iter;
+ int err, orig_len = skb->len;
+
+ /* An skb can only point to one uarg. This edge case happens when
+ * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
+ */
+ if (orig_uarg && uarg != orig_uarg)
+ return -EEXIST;
+
+ err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
+ if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
+ struct sock *save_sk = skb->sk;
+
+ /* Streams do not free skb on error. Reset to prev state. */
+ msg->msg_iter = orig_iter;
+ skb->sk = sk;
+ ___pskb_trim(skb, orig_len);
+ skb->sk = save_sk;
+ return err;
+ }
+
+ skb_zcopy_set(skb, uarg);
+ return skb->len - orig_len;
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
+
+static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
+ gfp_t gfp_mask)
+{
+ if (skb_zcopy(orig)) {
+ if (skb_zcopy(nskb)) {
+ /* !gfp_mask callers are verified to !skb_zcopy(nskb) */
+ if (!gfp_mask) {
+ WARN_ON_ONCE(1);
+ return -ENOMEM;
+ }
+ if (skb_uarg(nskb) == skb_uarg(orig))
+ return 0;
+ if (skb_copy_ubufs(nskb, GFP_ATOMIC))
+ return -EIO;
+ }
+ skb_zcopy_set(nskb, skb_uarg(orig));
+ }
+ return 0;
+}
+
+/**
+ * skb_copy_ubufs - copy userspace skb frags buffers to kernel
+ * @skb: the skb to modify
+ * @gfp_mask: allocation priority
+ *
+ * This must be called on SKBTX_DEV_ZEROCOPY skb.
+ * It will copy all frags into kernel and drop the reference
+ * to userspace pages.
+ *
+ * If this function is called from an interrupt, @gfp_mask must be
+ * %GFP_ATOMIC.
+ *
+ * Returns 0 on success or a negative error code on failure
+ * to allocate kernel memory to copy to.
+ */
+int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
+{
+ int num_frags = skb_shinfo(skb)->nr_frags;
+ struct page *page, *head = NULL;
+ int i, new_frags;
+ u32 d_off;
+
+ if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
+ return -EINVAL;
+
+ if (!num_frags)
+ goto release;
+
+ new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ for (i = 0; i < new_frags; i++) {
+ page = alloc_page(gfp_mask);
+ if (!page) {
+ while (head) {
+ struct page *next = (struct page *)page_private(head);
+ put_page(head);
+ head = next;
+ }
+ return -ENOMEM;
+ }
+ set_page_private(page, (unsigned long)head);
+ head = page;
+ }
+
+ page = head;
+ d_off = 0;
+ for (i = 0; i < num_frags; i++) {
+ skb_frag_t *f = &skb_shinfo(skb)->frags[i];
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
+
+ skb_frag_foreach_page(f, f->page_offset, skb_frag_size(f),
+ p, p_off, p_len, copied) {
+ u32 copy, done = 0;
+ vaddr = kmap_atomic(p);
+
+ while (done < p_len) {
+ if (d_off == PAGE_SIZE) {
+ d_off = 0;
+ page = (struct page *)page_private(page);
+ }
+ copy = min_t(u32, PAGE_SIZE - d_off, p_len - done);
+ memcpy(page_address(page) + d_off,
+ vaddr + p_off + done, copy);
+ done += copy;
+ d_off += copy;
+ }
+ kunmap_atomic(vaddr);
+ }
+ }
+
+ /* skb frags release userspace buffers */
+ for (i = 0; i < num_frags; i++)
+ skb_frag_unref(skb, i);
+
+ /* skb frags point to kernel buffers */
+ for (i = 0; i < new_frags - 1; i++) {
+ __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE);
+ head = (struct page *)page_private(head);
+ }
+ __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
+ skb_shinfo(skb)->nr_frags = new_frags;
+
+release:
+ skb_zcopy_clear(skb, false);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_copy_ubufs);
+
+/**
+ * skb_clone - duplicate an sk_buff
+ * @skb: buffer to clone
+ * @gfp_mask: allocation priority
+ *
+ * Duplicate an &sk_buff. The new one is not owned by a socket. Both
+ * copies share the same packet data but not structure. The new
+ * buffer has a reference count of 1. If the allocation fails the
+ * function returns %NULL otherwise the new buffer is returned.
+ *
+ * If this function is called from an interrupt, @gfp_mask must be
+ * %GFP_ATOMIC.
+ */
+
+struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
+{
+ struct sk_buff_fclones *fclones = container_of(skb,
+ struct sk_buff_fclones,
+ skb1);
+ struct sk_buff *n;
+
+ if (skb_orphan_frags(skb, gfp_mask))
+ return NULL;
+
+ if (skb->fclone == SKB_FCLONE_ORIG &&
+ refcount_read(&fclones->fclone_ref) == 1) {
+ n = &fclones->skb2;
+ refcount_set(&fclones->fclone_ref, 2);
+ } else {
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+
+ n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
+ if (!n)
+ return NULL;
+
+ n->fclone = SKB_FCLONE_UNAVAILABLE;
+ }
+
+ return __skb_clone(n, skb);
+}
+EXPORT_SYMBOL(skb_clone);
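+/* Usage sketch (illustrative only): a clone shares the packet data, so
+ * it fits cases where an extra reference to the same frame is needed but
+ * the payload itself will not be written:
+ *
+ *	struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+ *
+ *	if (!nskb)
+ *		return -ENOMEM;
+ *	// nskb has its own struct sk_buff but points at skb's data;
+ *	// use skb_copy()/pskb_copy() if the data must be modified.
+ */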
+
+void skb_headers_offset_update(struct sk_buff *skb, int off)
+{
+ /* Only adjust this if it actually is csum_start rather than csum */
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ skb->csum_start += off;
+ /* {transport,network,mac}_header and tail are relative to skb->head */
+ skb->transport_header += off;
+ skb->network_header += off;
+ if (skb_mac_header_was_set(skb))
+ skb->mac_header += off;
+ skb->inner_transport_header += off;
+ skb->inner_network_header += off;
+ skb->inner_mac_header += off;
+}
+EXPORT_SYMBOL(skb_headers_offset_update);
+
+void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
+{
+ __copy_skb_header(new, old);
+
+ skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
+ skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
+ skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
+}
+EXPORT_SYMBOL(skb_copy_header);
+
+static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
+{
+ if (skb_pfmemalloc(skb))
+ return SKB_ALLOC_RX;
+ return 0;
+}
+
+/**
+ * skb_copy - create private copy of an sk_buff
+ * @skb: buffer to copy
+ * @gfp_mask: allocation priority
+ *
+ * Make a copy of both an &sk_buff and its data. This is used when the
+ * caller wishes to modify the data and needs a private copy of the
+ * data to alter. Returns %NULL on failure or the pointer to the buffer
+ * on success. The returned buffer has a reference count of 1.
+ *
+ * As a by-product this function converts a non-linear &sk_buff to a
+ * linear one, so that the &sk_buff becomes completely private and the
+ * caller is allowed to modify all the data of the returned buffer. This
+ * means that this function is not recommended for use in circumstances
+ * when only the header is going to be modified. Use pskb_copy() instead.
+ */
+
+struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
+{
+ int headerlen = skb_headroom(skb);
+ unsigned int size = skb_end_offset(skb) + skb->data_len;
+ struct sk_buff *n = __alloc_skb(size, gfp_mask,
+ skb_alloc_rx_flag(skb), NUMA_NO_NODE);
+
+ if (!n)
+ return NULL;
+
+ /* Set the data pointer */
+ skb_reserve(n, headerlen);
+ /* Set the tail pointer and length */
+ skb_put(n, skb->len);
+
+ BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
+
+ skb_copy_header(n, skb);
+ return n;
+}
+EXPORT_SYMBOL(skb_copy);
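+/* Usage sketch (illustrative only): when the payload itself has to be
+ * edited, a full (linearizing) copy gives a private buffer; if only the
+ * headers change, pskb_copy() is the cheaper choice, as noted above.
+ *
+ *	struct sk_buff *priv = skb_copy(skb, GFP_ATOMIC);
+ *
+ *	if (!priv)
+ *		return -ENOMEM;
+ *	// priv's linear data may now be modified freely
+ */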
+
+/**
+ * __pskb_copy_fclone - create copy of an sk_buff with private head.
+ * @skb: buffer to copy
+ * @headroom: headroom of new skb
+ * @gfp_mask: allocation priority
+ * @fclone: if true allocate the copy of the skb from the fclone
+ * cache instead of the head cache; it is recommended to set this
+ * to true for the cases where the copy will likely be cloned
+ *
+ * Make a copy of both an &sk_buff and part of its data, located
+ * in header. Fragmented data remain shared. This is used when
+ * the caller wishes to modify only header of &sk_buff and needs
+ * private copy of the header to alter. Returns %NULL on failure
+ * or the pointer to the buffer on success.
+ * The returned buffer has a reference count of 1.
+ */
+
+struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
+ gfp_t gfp_mask, bool fclone)
+{
+ unsigned int size = skb_headlen(skb) + headroom;
+ int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0);
+ struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);
+
+ if (!n)
+ goto out;
+
+ /* Set the data pointer */
+ skb_reserve(n, headroom);
+ /* Set the tail pointer and length */
+ skb_put(n, skb_headlen(skb));
+ /* Copy the bytes */
+ skb_copy_from_linear_data(skb, n->data, n->len);
+
+ n->truesize += skb->data_len;
+ n->data_len = skb->data_len;
+ n->len = skb->len;
+
+ if (skb_shinfo(skb)->nr_frags) {
+ int i;
+
+ if (skb_orphan_frags(skb, gfp_mask) ||
+ skb_zerocopy_clone(n, skb, gfp_mask)) {
+ kfree_skb(n);
+ n = NULL;
+ goto out;
+ }
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
+ skb_frag_ref(skb, i);
+ }
+ skb_shinfo(n)->nr_frags = i;
+ }
+
+ if (skb_has_frag_list(skb)) {
+ skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
+ skb_clone_fraglist(n);
+ }
+
+ skb_copy_header(n, skb);
+out:
+ return n;
+}
+EXPORT_SYMBOL(__pskb_copy_fclone);
+
+/**
+ * pskb_expand_head - reallocate header of &sk_buff
+ * @skb: buffer to reallocate
+ * @nhead: room to add at head
+ * @ntail: room to add at tail
+ * @gfp_mask: allocation priority
+ *
+ * Expands (or creates identical copy, if @nhead and @ntail are zero)
+ * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have
+ * reference count of 1. Returns zero in the case of success, or a
+ * negative error code if expansion failed. In the latter case, the
+ * &sk_buff is not changed.
+ *
+ * All the pointers pointing into skb header may change and must be
+ * reloaded after call to this function.
+ */
+
+int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
+ gfp_t gfp_mask)
+{
+ int i, osize = skb_end_offset(skb);
+ int size = osize + nhead + ntail;
+ long off;
+ u8 *data;
+
+ BUG_ON(nhead < 0);
+
+ BUG_ON(skb_shared(skb));
+
+ size = SKB_DATA_ALIGN(size);
+
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+ data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+ gfp_mask, NUMA_NO_NODE, NULL);
+ if (!data)
+ goto nodata;
+ size = SKB_WITH_OVERHEAD(ksize(data));
+
+ /* Copy only real data... and, alas, header. This should be
+ * optimized for the cases when header is void.
+ */
+ memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
+
+ memcpy((struct skb_shared_info *)(data + size),
+ skb_shinfo(skb),
+ offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
+
+ /*
+ * If shinfo is shared we must drop the old head gracefully, but if it
+ * is not we can just drop the old head and let the existing refcount
+ * be, since all we did is relocate the values.
+ */
+ if (skb_cloned(skb)) {
+ if (skb_orphan_frags(skb, gfp_mask))
+ goto nofrags;
+ if (skb_zcopy(skb))
+ refcount_inc(&skb_uarg(skb)->refcnt);
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ skb_frag_ref(skb, i);
+
+ if (skb_has_frag_list(skb))
+ skb_clone_fraglist(skb);
+
+ skb_release_data(skb);
+ } else {
+ skb_free_head(skb);
+ }
+ off = (data + nhead) - skb->head;
+
+ skb->head = data;
+ skb->head_frag = 0;
+ skb->data += off;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ skb->end = size;
+ off = nhead;
+#else
+ skb->end = skb->head + size;
+#endif
+ skb->tail += off;
+ skb_headers_offset_update(skb, nhead);
+ skb->cloned = 0;
+ skb->hdr_len = 0;
+ skb->nohdr = 0;
+ atomic_set(&skb_shinfo(skb)->dataref, 1);
+
+ skb_metadata_clear(skb);
+
+ /* It is not generally safe to change skb->truesize.
+ * For the moment, we really care of rx path, or
+ * when skb is orphaned (not attached to a socket).
+ */
+ if (!skb->sk || skb->destructor == sock_edemux)
+ skb->truesize += size - osize;
+
+ return 0;
+
+nofrags:
+ kfree(data);
+nodata:
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(pskb_expand_head);
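+/* Usage sketch (illustrative only): the classic "make room for another
+ * header" pattern, with a hypothetical 'needed' headroom and an IPv4
+ * header pointer. Any pointer into the old head must be reloaded
+ * afterwards, as the comment above requires.
+ *
+ *	if (skb_headroom(skb) < needed &&
+ *	    pskb_expand_head(skb, SKB_DATA_ALIGN(needed - skb_headroom(skb)),
+ *			     0, GFP_ATOMIC))
+ *		goto drop;
+ *	iph = ip_hdr(skb);		// reload: skb->head may have moved
+ */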
+
+/* Make private copy of skb with writable head and some headroom */
+
+struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
+{
+ struct sk_buff *skb2;
+ int delta = headroom - skb_headroom(skb);
+
+ if (delta <= 0) {
+ skb2 = pskb_copy(skb, GFP_ATOMIC);
+ } else {
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
+ GFP_ATOMIC)) {
+ kfree_skb(skb2);
+ skb2 = NULL;
+ }
+ }
+ return skb2;
+}
+EXPORT_SYMBOL(skb_realloc_headroom);
+
+/**
+ * skb_copy_expand - copy and expand sk_buff
+ * @skb: buffer to copy
+ * @newheadroom: new free bytes at head
+ * @newtailroom: new free bytes at tail
+ * @gfp_mask: allocation priority
+ *
+ * Make a copy of both an &sk_buff and its data and while doing so
+ * allocate additional space.
+ *
+ * This is used when the caller wishes to modify the data and needs a
+ * private copy of the data to alter as well as more space for new fields.
+ * Returns %NULL on failure or the pointer to the buffer
+ * on success. The returned buffer has a reference count of 1.
+ *
+ * You must pass %GFP_ATOMIC as the allocation priority if this function
+ * is called from an interrupt.
+ */
+struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
+ int newheadroom, int newtailroom,
+ gfp_t gfp_mask)
+{
+ /*
+ * Allocate the copy buffer
+ */
+ struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
+ gfp_mask, skb_alloc_rx_flag(skb),
+ NUMA_NO_NODE);
+ int oldheadroom = skb_headroom(skb);
+ int head_copy_len, head_copy_off;
+
+ if (!n)
+ return NULL;
+
+ skb_reserve(n, newheadroom);
+
+ /* Set the tail pointer and length */
+ skb_put(n, skb->len);
+
+ head_copy_len = oldheadroom;
+ head_copy_off = 0;
+ if (newheadroom <= head_copy_len)
+ head_copy_len = newheadroom;
+ else
+ head_copy_off = newheadroom - head_copy_len;
+
+ /* Copy the linear header and data. */
+ BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
+ skb->len + head_copy_len));
+
+ skb_copy_header(n, skb);
+
+ skb_headers_offset_update(n, newheadroom - oldheadroom);
+
+ return n;
+}
+EXPORT_SYMBOL(skb_copy_expand);
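+
+/* Illustrative sketch (not part of the original file): using
+ * skb_copy_expand() to get a private copy with extra room at both ends,
+ * e.g. before adding an encapsulation header and a trailer.  The sizes
+ * below are assumptions for the example.
+ *
+ *	struct sk_buff *n = skb_copy_expand(skb, 16, 8, GFP_ATOMIC);
+ *
+ *	if (!n)
+ *		return -ENOMEM;
+ *	memset(skb_push(n, 16), 0, 16);		// the new headroom is writable
+ *	memset(skb_put(n, 8), 0, 8);		// as is the new tailroom
+ */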
+
+/**
+ * __skb_pad - zero pad the tail of an skb
+ * @skb: buffer to pad
+ * @pad: space to pad
+ * @free_on_error: free buffer on error
+ *
+ * Ensure that a buffer is followed by a padding area that is zero
+ * filled. Used by network drivers which may DMA or transfer data
+ * beyond the buffer end onto the wire.
+ *
+ * May return error in out of memory cases. The skb is freed on error
+ * if @free_on_error is true.
+ */
+
+int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error)
+{
+ int err;
+ int ntail;
+
+ /* If the skbuff is non-linear, tailroom is always zero. */
+ if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
+ memset(skb->data+skb->len, 0, pad);
+ return 0;
+ }
+
+ ntail = skb->data_len + pad - (skb->end - skb->tail);
+ if (likely(skb_cloned(skb) || ntail > 0)) {
+ err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
+ if (unlikely(err))
+ goto free_skb;
+ }
+
+ /* FIXME: The use of this function with non-linear skb's really needs
+ * to be audited.
+ */
+ err = skb_linearize(skb);
+ if (unlikely(err))
+ goto free_skb;
+
+ memset(skb->data + skb->len, 0, pad);
+ return 0;
+
+free_skb:
+ if (free_on_error)
+ kfree_skb(skb);
+ return err;
+}
+EXPORT_SYMBOL(__skb_pad);
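+
+/* Illustrative sketch (not part of the original file): padding a short
+ * frame up to the Ethernet minimum before handing it to hardware that
+ * does not pad on its own.  __skb_pad() only zeroes the bytes beyond
+ * skb->len; the caller still has to account for them (the in-tree
+ * skb_put_padto() helper wraps exactly this pattern).
+ *
+ *	if (skb->len < ETH_ZLEN) {
+ *		int pad = ETH_ZLEN - skb->len;
+ *
+ *		if (__skb_pad(skb, pad, true))
+ *			return NETDEV_TX_OK;	// skb was already freed
+ *		__skb_put(skb, pad);		// include the zeroed padding
+ *	}
+ */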
+
+/**
+ * pskb_put - add data to the tail of a potentially fragmented buffer
+ * @skb: start of the buffer to use
+ * @tail: tail fragment of the buffer to use
+ * @len: amount of data to add
+ *
+ * This function extends the used data area of the potentially
+ * fragmented buffer. @tail must be the last fragment of @skb -- or
+ * @skb itself. If this would exceed the total buffer size the kernel
+ * will panic. A pointer to the first byte of the extra data is
+ * returned.
+ */
+
+void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
+{
+ if (tail != skb) {
+ skb->data_len += len;
+ skb->len += len;
+ }
+ return skb_put(tail, len);
+}
+EXPORT_SYMBOL_GPL(pskb_put);
+
+/**
+ * skb_put - add data to a buffer
+ * @skb: buffer to use
+ * @len: amount of data to add
+ *
+ * This function extends the used data area of the buffer. If this would
+ * exceed the total buffer size the kernel will panic. A pointer to the
+ * first byte of the extra data is returned.
+ */
+void *skb_put(struct sk_buff *skb, unsigned int len)
+{
+ void *tmp = skb_tail_pointer(skb);
+ SKB_LINEAR_ASSERT(skb);
+ skb->tail += len;
+ skb->len += len;
+ if (unlikely(skb->tail > skb->end))
+ skb_over_panic(skb, len, __builtin_return_address(0));
+ return tmp;
+}
+EXPORT_SYMBOL(skb_put);
+
+/**
+ * skb_push - add data to the start of a buffer
+ * @skb: buffer to use
+ * @len: amount of data to add
+ *
+ * This function extends the used data area of the buffer at the buffer
+ * start. If this would exceed the total buffer headroom the kernel will
+ * panic. A pointer to the first byte of the extra data is returned.
+ */
+void *skb_push(struct sk_buff *skb, unsigned int len)
+{
+ skb->data -= len;
+ skb->len += len;
+ if (unlikely(skb->data < skb->head))
+ skb_under_panic(skb, len, __builtin_return_address(0));
+ return skb->data;
+}
+EXPORT_SYMBOL(skb_push);
+
+/**
+ * skb_pull - remove data from the start of a buffer
+ * @skb: buffer to use
+ * @len: amount of data to remove
+ *
+ * This function removes data from the start of a buffer, returning
+ * the memory to the headroom. A pointer to the next data in the buffer
+ * is returned. Once the data has been pulled future pushes will overwrite
+ * the old data.
+ */
+void *skb_pull(struct sk_buff *skb, unsigned int len)
+{
+ return skb_pull_inline(skb, len);
+}
+EXPORT_SYMBOL(skb_pull);
+
+/**
+ * skb_trim - remove end from a buffer
+ * @skb: buffer to alter
+ * @len: new length
+ *
+ * Cut the length of a buffer down by removing data from the tail. If
+ * the buffer is already under the length specified it is not modified.
+ * The skb must be linear.
+ */
+void skb_trim(struct sk_buff *skb, unsigned int len)
+{
+ if (skb->len > len)
+ __skb_trim(skb, len);
+}
+EXPORT_SYMBOL(skb_trim);
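+
+/* Illustrative sketch (not part of the original file): how the basic data
+ * area helpers above usually cooperate when building and parsing a frame.
+ * Sizes and buffers are made up for the example.
+ *
+ *	struct sk_buff *skb = alloc_skb(hdr_len + data_len, GFP_KERNEL);
+ *
+ *	if (!skb)
+ *		return -ENOMEM;
+ *	skb_reserve(skb, hdr_len);			// leave headroom
+ *	memcpy(skb_put(skb, data_len), data, data_len);	// append the payload
+ *	memcpy(skb_push(skb, hdr_len), hdr, hdr_len);	// prepend the header
+ *
+ *	// on receive, the reverse operations:
+ *	skb_pull(skb, hdr_len);		// consume the header
+ *	skb_trim(skb, payload_len);	// drop any trailing padding
+ */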
+
+/* Trims skb to length len. It can change skb pointers.
+ */
+
+int ___pskb_trim(struct sk_buff *skb, unsigned int len)
+{
+ struct sk_buff **fragp;
+ struct sk_buff *frag;
+ int offset = skb_headlen(skb);
+ int nfrags = skb_shinfo(skb)->nr_frags;
+ int i;
+ int err;
+
+ if (skb_cloned(skb) &&
+ unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
+ return err;
+
+ i = 0;
+ if (offset >= len)
+ goto drop_pages;
+
+ for (; i < nfrags; i++) {
+ int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (end < len) {
+ offset = end;
+ continue;
+ }
+
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);
+
+drop_pages:
+ skb_shinfo(skb)->nr_frags = i;
+
+ for (; i < nfrags; i++)
+ skb_frag_unref(skb, i);
+
+ if (skb_has_frag_list(skb))
+ skb_drop_fraglist(skb);
+ goto done;
+ }
+
+ for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
+ fragp = &frag->next) {
+ int end = offset + frag->len;
+
+ if (skb_shared(frag)) {
+ struct sk_buff *nfrag;
+
+ nfrag = skb_clone(frag, GFP_ATOMIC);
+ if (unlikely(!nfrag))
+ return -ENOMEM;
+
+ nfrag->next = frag->next;
+ consume_skb(frag);
+ frag = nfrag;
+ *fragp = frag;
+ }
+
+ if (end < len) {
+ offset = end;
+ continue;
+ }
+
+ if (end > len &&
+ unlikely((err = pskb_trim(frag, len - offset))))
+ return err;
+
+ if (frag->next)
+ skb_drop_list(&frag->next);
+ break;
+ }
+
+done:
+ if (len > skb_headlen(skb)) {
+ skb->data_len -= skb->len - len;
+ skb->len = len;
+ } else {
+ skb->len = len;
+ skb->data_len = 0;
+ skb_set_tail_pointer(skb, len);
+ }
+
+ if (!skb->sk || skb->destructor == sock_edemux)
+ skb_condense(skb);
+ return 0;
+}
+EXPORT_SYMBOL(___pskb_trim);
+
+/* Note : use pskb_trim_rcsum() instead of calling this directly
+ */
+int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
+{
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ int delta = skb->len - len;
+
+ skb->csum = csum_block_sub(skb->csum,
+ skb_checksum(skb, len, delta, 0),
+ len);
+ } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len;
+ int offset = skb_checksum_start_offset(skb) + skb->csum_offset;
+
+ if (offset + sizeof(__sum16) > hdlen)
+ return -EINVAL;
+ }
+ return __pskb_trim(skb, len);
+}
+EXPORT_SYMBOL(pskb_trim_rcsum_slow);
+
+/**
+ * __pskb_pull_tail - advance tail of skb header
+ * @skb: buffer to reallocate
+ * @delta: number of bytes to advance tail
+ *
+ * This function only makes sense on a fragmented &sk_buff;
+ * it expands the header, moving its tail forward and copying the
+ * necessary data from the fragmented part.
+ *
+ * &sk_buff MUST have reference count of 1.
+ *
+ * Returns %NULL (and &sk_buff does not change) if pull failed
+ * or value of new tail of skb in the case of success.
+ *
+ * All the pointers pointing into skb header may change and must be
+ * reloaded after call to this function.
+ */
+
+/* Moves the tail of the skb head forward, copying data from the
+ * fragmented part when necessary.
+ * 1. It may fail due to an allocation failure.
+ * 2. It may change skb pointers.
+ *
+ * It is pretty complicated. Luckily, it is called only in exceptional cases.
+ */
+void *__pskb_pull_tail(struct sk_buff *skb, int delta)
+{
+ /* If the skb does not have enough free space at the tail, get a new
+ * head plus 128 bytes for future expansions. If there is enough
+ * room at the tail, reallocate without expansion only if the skb is
+ * cloned.
+ */
+ int i, k, eat = (skb->tail + delta) - skb->end;
+
+ if (eat > 0 || skb_cloned(skb)) {
+ if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
+ GFP_ATOMIC))
+ return NULL;
+ }
+
+ BUG_ON(skb_copy_bits(skb, skb_headlen(skb),
+ skb_tail_pointer(skb), delta));
+
+ /* Optimization: no fragments, so there is no reason to pre-estimate
+ * the size of the pulled pages. Superb.
+ */
+ if (!skb_has_frag_list(skb))
+ goto pull_pages;
+
+ /* Estimate size of pulled pages. */
+ eat = delta;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (size >= eat)
+ goto pull_pages;
+ eat -= size;
+ }
+
+ /* If we need to update the frag list, we are in trouble.
+ * Certainly, it is possible to add an offset to the skb data,
+ * but taking into account that pulling is expected to
+ * be a very rare operation, it is worth fighting against
+ * further bloating of the skb head and crucifying ourselves here
+ * instead. Pure masochism, indeed. 8)8)
+ */
+ if (eat) {
+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
+ struct sk_buff *clone = NULL;
+ struct sk_buff *insp = NULL;
+
+ do {
+ BUG_ON(!list);
+
+ if (list->len <= eat) {
+ /* Eaten as whole. */
+ eat -= list->len;
+ list = list->next;
+ insp = list;
+ } else {
+ /* Eaten partially. */
+
+ if (skb_shared(list)) {
+ /* Sucks! We need to fork the list. :-( */
+ clone = skb_clone(list, GFP_ATOMIC);
+ if (!clone)
+ return NULL;
+ insp = list->next;
+ list = clone;
+ } else {
+ /* This may be pulled without
+ * problems. */
+ insp = list;
+ }
+ if (!pskb_pull(list, eat)) {
+ kfree_skb(clone);
+ return NULL;
+ }
+ break;
+ }
+ } while (eat);
+
+ /* Free pulled out fragments. */
+ while ((list = skb_shinfo(skb)->frag_list) != insp) {
+ skb_shinfo(skb)->frag_list = list->next;
+ consume_skb(list);
+ }
+ /* And insert new clone at head. */
+ if (clone) {
+ clone->next = list;
+ skb_shinfo(skb)->frag_list = clone;
+ }
+ }
+ /* Success! Now we may commit changes to skb data. */
+
+pull_pages:
+ eat = delta;
+ k = 0;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (size <= eat) {
+ skb_frag_unref(skb, i);
+ eat -= size;
+ } else {
+ skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
+ if (eat) {
+ skb_shinfo(skb)->frags[k].page_offset += eat;
+ skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
+ if (!i)
+ goto end;
+ eat = 0;
+ }
+ k++;
+ }
+ }
+ skb_shinfo(skb)->nr_frags = k;
+
+end:
+ skb->tail += delta;
+ skb->data_len -= delta;
+
+ if (!skb->data_len)
+ skb_zcopy_clear(skb, false);
+
+ return skb_tail_pointer(skb);
+}
+EXPORT_SYMBOL(__pskb_pull_tail);
+
+/**
+ * skb_copy_bits - copy bits from skb to kernel buffer
+ * @skb: source skb
+ * @offset: offset in source
+ * @to: destination buffer
+ * @len: number of bytes to copy
+ *
+ * Copy the specified number of bytes from the source skb to the
+ * destination buffer.
+ *
+ * CAUTION:
+ * If its prototype is ever changed,
+ * check arch/{*}/net/{*}.S files,
+ * since it is called from BPF assembly code.
+ */
+int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
+{
+ int start = skb_headlen(skb);
+ struct sk_buff *frag_iter;
+ int i, copy;
+
+ if (offset > (int)skb->len - len)
+ goto fault;
+
+ /* Copy header. */
+ if ((copy = start - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ skb_copy_from_linear_data_offset(skb, offset, to, copy);
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ to += copy;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ skb_frag_t *f = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(f);
+ if ((copy = end - offset) > 0) {
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
+
+ if (copy > len)
+ copy = len;
+
+ skb_frag_foreach_page(f,
+ f->page_offset + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_atomic(p);
+ memcpy(to + copied, vaddr + p_off, p_len);
+ kunmap_atomic(vaddr);
+ }
+
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ to += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_copy_bits(frag_iter, offset - start, to, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ to += copy;
+ }
+ start = end;
+ }
+
+ if (!len)
+ return 0;
+
+fault:
+ return -EFAULT;
+}
+EXPORT_SYMBOL(skb_copy_bits);
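+
+/* Illustrative sketch (not part of the original file): skb_copy_bits()
+ * flattens part of a possibly fragmented skb into a caller buffer, e.g.
+ * to read a header that might not be entirely linear.  The header type
+ * is an assumption for the example.
+ *
+ *	struct udphdr uh;
+ *
+ *	if (skb_copy_bits(skb, skb_transport_offset(skb), &uh, sizeof(uh)))
+ *		return -EFAULT;		// offset/length outside the skb
+ */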
+
+/*
+ * Callback from splice_to_pipe(), if we need to release some pages
+ * at the end of the spd in case we errored out while filling the pipe.
+ */
+static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+ put_page(spd->pages[i]);
+}
+
+static struct page *linear_to_page(struct page *page, unsigned int *len,
+ unsigned int *offset,
+ struct sock *sk)
+{
+ struct page_frag *pfrag = sk_page_frag(sk);
+
+ if (!sk_page_frag_refill(sk, pfrag))
+ return NULL;
+
+ *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);
+
+ memcpy(page_address(pfrag->page) + pfrag->offset,
+ page_address(page) + *offset, *len);
+ *offset = pfrag->offset;
+ pfrag->offset += *len;
+
+ return pfrag->page;
+}
+
+static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
+ struct page *page,
+ unsigned int offset)
+{
+ return spd->nr_pages &&
+ spd->pages[spd->nr_pages - 1] == page &&
+ (spd->partial[spd->nr_pages - 1].offset +
+ spd->partial[spd->nr_pages - 1].len == offset);
+}
+
+/*
+ * Fill page/offset/length into spd, if it can hold more pages.
+ */
+static bool spd_fill_page(struct splice_pipe_desc *spd,
+ struct pipe_inode_info *pipe, struct page *page,
+ unsigned int *len, unsigned int offset,
+ bool linear,
+ struct sock *sk)
+{
+ if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
+ return true;
+
+ if (linear) {
+ page = linear_to_page(page, len, &offset, sk);
+ if (!page)
+ return true;
+ }
+ if (spd_can_coalesce(spd, page, offset)) {
+ spd->partial[spd->nr_pages - 1].len += *len;
+ return false;
+ }
+ get_page(page);
+ spd->pages[spd->nr_pages] = page;
+ spd->partial[spd->nr_pages].len = *len;
+ spd->partial[spd->nr_pages].offset = offset;
+ spd->nr_pages++;
+
+ return false;
+}
+
+static bool __splice_segment(struct page *page, unsigned int poff,
+ unsigned int plen, unsigned int *off,
+ unsigned int *len,
+ struct splice_pipe_desc *spd, bool linear,
+ struct sock *sk,
+ struct pipe_inode_info *pipe)
+{
+ if (!*len)
+ return true;
+
+ /* skip this segment if already processed */
+ if (*off >= plen) {
+ *off -= plen;
+ return false;
+ }
+
+ /* ignore any bits we already processed */
+ poff += *off;
+ plen -= *off;
+ *off = 0;
+
+ do {
+ unsigned int flen = min(*len, plen);
+
+ if (spd_fill_page(spd, pipe, page, &flen, poff,
+ linear, sk))
+ return true;
+ poff += flen;
+ plen -= flen;
+ *len -= flen;
+ } while (*len && plen);
+
+ return false;
+}
+
+/*
+ * Map linear and fragment data from the skb to spd. It reports true if the
+ * pipe is full or if we already spliced the requested length.
+ */
+static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
+ unsigned int *offset, unsigned int *len,
+ struct splice_pipe_desc *spd, struct sock *sk)
+{
+ int seg;
+ struct sk_buff *iter;
+
+ /* map the linear part:
+ * If skb->head_frag is set, this 'linear' part is backed by a
+ * fragment, and if the head is not shared with any clones then
+ * we can avoid a copy since we own the head portion of this page.
+ */
+ if (__splice_segment(virt_to_page(skb->data),
+ (unsigned long) skb->data & (PAGE_SIZE - 1),
+ skb_headlen(skb),
+ offset, len, spd,
+ skb_head_is_locked(skb),
+ sk, pipe))
+ return true;
+
+ /*
+ * then map the fragments
+ */
+ for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
+ const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
+
+ if (__splice_segment(skb_frag_page(f),
+ f->page_offset, skb_frag_size(f),
+ offset, len, spd, false, sk, pipe))
+ return true;
+ }
+
+ skb_walk_frags(skb, iter) {
+ if (*offset >= iter->len) {
+ *offset -= iter->len;
+ continue;
+ }
+ /* __skb_splice_bits() only fails if the output has no room
+ * left, so no point in going over the frag_list for the error
+ * case.
+ */
+ if (__skb_splice_bits(iter, pipe, offset, len, spd, sk))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Map data from the skb to a pipe. Should handle the linear part,
+ * the fragments, and the frag list.
+ */
+int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
+ struct pipe_inode_info *pipe, unsigned int tlen,
+ unsigned int flags)
+{
+ struct partial_page partial[MAX_SKB_FRAGS];
+ struct page *pages[MAX_SKB_FRAGS];
+ struct splice_pipe_desc spd = {
+ .pages = pages,
+ .partial = partial,
+ .nr_pages_max = MAX_SKB_FRAGS,
+ .ops = &nosteal_pipe_buf_ops,
+ .spd_release = sock_spd_release,
+ };
+ int ret = 0;
+
+ __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);
+
+ if (spd.nr_pages)
+ ret = splice_to_pipe(pipe, &spd);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(skb_splice_bits);
+
+/* Send skb data on a socket. Socket must be locked. */
+int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
+ int len)
+{
+ unsigned int orig_len = len;
+ struct sk_buff *head = skb;
+ unsigned short fragidx;
+ int slen, ret;
+
+do_frag_list:
+
+ /* Deal with head data */
+ while (offset < skb_headlen(skb) && len) {
+ struct kvec kv;
+ struct msghdr msg;
+
+ slen = min_t(int, len, skb_headlen(skb) - offset);
+ kv.iov_base = skb->data + offset;
+ kv.iov_len = slen;
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_flags = MSG_DONTWAIT;
+
+ ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen);
+ if (ret <= 0)
+ goto error;
+
+ offset += ret;
+ len -= ret;
+ }
+
+ /* All the data was skb head? */
+ if (!len)
+ goto out;
+
+ /* Make offset relative to start of frags */
+ offset -= skb_headlen(skb);
+
+ /* Find where we are in frag list */
+ for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx];
+
+ if (offset < frag->size)
+ break;
+
+ offset -= frag->size;
+ }
+
+ for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx];
+
+ slen = min_t(size_t, len, frag->size - offset);
+
+ while (slen) {
+ ret = kernel_sendpage_locked(sk, frag->page.p,
+ frag->page_offset + offset,
+ slen, MSG_DONTWAIT);
+ if (ret <= 0)
+ goto error;
+
+ len -= ret;
+ offset += ret;
+ slen -= ret;
+ }
+
+ offset = 0;
+ }
+
+ if (len) {
+ /* Process any frag lists */
+
+ if (skb == head) {
+ if (skb_has_frag_list(skb)) {
+ skb = skb_shinfo(skb)->frag_list;
+ goto do_frag_list;
+ }
+ } else if (skb->next) {
+ skb = skb->next;
+ goto do_frag_list;
+ }
+ }
+
+out:
+ return orig_len - len;
+
+error:
+ return orig_len == len ? ret : orig_len - len;
+}
+EXPORT_SYMBOL_GPL(skb_send_sock_locked);
+
+/* Send skb data on a socket. */
+int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
+{
+ int ret = 0;
+
+ lock_sock(sk);
+ ret = skb_send_sock_locked(sk, skb, offset, len);
+ release_sock(sk);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(skb_send_sock);
+
+/**
+ * skb_store_bits - store bits from kernel buffer to skb
+ * @skb: destination buffer
+ * @offset: offset in destination
+ * @from: source buffer
+ * @len: number of bytes to copy
+ *
+ * Copy the specified number of bytes from the source buffer to the
+ * destination skb. This function handles all the messy bits of
+ * traversing fragment lists and such.
+ */
+
+int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
+{
+ int start = skb_headlen(skb);
+ struct sk_buff *frag_iter;
+ int i, copy;
+
+ if (offset > (int)skb->len - len)
+ goto fault;
+
+ if ((copy = start - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ skb_copy_to_linear_data_offset(skb, offset, from, copy);
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ from += copy;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ if ((copy = end - offset) > 0) {
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
+
+ if (copy > len)
+ copy = len;
+
+ skb_frag_foreach_page(frag,
+ frag->page_offset + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_atomic(p);
+ memcpy(vaddr + p_off, from + copied, p_len);
+ kunmap_atomic(vaddr);
+ }
+
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ from += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_store_bits(frag_iter, offset - start,
+ from, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ from += copy;
+ }
+ start = end;
+ }
+ if (!len)
+ return 0;
+
+fault:
+ return -EFAULT;
+}
+EXPORT_SYMBOL(skb_store_bits);
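+
+/* Illustrative sketch (not part of the original file): skb_store_bits()
+ * is the write-side counterpart of skb_copy_bits(), useful when the field
+ * to patch may fall inside a fragment.  The offset and value below are
+ * assumptions for the example.
+ *
+ *	__be16 new_id = htons(42);
+ *
+ *	if (skb_store_bits(skb, id_offset, &new_id, sizeof(new_id)))
+ *		return -EFAULT;		// offset/length outside the skb
+ */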
+
+/* Checksum skb data. */
+__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
+ __wsum csum, const struct skb_checksum_ops *ops)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+ int pos = 0;
+
+ /* Checksum header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ csum = ops->update(skb->data + offset, copy, csum);
+ if ((len -= copy) == 0)
+ return csum;
+ offset += copy;
+ pos = copy;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ if ((copy = end - offset) > 0) {
+ u32 p_off, p_len, copied;
+ struct page *p;
+ __wsum csum2;
+ u8 *vaddr;
+
+ if (copy > len)
+ copy = len;
+
+ skb_frag_foreach_page(frag,
+ frag->page_offset + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_atomic(p);
+ csum2 = ops->update(vaddr + p_off, p_len, 0);
+ kunmap_atomic(vaddr);
+ csum = ops->combine(csum, csum2, pos, p_len);
+ pos += p_len;
+ }
+
+ if (!(len -= copy))
+ return csum;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ __wsum csum2;
+ if (copy > len)
+ copy = len;
+ csum2 = __skb_checksum(frag_iter, offset - start,
+ copy, 0, ops);
+ csum = ops->combine(csum, csum2, pos, copy);
+ if ((len -= copy) == 0)
+ return csum;
+ offset += copy;
+ pos += copy;
+ }
+ start = end;
+ }
+ BUG_ON(len);
+
+ return csum;
+}
+EXPORT_SYMBOL(__skb_checksum);
+
+__wsum skb_checksum(const struct sk_buff *skb, int offset,
+ int len, __wsum csum)
+{
+ const struct skb_checksum_ops ops = {
+ .update = csum_partial_ext,
+ .combine = csum_block_add_ext,
+ };
+
+ return __skb_checksum(skb, offset, len, csum, &ops);
+}
+EXPORT_SYMBOL(skb_checksum);
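+
+/* Illustrative sketch (not part of the original file): computing a folded
+ * Internet checksum over the transport payload with skb_checksum().  The
+ * offset is an assumption for the example.
+ *
+ *	int off = skb_transport_offset(skb);
+ *	__wsum csum = skb_checksum(skb, off, skb->len - off, 0);
+ *	__sum16 folded = csum_fold(csum);	// 16-bit one's complement result
+ */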
+
+/* Both of above in one bottle. */
+
+__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
+ u8 *to, int len, __wsum csum)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+ int pos = 0;
+
+ /* Copy header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ csum = csum_partial_copy_nocheck(skb->data + offset, to,
+ copy, csum);
+ if ((len -= copy) == 0)
+ return csum;
+ offset += copy;
+ to += copy;
+ pos = copy;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ if ((copy = end - offset) > 0) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ u32 p_off, p_len, copied;
+ struct page *p;
+ __wsum csum2;
+ u8 *vaddr;
+
+ if (copy > len)
+ copy = len;
+
+ skb_frag_foreach_page(frag,
+ frag->page_offset + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_atomic(p);
+ csum2 = csum_partial_copy_nocheck(vaddr + p_off,
+ to + copied,
+ p_len, 0);
+ kunmap_atomic(vaddr);
+ csum = csum_block_add(csum, csum2, pos);
+ pos += p_len;
+ }
+
+ if (!(len -= copy))
+ return csum;
+ offset += copy;
+ to += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ __wsum csum2;
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ csum2 = skb_copy_and_csum_bits(frag_iter,
+ offset - start,
+ to, copy, 0);
+ csum = csum_block_add(csum, csum2, pos);
+ if ((len -= copy) == 0)
+ return csum;
+ offset += copy;
+ to += copy;
+ pos += copy;
+ }
+ start = end;
+ }
+ BUG_ON(len);
+ return csum;
+}
+EXPORT_SYMBOL(skb_copy_and_csum_bits);
+
+static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum)
+{
+ net_warn_ratelimited(
+ "%s: attempt to compute crc32c without libcrc32c.ko\n",
+ __func__);
+ return 0;
+}
+
+static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2,
+ int offset, int len)
+{
+ net_warn_ratelimited(
+ "%s: attempt to compute crc32c without libcrc32c.ko\n",
+ __func__);
+ return 0;
+}
+
+static const struct skb_checksum_ops default_crc32c_ops = {
+ .update = warn_crc32c_csum_update,
+ .combine = warn_crc32c_csum_combine,
+};
+
+const struct skb_checksum_ops *crc32c_csum_stub __read_mostly =
+ &default_crc32c_ops;
+EXPORT_SYMBOL(crc32c_csum_stub);
+
+/**
+ * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
+ * @from: source buffer
+ *
+ * Calculates the amount of linear headroom needed in the 'to' skb passed
+ * into skb_zerocopy().
+ */
+unsigned int
+skb_zerocopy_headlen(const struct sk_buff *from)
+{
+ unsigned int hlen = 0;
+
+ if (!from->head_frag ||
+ skb_headlen(from) < L1_CACHE_BYTES ||
+ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) {
+ hlen = skb_headlen(from);
+ if (!hlen)
+ hlen = from->len;
+ }
+
+ if (skb_has_frag_list(from))
+ hlen = from->len;
+
+ return hlen;
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);
+
+/**
+ * skb_zerocopy - Zero copy skb to skb
+ * @to: destination buffer
+ * @from: source buffer
+ * @len: number of bytes to copy from source buffer
+ * @hlen: size of linear headroom in destination buffer
+ *
+ * Copies up to `len` bytes from `from` to `to` by creating references
+ * to the frags in the source buffer.
+ *
+ * The `hlen` as calculated by skb_zerocopy_headlen() specifies the
+ * headroom in the `to` buffer.
+ *
+ * Return value:
+ * 0: everything is OK
+ * -ENOMEM: couldn't orphan frags of @from due to lack of memory
+ * -EFAULT: skb_copy_bits() found some problem with skb geometry
+ */
+int
+skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
+{
+ int i, j = 0;
+ int plen = 0; /* length of skb->head fragment */
+ int ret;
+ struct page *page;
+ unsigned int offset;
+
+ BUG_ON(!from->head_frag && !hlen);
+
+ /* don't bother with small payloads */
+ if (len <= skb_tailroom(to))
+ return skb_copy_bits(from, 0, skb_put(to, len), len);
+
+ if (hlen) {
+ ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
+ if (unlikely(ret))
+ return ret;
+ len -= hlen;
+ } else {
+ plen = min_t(int, skb_headlen(from), len);
+ if (plen) {
+ page = virt_to_head_page(from->head);
+ offset = from->data - (unsigned char *)page_address(page);
+ __skb_fill_page_desc(to, 0, page, offset, plen);
+ get_page(page);
+ j = 1;
+ len -= plen;
+ }
+ }
+
+ to->truesize += len + plen;
+ to->len += len + plen;
+ to->data_len += len + plen;
+
+ if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
+ skb_tx_error(from);
+ return -ENOMEM;
+ }
+ skb_zerocopy_clone(to, from, GFP_ATOMIC);
+
+ for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
+ if (!len)
+ break;
+ skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
+ skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len);
+ len -= skb_shinfo(to)->frags[j].size;
+ skb_frag_ref(to, j);
+ j++;
+ }
+ skb_shinfo(to)->nr_frags = j;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy);
+
+void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
+{
+ __wsum csum;
+ long csstart;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ csstart = skb_checksum_start_offset(skb);
+ else
+ csstart = skb_headlen(skb);
+
+ BUG_ON(csstart > skb_headlen(skb));
+
+ skb_copy_from_linear_data(skb, to, csstart);
+
+ csum = 0;
+ if (csstart != skb->len)
+ csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
+ skb->len - csstart, 0);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ long csstuff = csstart + skb->csum_offset;
+
+ *((__sum16 *)(to + csstuff)) = csum_fold(csum);
+ }
+}
+EXPORT_SYMBOL(skb_copy_and_csum_dev);
+
+/**
+ * skb_dequeue - remove from the head of the queue
+ * @list: list to dequeue from
+ *
+ * Remove the head of the list. The list lock is taken so the function
+ * may be used safely with other locking list functions. The head item is
+ * returned or %NULL if the list is empty.
+ */
+
+struct sk_buff *skb_dequeue(struct sk_buff_head *list)
+{
+ unsigned long flags;
+ struct sk_buff *result;
+
+ spin_lock_irqsave(&list->lock, flags);
+ result = __skb_dequeue(list);
+ spin_unlock_irqrestore(&list->lock, flags);
+ return result;
+}
+EXPORT_SYMBOL(skb_dequeue);
+
+/**
+ * skb_dequeue_tail - remove from the tail of the queue
+ * @list: list to dequeue from
+ *
+ * Remove the tail of the list. The list lock is taken so the function
+ * may be used safely with other locking list functions. The tail item is
+ * returned or %NULL if the list is empty.
+ */
+struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
+{
+ unsigned long flags;
+ struct sk_buff *result;
+
+ spin_lock_irqsave(&list->lock, flags);
+ result = __skb_dequeue_tail(list);
+ spin_unlock_irqrestore(&list->lock, flags);
+ return result;
+}
+EXPORT_SYMBOL(skb_dequeue_tail);
+
+/**
+ * skb_queue_purge - empty a list
+ * @list: list to empty
+ *
+ * Delete all buffers on an &sk_buff list. Each buffer is removed from
+ * the list and one reference dropped. This function takes the list
+ * lock and is atomic with respect to other list locking functions.
+ */
+void skb_queue_purge(struct sk_buff_head *list)
+{
+ struct sk_buff *skb;
+ while ((skb = skb_dequeue(list)) != NULL)
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL(skb_queue_purge);
+
+/**
+ * skb_rbtree_purge - empty a skb rbtree
+ * @root: root of the rbtree to empty
+ * Return value: the sum of truesizes of all purged skbs.
+ *
+ * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
+ * the list and one reference dropped. This function does not take
+ * any lock. Synchronization should be handled by the caller (e.g., TCP
+ * out-of-order queue is protected by the socket lock).
+ */
+unsigned int skb_rbtree_purge(struct rb_root *root)
+{
+ struct rb_node *p = rb_first(root);
+ unsigned int sum = 0;
+
+ while (p) {
+ struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
+
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, root);
+ sum += skb->truesize;
+ kfree_skb(skb);
+ }
+ return sum;
+}
+
+/**
+ * skb_queue_head - queue a buffer at the list head
+ * @list: list to use
+ * @newsk: buffer to queue
+ *
+ * Queue a buffer at the start of the list. This function takes the
+ * list lock and can be used safely with other locking &sk_buff
+ * functions.
+ *
+ * A buffer cannot be placed on two lists at the same time.
+ */
+void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&list->lock, flags);
+ __skb_queue_head(list, newsk);
+ spin_unlock_irqrestore(&list->lock, flags);
+}
+EXPORT_SYMBOL(skb_queue_head);
+
+/**
+ * skb_queue_tail - queue a buffer at the list tail
+ * @list: list to use
+ * @newsk: buffer to queue
+ *
+ * Queue a buffer at the tail of the list. This function takes the
+ * list lock and can be used safely with other locking &sk_buff
+ * functions.
+ *
+ * A buffer cannot be placed on two lists at the same time.
+ */
+void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&list->lock, flags);
+ __skb_queue_tail(list, newsk);
+ spin_unlock_irqrestore(&list->lock, flags);
+}
+EXPORT_SYMBOL(skb_queue_tail);
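+
+/* Illustrative sketch (not part of the original file): the usual
+ * producer/consumer pattern built on the locked queue helpers above.  The
+ * queue and the handler are assumptions for the example.
+ *
+ *	static struct sk_buff_head example_queue;
+ *
+ *	skb_queue_head_init(&example_queue);
+ *
+ *	// producer side (may run in interrupt context):
+ *	skb_queue_tail(&example_queue, skb);
+ *
+ *	// consumer side (e.g. a work item):
+ *	while ((skb = skb_dequeue(&example_queue)) != NULL)
+ *		example_process(skb);	// hypothetical handler
+ */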
+
+/**
+ * skb_unlink - remove a buffer from a list
+ * @skb: buffer to remove
+ * @list: list to use
+ *
+ * Remove a packet from a list. The list locks are taken and this
+ * function is atomic with respect to other list locked calls.
+ *
+ * You must know what list the SKB is on.
+ */
+void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&list->lock, flags);
+ __skb_unlink(skb, list);
+ spin_unlock_irqrestore(&list->lock, flags);
+}
+EXPORT_SYMBOL(skb_unlink);
+
+/**
+ * skb_append - append a buffer
+ * @old: buffer to insert after
+ * @newsk: buffer to insert
+ * @list: list to use
+ *
+ * Place a packet after a given packet in a list. The list locks are taken
+ * and this function is atomic with respect to other list locked calls.
+ * A buffer cannot be placed on two lists at the same time.
+ */
+void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&list->lock, flags);
+ __skb_queue_after(list, old, newsk);
+ spin_unlock_irqrestore(&list->lock, flags);
+}
+EXPORT_SYMBOL(skb_append);
+
+/**
+ * skb_insert - insert a buffer
+ * @old: buffer to insert before
+ * @newsk: buffer to insert
+ * @list: list to use
+ *
+ * Place a packet before a given packet in a list. The list locks are
+ * taken and this function is atomic with respect to other list locked
+ * calls.
+ *
+ * A buffer cannot be placed on two lists at the same time.
+ */
+void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&list->lock, flags);
+ __skb_insert(newsk, old->prev, old, list);
+ spin_unlock_irqrestore(&list->lock, flags);
+}
+EXPORT_SYMBOL(skb_insert);
+
+static inline void skb_split_inside_header(struct sk_buff *skb,
+ struct sk_buff* skb1,
+ const u32 len, const int pos)
+{
+ int i;
+
+ skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
+ pos - len);
+ /* And move data appendix as is. */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
+
+ skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
+ skb_shinfo(skb)->nr_frags = 0;
+ skb1->data_len = skb->data_len;
+ skb1->len += skb1->data_len;
+ skb->data_len = 0;
+ skb->len = len;
+ skb_set_tail_pointer(skb, len);
+}
+
+static inline void skb_split_no_header(struct sk_buff *skb,
+ struct sk_buff* skb1,
+ const u32 len, int pos)
+{
+ int i, k = 0;
+ const int nfrags = skb_shinfo(skb)->nr_frags;
+
+ skb_shinfo(skb)->nr_frags = 0;
+ skb1->len = skb1->data_len = skb->len - len;
+ skb->len = len;
+ skb->data_len = len - pos;
+
+ for (i = 0; i < nfrags; i++) {
+ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (pos + size > len) {
+ skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
+
+ if (pos < len) {
+ /* Split frag.
+ * We have two variants in this case:
+ * 1. Move all of the frag to the second
+ * part, if possible. E.g. this
+ * approach is mandatory for TUX,
+ * where splitting is expensive.
+ * 2. Split accurately. This is what
+ * we do here.
+ */
+ skb_frag_ref(skb, i);
+ skb_shinfo(skb1)->frags[0].page_offset += len - pos;
+ skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
+ skb_shinfo(skb)->nr_frags++;
+ }
+ k++;
+ } else
+ skb_shinfo(skb)->nr_frags++;
+ pos += size;
+ }
+ skb_shinfo(skb1)->nr_frags = k;
+}
+
+/**
+ * skb_split - Split fragmented skb to two parts at length len.
+ * @skb: the buffer to split
+ * @skb1: the buffer to receive the second part
+ * @len: new length for skb
+ */
+void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
+{
+ int pos = skb_headlen(skb);
+
+ skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags &
+ SKBTX_SHARED_FRAG;
+ skb_zerocopy_clone(skb1, skb, 0);
+ if (len < pos) /* Split line is inside header. */
+ skb_split_inside_header(skb, skb1, len, pos);
+ else /* Second chunk has no header, nothing to copy. */
+ skb_split_no_header(skb, skb1, len, pos);
+}
+EXPORT_SYMBOL(skb_split);
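+
+/* Illustrative sketch (not part of the original file): skb_split() moves
+ * everything past @len into a second, pre-allocated skb, roughly as TCP
+ * does when fragmenting a queued segment.  The allocation size and flags
+ * are assumptions for the example; skb2 needs enough linear space in case
+ * the split point falls inside the header.
+ *
+ *	struct sk_buff *skb2 = alloc_skb(skb_headlen(skb), GFP_ATOMIC);
+ *
+ *	if (!skb2)
+ *		return -ENOMEM;
+ *	skb_split(skb, skb2, split_len);	// skb keeps the first split_len bytes
+ */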
+
+/* Shifting from/to a cloned skb is a no-go.
+ *
+ * Caller cannot keep skb_shinfo related pointers past calling here!
+ */
+static int skb_prepare_for_shift(struct sk_buff *skb)
+{
+ int ret = 0;
+
+ if (skb_cloned(skb)) {
+ /* Save and restore truesize: pskb_expand_head() may reallocate
+ * memory where ksize(kmalloc(S)) != ksize(kmalloc(S)), but we
+ * cannot change truesize at this point.
+ */
+ unsigned int save_truesize = skb->truesize;
+
+ ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+ skb->truesize = save_truesize;
+ }
+ return ret;
+}
+
+/**
+ * skb_shift - Shifts paged data partially from skb to another
+ * @tgt: buffer into which tail data gets added
+ * @skb: buffer from which the paged data comes from
+ * @shiftlen: shift up to this many bytes
+ *
+ * Attempts to shift up to shiftlen worth of bytes, which may be less than
+ * the length of the skb, from skb to tgt. Returns the number of bytes
+ * shifted. It's up to the caller to free skb if everything was shifted.
+ *
+ * If @tgt runs out of frags, the whole operation is aborted.
+ *
+ * The skb cannot contain anything but paged data, while tgt is allowed
+ * to have non-paged data as well.
+ *
+ * TODO: full sized shift could be optimized but that would need
+ * specialized skb free'er to handle frags without up-to-date nr_frags.
+ */
+int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
+{
+ int from, to, merge, todo;
+ struct skb_frag_struct *fragfrom, *fragto;
+
+ BUG_ON(shiftlen > skb->len);
+
+ if (skb_headlen(skb))
+ return 0;
+ if (skb_zcopy(tgt) || skb_zcopy(skb))
+ return 0;
+
+ todo = shiftlen;
+ from = 0;
+ to = skb_shinfo(tgt)->nr_frags;
+ fragfrom = &skb_shinfo(skb)->frags[from];
+
+ /* Actual merge is delayed until the point when we know we can
+ * commit all, so that we don't have to undo partial changes
+ */
+ if (!to ||
+ !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
+ fragfrom->page_offset)) {
+ merge = -1;
+ } else {
+ merge = to - 1;
+
+ todo -= skb_frag_size(fragfrom);
+ if (todo < 0) {
+ if (skb_prepare_for_shift(skb) ||
+ skb_prepare_for_shift(tgt))
+ return 0;
+
+ /* All previous frag pointers might be stale! */
+ fragfrom = &skb_shinfo(skb)->frags[from];
+ fragto = &skb_shinfo(tgt)->frags[merge];
+
+ skb_frag_size_add(fragto, shiftlen);
+ skb_frag_size_sub(fragfrom, shiftlen);
+ fragfrom->page_offset += shiftlen;
+
+ goto onlymerged;
+ }
+
+ from++;
+ }
+
+ /* Skip a full, non-fitting skb to avoid expensive operations */
+ if ((shiftlen == skb->len) &&
+ (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
+ return 0;
+
+ if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
+ return 0;
+
+ while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
+ if (to == MAX_SKB_FRAGS)
+ return 0;
+
+ fragfrom = &skb_shinfo(skb)->frags[from];
+ fragto = &skb_shinfo(tgt)->frags[to];
+
+ if (todo >= skb_frag_size(fragfrom)) {
+ *fragto = *fragfrom;
+ todo -= skb_frag_size(fragfrom);
+ from++;
+ to++;
+
+ } else {
+ __skb_frag_ref(fragfrom);
+ fragto->page = fragfrom->page;
+ fragto->page_offset = fragfrom->page_offset;
+ skb_frag_size_set(fragto, todo);
+
+ fragfrom->page_offset += todo;
+ skb_frag_size_sub(fragfrom, todo);
+ todo = 0;
+
+ to++;
+ break;
+ }
+ }
+
+ /* Ready to "commit" this state change to tgt */
+ skb_shinfo(tgt)->nr_frags = to;
+
+ if (merge >= 0) {
+ fragfrom = &skb_shinfo(skb)->frags[0];
+ fragto = &skb_shinfo(tgt)->frags[merge];
+
+ skb_frag_size_add(fragto, skb_frag_size(fragfrom));
+ __skb_frag_unref(fragfrom);
+ }
+
+ /* Reposition in the original skb */
+ to = 0;
+ while (from < skb_shinfo(skb)->nr_frags)
+ skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
+ skb_shinfo(skb)->nr_frags = to;
+
+ BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);
+
+onlymerged:
+ /* Most likely the tgt won't ever need its checksum anymore; the skb,
+ * on the other hand, might need it if it has to be resent.
+ */
+ tgt->ip_summed = CHECKSUM_PARTIAL;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ /* Yak, is it really working this way? Some helper please? */
+ skb->len -= shiftlen;
+ skb->data_len -= shiftlen;
+ skb->truesize -= shiftlen;
+ tgt->len += shiftlen;
+ tgt->data_len += shiftlen;
+ tgt->truesize += shiftlen;
+
+ return shiftlen;
+}
+
+/**
+ * skb_prepare_seq_read - Prepare a sequential read of skb data
+ * @skb: the buffer to read
+ * @from: lower offset of data to be read
+ * @to: upper offset of data to be read
+ * @st: state variable
+ *
+ * Initializes the specified state variable. Must be called before
+ * invoking skb_seq_read() for the first time.
+ */
+void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
+ unsigned int to, struct skb_seq_state *st)
+{
+ st->lower_offset = from;
+ st->upper_offset = to;
+ st->root_skb = st->cur_skb = skb;
+ st->frag_idx = st->stepped_offset = 0;
+ st->frag_data = NULL;
+}
+EXPORT_SYMBOL(skb_prepare_seq_read);
+
+/**
+ * skb_seq_read - Sequentially read skb data
+ * @consumed: number of bytes consumed by the caller so far
+ * @data: destination pointer for data to be returned
+ * @st: state variable
+ *
+ * Reads a block of skb data at @consumed relative to the
+ * lower offset specified to skb_prepare_seq_read(). Assigns
+ * the head of the data block to @data and returns the length
+ * of the block or 0 if the end of the skb data or the upper
+ * offset has been reached.
+ *
+ * The caller is not required to consume all of the data
+ * returned, i.e. @consumed is typically set to the number
+ * of bytes already consumed and the next call to
+ * skb_seq_read() will return the remaining part of the block.
+ *
+ * Note 1: The size of each block of data returned can be arbitrary;
+ * this limitation is the cost of zerocopy sequential
+ * reads of potentially non-linear data.
+ *
+ * Note 2: Fragment lists within fragments are not implemented
+ * at the moment, state->root_skb could be replaced with
+ * a stack for this purpose.
+ */
+unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
+ struct skb_seq_state *st)
+{
+ unsigned int block_limit, abs_offset = consumed + st->lower_offset;
+ skb_frag_t *frag;
+
+ if (unlikely(abs_offset >= st->upper_offset)) {
+ if (st->frag_data) {
+ kunmap_atomic(st->frag_data);
+ st->frag_data = NULL;
+ }
+ return 0;
+ }
+
+next_skb:
+ block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;
+
+ if (abs_offset < block_limit && !st->frag_data) {
+ *data = st->cur_skb->data + (abs_offset - st->stepped_offset);
+ return block_limit - abs_offset;
+ }
+
+ if (st->frag_idx == 0 && !st->frag_data)
+ st->stepped_offset += skb_headlen(st->cur_skb);
+
+ while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
+ frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
+ block_limit = skb_frag_size(frag) + st->stepped_offset;
+
+ if (abs_offset < block_limit) {
+ if (!st->frag_data)
+ st->frag_data = kmap_atomic(skb_frag_page(frag));
+
+ *data = (u8 *) st->frag_data + frag->page_offset +
+ (abs_offset - st->stepped_offset);
+
+ return block_limit - abs_offset;
+ }
+
+ if (st->frag_data) {
+ kunmap_atomic(st->frag_data);
+ st->frag_data = NULL;
+ }
+
+ st->frag_idx++;
+ st->stepped_offset += skb_frag_size(frag);
+ }
+
+ if (st->frag_data) {
+ kunmap_atomic(st->frag_data);
+ st->frag_data = NULL;
+ }
+
+ if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
+ st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
+ st->frag_idx = 0;
+ goto next_skb;
+ } else if (st->cur_skb->next) {
+ st->cur_skb = st->cur_skb->next;
+ st->frag_idx = 0;
+ goto next_skb;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(skb_seq_read);
+
+/**
+ * skb_abort_seq_read - Abort a sequential read of skb data
+ * @st: state variable
+ *
+ * Must be called if skb_seq_read() was not called until it
+ * returned 0.
+ */
+void skb_abort_seq_read(struct skb_seq_state *st)
+{
+ if (st->frag_data)
+ kunmap_atomic(st->frag_data);
+}
+EXPORT_SYMBOL(skb_abort_seq_read);
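+
+/* Illustrative sketch (not part of the original file): walking the data of
+ * a possibly non-linear skb with the sequential read API above.  The
+ * consumer callback is an assumption for the example.
+ *
+ *	struct skb_seq_state st;
+ *	const u8 *data;
+ *	unsigned int consumed = 0, len;
+ *
+ *	skb_prepare_seq_read(skb, 0, skb->len, &st);
+ *	while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
+ *		example_consume(data, len);	// hypothetical consumer
+ *		consumed += len;
+ *	}
+ *	// skb_seq_read() returned 0, so skb_abort_seq_read() is not needed
+ */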
+
+#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
+
+static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
+ struct ts_config *conf,
+ struct ts_state *state)
+{
+ return skb_seq_read(offset, text, TS_SKB_CB(state));
+}
+
+static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
+{
+ skb_abort_seq_read(TS_SKB_CB(state));
+}
+
+/**
+ * skb_find_text - Find a text pattern in skb data
+ * @skb: the buffer to look in
+ * @from: search offset
+ * @to: search limit
+ * @config: textsearch configuration
+ *
+ * Finds a pattern in the skb data according to the specified
+ * textsearch configuration. Use textsearch_next() to retrieve
+ * subsequent occurrences of the pattern. Returns the offset
+ * to the first occurrence or UINT_MAX if no match was found.
+ */
+unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
+ unsigned int to, struct ts_config *config)
+{
+ struct ts_state state;
+ unsigned int ret;
+
+ config->get_next_block = skb_ts_get_next_block;
+ config->finish = skb_ts_finish;
+
+ skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state));
+
+ ret = textsearch_find(config, &state);
+ return (ret <= to - from ? ret : UINT_MAX);
+}
+EXPORT_SYMBOL(skb_find_text);
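+
+/* Illustrative sketch (not part of the original file): searching skb data
+ * for a byte pattern with skb_find_text() and the textsearch API.  The
+ * algorithm name and pattern are assumptions for the example.
+ *
+ *	struct ts_config *conf;
+ *	unsigned int pos;
+ *
+ *	conf = textsearch_prepare("kmp", "GET ", 4, GFP_KERNEL, TS_AUTOLOAD);
+ *	if (IS_ERR(conf))
+ *		return PTR_ERR(conf);
+ *	pos = skb_find_text(skb, 0, skb->len, conf);
+ *	if (pos != UINT_MAX)
+ *		pr_debug("pattern found at offset %u\n", pos);
+ *	textsearch_destroy(conf);
+ */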
+
+/**
+ * skb_append_datato_frags - append the user data to a skb
+ * @sk: sock structure
+ * @skb: skb structure to be appended with user data.
+ * @getfrag: call back function to be used for getting the user data
+ * @from: pointer to user message iov
+ * @length: length of the iov message
+ *
+ * Description: This procedure appends the user data to the fragment part
+ * of the skb. If any page allocation fails, this procedure returns -ENOMEM.
+ */
+int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
+ int (*getfrag)(void *from, char *to, int offset,
+ int len, int odd, struct sk_buff *skb),
+ void *from, int length)
+{
+ int frg_cnt = skb_shinfo(skb)->nr_frags;
+ int copy;
+ int offset = 0;
+ int ret;
+ struct page_frag *pfrag = &current->task_frag;
+
+ do {
+ /* Return error if we don't have space for new frag */
+ if (frg_cnt >= MAX_SKB_FRAGS)
+ return -EMSGSIZE;
+
+ if (!sk_page_frag_refill(sk, pfrag))
+ return -ENOMEM;
+
+ /* copy the user data to page */
+ copy = min_t(int, length, pfrag->size - pfrag->offset);
+
+ ret = getfrag(from, page_address(pfrag->page) + pfrag->offset,
+ offset, copy, 0, skb);
+ if (ret < 0)
+ return -EFAULT;
+
+ /* copy was successful so update the size parameters */
+ skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset,
+ copy);
+ frg_cnt++;
+ pfrag->offset += copy;
+ get_page(pfrag->page);
+
+ skb->truesize += copy;
+ refcount_add(copy, &sk->sk_wmem_alloc);
+ skb->len += copy;
+ skb->data_len += copy;
+ offset += copy;
+ length -= copy;
+
+ } while (length > 0);
+
+ return 0;
+}
+EXPORT_SYMBOL(skb_append_datato_frags);
+
+int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
+ int offset, size_t size)
+{
+ int i = skb_shinfo(skb)->nr_frags;
+
+ if (skb_can_coalesce(skb, i, page, offset)) {
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
+ } else if (i < MAX_SKB_FRAGS) {
+ get_page(page);
+ skb_fill_page_desc(skb, i, page, offset, size);
+ } else {
+ return -EMSGSIZE;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_append_pagefrags);
+
+/**
+ * skb_pull_rcsum - pull skb and update receive checksum
+ * @skb: buffer to update
+ * @len: length of data pulled
+ *
+ * This function performs an skb_pull on the packet and updates
+ * the CHECKSUM_COMPLETE checksum. It should be used on
+ * receive path processing instead of skb_pull unless you know
+ * that the checksum difference is zero (e.g., a valid IP header)
+ * or you are setting ip_summed to CHECKSUM_NONE.
+ */
+void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
+{
+ unsigned char *data = skb->data;
+
+ BUG_ON(len > skb->len);
+ __skb_pull(skb, len);
+ skb_postpull_rcsum(skb, data, len);
+ return skb->data;
+}
+EXPORT_SYMBOL_GPL(skb_pull_rcsum);
+
+static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
+{
+ skb_frag_t head_frag;
+ struct page *page;
+
+ page = virt_to_head_page(frag_skb->head);
+ head_frag.page.p = page;
+ head_frag.page_offset = frag_skb->data -
+ (unsigned char *)page_address(page);
+ head_frag.size = skb_headlen(frag_skb);
+ return head_frag;
+}
+
+/**
+ * skb_segment - Perform protocol segmentation on skb.
+ * @head_skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ *
+ * This function performs segmentation on the given skb. It returns
+ * a pointer to the first in a list of new skbs for the segments.
+ * In case of error it returns ERR_PTR(err).
+ */
+struct sk_buff *skb_segment(struct sk_buff *head_skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = NULL;
+ struct sk_buff *tail = NULL;
+ struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
+ skb_frag_t *frag = skb_shinfo(head_skb)->frags;
+ unsigned int mss = skb_shinfo(head_skb)->gso_size;
+ unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
+ struct sk_buff *frag_skb = head_skb;
+ unsigned int offset = doffset;
+ unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
+ unsigned int partial_segs = 0;
+ unsigned int headroom;
+ unsigned int len = head_skb->len;
+ __be16 proto;
+ bool csum, sg;
+ int nfrags = skb_shinfo(head_skb)->nr_frags;
+ int err = -ENOMEM;
+ int i = 0;
+ int pos;
+ int dummy;
+
+ if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) &&
+ (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) {
+ /* gso_size is untrusted, and we have a frag_list with a linear
+ * non head_frag head.
+ *
+ * (we assume checking the first list_skb member suffices;
+ * i.e if either of the list_skb members have non head_frag
+ * head, then the first one has too).
+ *
+ * If head_skb's headlen does not fit requested gso_size, it
+ * means that the frag_list members do NOT terminate on exact
+ * gso_size boundaries. Hence we cannot perform skb_frag_t page
+ * sharing. Therefore we must fallback to copying the frag_list
+ * skbs; we do so by disabling SG.
+ */
+ if (mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb))
+ features &= ~NETIF_F_SG;
+ }
+
+ __skb_push(head_skb, doffset);
+ proto = skb_network_protocol(head_skb, &dummy);
+ if (unlikely(!proto))
+ return ERR_PTR(-EINVAL);
+
+ sg = !!(features & NETIF_F_SG);
+ csum = !!can_checksum_protocol(features, proto);
+
+ if (sg && csum && (mss != GSO_BY_FRAGS)) {
+ if (!(features & NETIF_F_GSO_PARTIAL)) {
+ struct sk_buff *iter;
+ unsigned int frag_len;
+
+ if (!list_skb ||
+ !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
+ goto normal;
+
+ /* If we get here then all the required
+ * GSO features except frag_list are supported.
+ * Try to split the SKB to multiple GSO SKBs
+ * with no frag_list.
+ * Currently we can do that only when the buffers don't
+ * have a linear part and all the buffers except
+ * the last are of the same length.
+ */
+ frag_len = list_skb->len;
+ skb_walk_frags(head_skb, iter) {
+ if (frag_len != iter->len && iter->next)
+ goto normal;
+ if (skb_headlen(iter) && !iter->head_frag)
+ goto normal;
+
+ len -= iter->len;
+ }
+
+ if (len != frag_len)
+ goto normal;
+ }
+
+ /* GSO partial only requires that we trim off any excess that
+ * doesn't fit into an MSS sized block, so take care of that
+ * now.
+ */
+ partial_segs = len / mss;
+ if (partial_segs > 1)
+ mss *= partial_segs;
+ else
+ partial_segs = 0;
+ }
+
+normal:
+ headroom = skb_headroom(head_skb);
+ pos = skb_headlen(head_skb);
+
+ do {
+ struct sk_buff *nskb;
+ skb_frag_t *nskb_frag;
+ int hsize;
+ int size;
+
+ if (unlikely(mss == GSO_BY_FRAGS)) {
+ len = list_skb->len;
+ } else {
+ len = head_skb->len - offset;
+ if (len > mss)
+ len = mss;
+ }
+
+ hsize = skb_headlen(head_skb) - offset;
+ if (hsize < 0)
+ hsize = 0;
+ if (hsize > len || !sg)
+ hsize = len;
+
+ if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
+ (skb_headlen(list_skb) == len || sg)) {
+ BUG_ON(skb_headlen(list_skb) > len);
+
+ i = 0;
+ nfrags = skb_shinfo(list_skb)->nr_frags;
+ frag = skb_shinfo(list_skb)->frags;
+ frag_skb = list_skb;
+ pos += skb_headlen(list_skb);
+
+ while (pos < offset + len) {
+ BUG_ON(i >= nfrags);
+
+ size = skb_frag_size(frag);
+ if (pos + size > offset + len)
+ break;
+
+ i++;
+ pos += size;
+ frag++;
+ }
+
+ nskb = skb_clone(list_skb, GFP_ATOMIC);
+ list_skb = list_skb->next;
+
+ if (unlikely(!nskb))
+ goto err;
+
+ if (unlikely(pskb_trim(nskb, len))) {
+ kfree_skb(nskb);
+ goto err;
+ }
+
+ hsize = skb_end_offset(nskb);
+ if (skb_cow_head(nskb, doffset + headroom)) {
+ kfree_skb(nskb);
+ goto err;
+ }
+
+ nskb->truesize += skb_end_offset(nskb) - hsize;
+ skb_release_head_state(nskb);
+ __skb_push(nskb, doffset);
+ } else {
+ nskb = __alloc_skb(hsize + doffset + headroom,
+ GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
+ NUMA_NO_NODE);
+
+ if (unlikely(!nskb))
+ goto err;
+
+ skb_reserve(nskb, headroom);
+ __skb_put(nskb, doffset);
+ }
+
+ if (segs)
+ tail->next = nskb;
+ else
+ segs = nskb;
+ tail = nskb;
+
+ __copy_skb_header(nskb, head_skb);
+
+ skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
+ skb_reset_mac_len(nskb);
+
+ skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
+ nskb->data - tnl_hlen,
+ doffset + tnl_hlen);
+
+ if (nskb->len == len + doffset)
+ goto perform_csum_check;
+
+ if (!sg) {
+ if (!nskb->remcsum_offload)
+ nskb->ip_summed = CHECKSUM_NONE;
+ SKB_GSO_CB(nskb)->csum =
+ skb_copy_and_csum_bits(head_skb, offset,
+ skb_put(nskb, len),
+ len, 0);
+ SKB_GSO_CB(nskb)->csum_start =
+ skb_headroom(nskb) + doffset;
+ continue;
+ }
+
+ nskb_frag = skb_shinfo(nskb)->frags;
+
+ skb_copy_from_linear_data_offset(head_skb, offset,
+ skb_put(nskb, hsize), hsize);
+
+ skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
+ SKBTX_SHARED_FRAG;
+
+ if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
+ skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
+ goto err;
+
+ while (pos < offset + len) {
+ if (i >= nfrags) {
+ i = 0;
+ nfrags = skb_shinfo(list_skb)->nr_frags;
+ frag = skb_shinfo(list_skb)->frags;
+ frag_skb = list_skb;
+ if (!skb_headlen(list_skb)) {
+ BUG_ON(!nfrags);
+ } else {
+ BUG_ON(!list_skb->head_frag);
+
+ /* to make room for head_frag. */
+ i--;
+ frag--;
+ }
+ if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
+ skb_zerocopy_clone(nskb, frag_skb,
+ GFP_ATOMIC))
+ goto err;
+
+ list_skb = list_skb->next;
+ }
+
+ if (unlikely(skb_shinfo(nskb)->nr_frags >=
+ MAX_SKB_FRAGS)) {
+ net_warn_ratelimited(
+ "skb_segment: too many frags: %u %u\n",
+ pos, mss);
+ err = -EINVAL;
+ goto err;
+ }
+
+ *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag;
+ __skb_frag_ref(nskb_frag);
+ size = skb_frag_size(nskb_frag);
+
+ if (pos < offset) {
+ nskb_frag->page_offset += offset - pos;
+ skb_frag_size_sub(nskb_frag, offset - pos);
+ }
+
+ skb_shinfo(nskb)->nr_frags++;
+
+ if (pos + size <= offset + len) {
+ i++;
+ frag++;
+ pos += size;
+ } else {
+ skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
+ goto skip_fraglist;
+ }
+
+ nskb_frag++;
+ }
+
+skip_fraglist:
+ nskb->data_len = len - hsize;
+ nskb->len += nskb->data_len;
+ nskb->truesize += nskb->data_len;
+
+perform_csum_check:
+ if (!csum) {
+ if (skb_has_shared_frag(nskb) &&
+ __skb_linearize(nskb))
+ goto err;
+
+ if (!nskb->remcsum_offload)
+ nskb->ip_summed = CHECKSUM_NONE;
+ SKB_GSO_CB(nskb)->csum =
+ skb_checksum(nskb, doffset,
+ nskb->len - doffset, 0);
+ SKB_GSO_CB(nskb)->csum_start =
+ skb_headroom(nskb) + doffset;
+ }
+ } while ((offset += len) < head_skb->len);
+
+ /* Some callers want to get the end of the list.
+ * Put it in segs->prev to avoid walking the list.
+ * (see validate_xmit_skb_list() for example)
+ */
+ segs->prev = tail;
+
+ if (partial_segs) {
+ struct sk_buff *iter;
+ int type = skb_shinfo(head_skb)->gso_type;
+ unsigned short gso_size = skb_shinfo(head_skb)->gso_size;
+
+ /* Update type to add partial and then remove dodgy if set */
+ type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
+ type &= ~SKB_GSO_DODGY;
+
+ /* Update GSO info and prepare to start updating headers on
+ * our way back down the stack of protocols.
+ */
+ for (iter = segs; iter; iter = iter->next) {
+ skb_shinfo(iter)->gso_size = gso_size;
+ skb_shinfo(iter)->gso_segs = partial_segs;
+ skb_shinfo(iter)->gso_type = type;
+ SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
+ }
+
+ if (tail->len - doffset <= gso_size)
+ skb_shinfo(tail)->gso_size = 0;
+ else if (tail != segs)
+ skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
+ }
+
+ /* The following permits correct backpressure for protocols
+ * using skb_set_owner_w().
+ * The idea is to transfer ownership from head_skb to the last segment.
+ */
+ if (head_skb->destructor == sock_wfree) {
+ swap(tail->truesize, head_skb->truesize);
+ swap(tail->destructor, head_skb->destructor);
+ swap(tail->sk, head_skb->sk);
+ }
+ return segs;
+
+err:
+ kfree_skb_list(segs);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(skb_segment);
+
+int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
+{
+ struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
+ unsigned int offset = skb_gro_offset(skb);
+ unsigned int headlen = skb_headlen(skb);
+ unsigned int len = skb_gro_len(skb);
+ unsigned int delta_truesize;
+ struct sk_buff *lp;
+
+ if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush))
+ return -E2BIG;
+
+ lp = NAPI_GRO_CB(p)->last;
+ pinfo = skb_shinfo(lp);
+
+ if (headlen <= offset) {
+ skb_frag_t *frag;
+ skb_frag_t *frag2;
+ int i = skbinfo->nr_frags;
+ int nr_frags = pinfo->nr_frags + i;
+
+ if (nr_frags > MAX_SKB_FRAGS)
+ goto merge;
+
+ offset -= headlen;
+ pinfo->nr_frags = nr_frags;
+ skbinfo->nr_frags = 0;
+
+ frag = pinfo->frags + nr_frags;
+ frag2 = skbinfo->frags + i;
+ do {
+ *--frag = *--frag2;
+ } while (--i);
+
+ frag->page_offset += offset;
+ skb_frag_size_sub(frag, offset);
+
+ /* all fragments' truesize: remove (head size + sk_buff) */
+ delta_truesize = skb->truesize -
+ SKB_TRUESIZE(skb_end_offset(skb));
+
+ skb->truesize -= skb->data_len;
+ skb->len -= skb->data_len;
+ skb->data_len = 0;
+
+ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
+ goto done;
+ } else if (skb->head_frag) {
+ int nr_frags = pinfo->nr_frags;
+ skb_frag_t *frag = pinfo->frags + nr_frags;
+ struct page *page = virt_to_head_page(skb->head);
+ unsigned int first_size = headlen - offset;
+ unsigned int first_offset;
+
+ if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
+ goto merge;
+
+ first_offset = skb->data -
+ (unsigned char *)page_address(page) +
+ offset;
+
+ pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
+
+ frag->page.p = page;
+ frag->page_offset = first_offset;
+ skb_frag_size_set(frag, first_size);
+
+ memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
+ /* We don't need to clear skbinfo->nr_frags here */
+
+ delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
+ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
+ goto done;
+ }
+
+merge:
+ delta_truesize = skb->truesize;
+ if (offset > headlen) {
+ unsigned int eat = offset - headlen;
+
+ skbinfo->frags[0].page_offset += eat;
+ skb_frag_size_sub(&skbinfo->frags[0], eat);
+ skb->data_len -= eat;
+ skb->len -= eat;
+ offset = headlen;
+ }
+
+ __skb_pull(skb, offset);
+
+ if (NAPI_GRO_CB(p)->last == p)
+ skb_shinfo(p)->frag_list = skb;
+ else
+ NAPI_GRO_CB(p)->last->next = skb;
+ NAPI_GRO_CB(p)->last = skb;
+ __skb_header_release(skb);
+ lp = p;
+
+done:
+ NAPI_GRO_CB(p)->count++;
+ p->data_len += len;
+ p->truesize += delta_truesize;
+ p->len += len;
+ if (lp != p) {
+ lp->data_len += len;
+ lp->truesize += delta_truesize;
+ lp->len += len;
+ }
+ NAPI_GRO_CB(skb)->same_flow = 1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_gro_receive);
+
+void __init skb_init(void)
+{
+ skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache",
+ sizeof(struct sk_buff),
+ 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ offsetof(struct sk_buff, cb),
+ sizeof_field(struct sk_buff, cb),
+ NULL);
+ skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
+ sizeof(struct sk_buff_fclones),
+ 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ NULL);
+}
+
+static int
+__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len,
+ unsigned int recursion_level)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+ int elt = 0;
+
+ if (unlikely(recursion_level >= 24))
+ return -EMSGSIZE;
+
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ sg_set_buf(sg, skb->data + offset, copy);
+ elt++;
+ if ((len -= copy) == 0)
+ return elt;
+ offset += copy;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ if ((copy = end - offset) > 0) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ if (unlikely(elt && sg_is_last(&sg[elt - 1])))
+ return -EMSGSIZE;
+
+ if (copy > len)
+ copy = len;
+ sg_set_page(&sg[elt], skb_frag_page(frag), copy,
+ frag->page_offset+offset-start);
+ elt++;
+ if (!(len -= copy))
+ return elt;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end, ret;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (unlikely(elt && sg_is_last(&sg[elt - 1])))
+ return -EMSGSIZE;
+
+ if (copy > len)
+ copy = len;
+ ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start,
+ copy, recursion_level + 1);
+ if (unlikely(ret < 0))
+ return ret;
+ elt += ret;
+ if ((len -= copy) == 0)
+ return elt;
+ offset += copy;
+ }
+ start = end;
+ }
+ BUG_ON(len);
+ return elt;
+}
+
+/**
+ * skb_to_sgvec - Fill a scatter-gather list from a socket buffer
+ * @skb: Socket buffer containing the buffers to be mapped
+ * @sg: The scatter-gather list to map into
+ * @offset: The offset into the buffer's contents to start mapping
+ * @len: Length of buffer space to be mapped
+ *
+ * Fill the specified scatter-gather list with mappings/pointers into a
+ * region of the buffer space attached to a socket buffer. Returns either
+ * the number of scatterlist items used, or -EMSGSIZE if the contents
+ * could not fit.
+ */
+int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
+{
+ int nsg = __skb_to_sgvec(skb, sg, offset, len, 0);
+
+ if (nsg <= 0)
+ return nsg;
+
+ sg_mark_end(&sg[nsg - 1]);
+
+ return nsg;
+}
+EXPORT_SYMBOL_GPL(skb_to_sgvec);
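+
+/* Editor's note: illustrative sketch, not part of the original patch.
+ * Shows the intended calling pattern for skb_to_sgvec(): size the table for
+ * the worst case (linear part plus page frags, assuming no frag_list), fill
+ * it, and propagate -EMSGSIZE. The function name and the on-stack table are
+ * assumptions made only for this example.
+ */
+static int __maybe_unused example_map_skb_to_sg(struct sk_buff *skb)
+{
+ struct scatterlist sg[MAX_SKB_FRAGS + 1];
+ int nsg;
+
+ sg_init_table(sg, ARRAY_SIZE(sg));
+ nsg = skb_to_sgvec(skb, sg, 0, skb->len);
+ if (nsg < 0)
+ return nsg; /* typically -EMSGSIZE */
+
+ /* sg[0..nsg-1] now describe the skb data; the last entry is marked. */
+ return nsg;
+}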
+
+/* Compared with skb_to_sgvec, skb_to_sgvec_nomark only maps the skb to the
+ * given sglist without marking the sg entry that contains the last skb data
+ * as the end. So the caller can manipulate the sg list at will when adding
+ * new data after the first call, without calling sg_unmark_end to extend the
+ * sg list.
+ *
+ * Scenario to use skb_to_sgvec_nomark:
+ * 1. sg_init_table
+ * 2. skb_to_sgvec_nomark(payload1)
+ * 3. skb_to_sgvec_nomark(payload2)
+ *
+ * This is equivalent to:
+ * 1. sg_init_table
+ * 2. skb_to_sgvec(payload1)
+ * 3. sg_unmark_end
+ * 4. skb_to_sgvec(payload2)
+ *
+ * When mapping multiple payloads conditionally, skb_to_sgvec_nomark
+ * is preferable.
+ */
+int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
+ int offset, int len)
+{
+ return __skb_to_sgvec(skb, sg, offset, len, 0);
+}
+EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);
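+
+/* Editor's note: illustrative sketch, not part of the original patch.
+ * Concrete form of the scenario described above: two payloads mapped into one
+ * table with the end mark applied once by the caller. The function name is
+ * hypothetical and the caller is assumed to provide a table of at least
+ * 'nents' entries, large enough for both skbs.
+ */
+static int __maybe_unused example_map_two_payloads(struct sk_buff *p1,
+ struct sk_buff *p2,
+ struct scatterlist *sg, int nents)
+{
+ int n1, n2;
+
+ sg_init_table(sg, nents);
+ n1 = skb_to_sgvec_nomark(p1, sg, 0, p1->len);
+ if (n1 < 0)
+ return n1;
+ n2 = skb_to_sgvec_nomark(p2, sg + n1, 0, p2->len);
+ if (n2 < 0)
+ return n2;
+ sg_mark_end(&sg[n1 + n2 - 1]); /* caller marks the end exactly once */
+ return n1 + n2;
+}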
+
+
+/**
+ * skb_cow_data - Check that a socket buffer's data buffers are writable
+ * @skb: The socket buffer to check.
+ * @tailbits: Amount of trailing space to be added
+ * @trailer: Returned pointer to the skb where the @tailbits space begins
+ *
+ * Make sure that the data buffers attached to a socket buffer are
+ * writable. If they are not, private copies are made of the data buffers
+ * and the socket buffer is set to use these instead.
+ *
+ * If @tailbits is given, make sure that there is space to write @tailbits
+ * bytes of data beyond current end of socket buffer. @trailer will be
+ * set to point to the skb in which this space begins.
+ *
+ * The number of scatterlist elements required to completely map the
+ * COW'd and extended socket buffer will be returned.
+ */
+int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
+{
+ int copyflag;
+ int elt;
+ struct sk_buff *skb1, **skb_p;
+
+ /* If skb is cloned or its head is paged, reallocate
+ * head pulling out all the pages (pages are considered not writable
+ * at the moment even if they are anonymous).
+ */
+ if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
+ __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL)
+ return -ENOMEM;
+
+ /* Easy case. Most of packets will go this way. */
+ if (!skb_has_frag_list(skb)) {
+ /* A little trouble: not enough space for the trailer.
+ * This should not happen when the stack is tuned to generate
+ * good frames. OK, on a miss we reallocate and reserve even more
+ * space; 128 bytes is fair. */
+
+ if (skb_tailroom(skb) < tailbits &&
+ pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
+ return -ENOMEM;
+
+ /* Voila! */
+ *trailer = skb;
+ return 1;
+ }
+
+ /* Misery. We are in trouble, going to mince the fragments... */
+
+ elt = 1;
+ skb_p = &skb_shinfo(skb)->frag_list;
+ copyflag = 0;
+
+ while ((skb1 = *skb_p) != NULL) {
+ int ntail = 0;
+
+ /* The fragment is partially pulled by someone;
+ * this can happen on input. Copy it and everything
+ * after it. */
+
+ if (skb_shared(skb1))
+ copyflag = 1;
+
+ /* If the skb is the last, worry about trailer. */
+
+ if (skb1->next == NULL && tailbits) {
+ if (skb_shinfo(skb1)->nr_frags ||
+ skb_has_frag_list(skb1) ||
+ skb_tailroom(skb1) < tailbits)
+ ntail = tailbits + 128;
+ }
+
+ if (copyflag ||
+ skb_cloned(skb1) ||
+ ntail ||
+ skb_shinfo(skb1)->nr_frags ||
+ skb_has_frag_list(skb1)) {
+ struct sk_buff *skb2;
+
+ /* Fuck, we are miserable poor guys... */
+ if (ntail == 0)
+ skb2 = skb_copy(skb1, GFP_ATOMIC);
+ else
+ skb2 = skb_copy_expand(skb1,
+ skb_headroom(skb1),
+ ntail,
+ GFP_ATOMIC);
+ if (unlikely(skb2 == NULL))
+ return -ENOMEM;
+
+ if (skb1->sk)
+ skb_set_owner_w(skb2, skb1->sk);
+
+ /* Looking around. Are we still alive?
+ * OK, link new skb, drop old one */
+
+ skb2->next = skb1->next;
+ *skb_p = skb2;
+ kfree_skb(skb1);
+ skb1 = skb2;
+ }
+ elt++;
+ *trailer = skb1;
+ skb_p = &skb1->next;
+ }
+
+ return elt;
+}
+EXPORT_SYMBOL_GPL(skb_cow_data);
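+
+/* Editor's note: illustrative sketch, not part of the original patch.
+ * Typical IPsec-style use of skb_cow_data(): make the buffers writable,
+ * reserve room for a trailer, then append the trailer bytes to the skb that
+ * skb_cow_data() reported. 'trailer_len' and the function name are made up
+ * for the example.
+ */
+static int __maybe_unused example_reserve_trailer(struct sk_buff *skb,
+ int trailer_len)
+{
+ struct sk_buff *trailer;
+ int nsg;
+
+ nsg = skb_cow_data(skb, trailer_len, &trailer);
+ if (nsg < 0)
+ return nsg;
+
+ /* Safe now: 'trailer' has at least trailer_len bytes of tailroom. */
+ memset(pskb_put(skb, trailer, trailer_len), 0, trailer_len);
+ return nsg; /* sg entries needed to map the grown buffer */
+}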
+
+static void sock_rmem_free(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+ atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+}
+
+static void skb_set_err_queue(struct sk_buff *skb)
+{
+ /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING.
+ * So, it is safe to (mis)use it to mark skbs on the error queue.
+ */
+ skb->pkt_type = PACKET_OUTGOING;
+ BUILD_BUG_ON(PACKET_OUTGOING == 0);
+}
+
+/*
+ * Note: we don't memory-charge error packets (no sk_forward_alloc changes)
+ */
+int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
+{
+ if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+ (unsigned int)sk->sk_rcvbuf)
+ return -ENOMEM;
+
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = sock_rmem_free;
+ atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+ skb_set_err_queue(skb);
+
+ /* before exiting rcu section, make sure dst is refcounted */
+ skb_dst_force(skb);
+
+ skb_queue_tail(&sk->sk_error_queue, skb);
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+ return 0;
+}
+EXPORT_SYMBOL(sock_queue_err_skb);
+
+static bool is_icmp_err_skb(const struct sk_buff *skb)
+{
+ return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+ SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
+}
+
+struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
+{
+ struct sk_buff_head *q = &sk->sk_error_queue;
+ struct sk_buff *skb, *skb_next = NULL;
+ bool icmp_next = false;
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ skb = __skb_dequeue(q);
+ if (skb && (skb_next = skb_peek(q))) {
+ icmp_next = is_icmp_err_skb(skb_next);
+ if (icmp_next)
+ sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
+ }
+ spin_unlock_irqrestore(&q->lock, flags);
+
+ if (is_icmp_err_skb(skb) && !icmp_next)
+ sk->sk_err = 0;
+
+ if (skb_next)
+ sk->sk_error_report(sk);
+
+ return skb;
+}
+EXPORT_SYMBOL(sock_dequeue_err_skb);
+
+/**
+ * skb_clone_sk - create clone of skb, and take reference to socket
+ * @skb: the skb to clone
+ *
+ * This function creates a clone of a buffer that holds a reference on
+ * sk_refcnt. Buffers created via this function are meant to be
+ * returned using sock_queue_err_skb, or freed via kfree_skb.
+ *
+ * When passing buffers allocated with this function to sock_queue_err_skb
+ * it is necessary to wrap the call with sock_hold/sock_put in order to
+ * prevent the socket from being released prior to being enqueued on
+ * the sk_error_queue.
+ */
+struct sk_buff *skb_clone_sk(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct sk_buff *clone;
+
+ if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
+ return NULL;
+
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (!clone) {
+ sock_put(sk);
+ return NULL;
+ }
+
+ clone->sk = sk;
+ clone->destructor = sock_efree;
+
+ return clone;
+}
+EXPORT_SYMBOL(skb_clone_sk);
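+
+/* Editor's note: illustrative sketch, not part of the original patch.
+ * Demonstrates the sock_hold()/sock_put() pairing described above when a
+ * clone produced by skb_clone_sk() is pushed onto the error queue. The
+ * function name is hypothetical.
+ */
+static void __maybe_unused example_queue_cloned_skb(struct sk_buff *clone)
+{
+ struct sock *sk = clone->sk; /* reference taken by skb_clone_sk() */
+
+ sock_hold(sk);
+ if (sock_queue_err_skb(sk, clone))
+ kfree_skb(clone);
+ sock_put(sk);
+}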
+
+static void __skb_complete_tx_timestamp(struct sk_buff *skb,
+ struct sock *sk,
+ int tstype,
+ bool opt_stats)
+{
+ struct sock_exterr_skb *serr;
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));
+
+ serr = SKB_EXT_ERR(skb);
+ memset(serr, 0, sizeof(*serr));
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
+ serr->ee.ee_info = tstype;
+ serr->opt_stats = opt_stats;
+ serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
+ if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
+ serr->ee.ee_data = skb_shinfo(skb)->tskey;
+ if (sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_type == SOCK_STREAM)
+ serr->ee.ee_data -= sk->sk_tskey;
+ }
+
+ err = sock_queue_err_skb(sk, skb);
+
+ if (err)
+ kfree_skb(skb);
+}
+
+static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
+{
+ bool ret;
+
+ if (likely(sysctl_tstamp_allow_data || tsonly))
+ return true;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ ret = sk->sk_socket && sk->sk_socket->file &&
+ file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW);
+ read_unlock_bh(&sk->sk_callback_lock);
+ return ret;
+}
+
+void skb_complete_tx_timestamp(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps)
+{
+ struct sock *sk = skb->sk;
+
+ if (!skb_may_tx_timestamp(sk, false))
+ goto err;
+
+ /* Take a reference to prevent skb_orphan() from freeing the socket,
+ * but only if the socket refcount is not zero.
+ */
+ if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
+ *skb_hwtstamps(skb) = *hwtstamps;
+ __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
+ sock_put(sk);
+ return;
+ }
+
+err:
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
+
+void __skb_tstamp_tx(struct sk_buff *orig_skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ struct sock *sk, int tstype)
+{
+ struct sk_buff *skb;
+ bool tsonly, opt_stats = false;
+
+ if (!sk)
+ return;
+
+ if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
+ skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
+ return;
+
+ tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
+ if (!skb_may_tx_timestamp(sk, tsonly))
+ return;
+
+ if (tsonly) {
+#ifdef CONFIG_INET
+ if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
+ sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_type == SOCK_STREAM) {
+ skb = tcp_get_timestamping_opt_stats(sk);
+ opt_stats = true;
+ } else
+#endif
+ skb = alloc_skb(0, GFP_ATOMIC);
+ } else {
+ skb = skb_clone(orig_skb, GFP_ATOMIC);
+ }
+ if (!skb)
+ return;
+
+ if (tsonly) {
+ skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags &
+ SKBTX_ANY_TSTAMP;
+ skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey;
+ }
+
+ if (hwtstamps)
+ *skb_hwtstamps(skb) = *hwtstamps;
+ else
+ skb->tstamp = ktime_get_real();
+
+ __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
+}
+EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
+
+void skb_tstamp_tx(struct sk_buff *orig_skb,
+ struct skb_shared_hwtstamps *hwtstamps)
+{
+ return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk,
+ SCM_TSTAMP_SND);
+}
+EXPORT_SYMBOL_GPL(skb_tstamp_tx);
+
+void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
+{
+ struct sock *sk = skb->sk;
+ struct sock_exterr_skb *serr;
+ int err = 1;
+
+ skb->wifi_acked_valid = 1;
+ skb->wifi_acked = acked;
+
+ serr = SKB_EXT_ERR(skb);
+ memset(serr, 0, sizeof(*serr));
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
+
+ /* Take a reference to prevent skb_orphan() from freeing the socket,
+ * but only if the socket refcount is not zero.
+ */
+ if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
+ err = sock_queue_err_skb(sk, skb);
+ sock_put(sk);
+ }
+ if (err)
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
+
+/**
+ * skb_partial_csum_set - set up and verify partial csum values for packet
+ * @skb: the skb to set
+ * @start: the number of bytes after skb->data to start checksumming.
+ * @off: the offset from start to place the checksum.
+ *
+ * For untrusted partially-checksummed packets, we need to make sure the values
+ * for skb->csum_start and skb->csum_offset are valid so we don't oops.
+ *
+ * This function checks and sets those values and skb->ip_summed: if this
+ * returns false you should drop the packet.
+ */
+bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
+{
+ u32 csum_end = (u32)start + (u32)off + sizeof(__sum16);
+ u32 csum_start = skb_headroom(skb) + (u32)start;
+
+ if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) {
+ net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n",
+ start, off, skb_headroom(skb), skb_headlen(skb));
+ return false;
+ }
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = csum_start;
+ skb->csum_offset = off;
+ skb_set_transport_header(skb, start);
+ return true;
+}
+EXPORT_SYMBOL_GPL(skb_partial_csum_set);
+
+static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len,
+ unsigned int max)
+{
+ if (skb_headlen(skb) >= len)
+ return 0;
+
+ /* If we need to pull up, then pull up to the max, so we
+ * won't need to do it again.
+ */
+ if (max > skb->len)
+ max = skb->len;
+
+ if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL)
+ return -ENOMEM;
+
+ if (skb_headlen(skb) < len)
+ return -EPROTO;
+
+ return 0;
+}
+
+#define MAX_TCP_HDR_LEN (15 * 4)
+
+static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
+ typeof(IPPROTO_IP) proto,
+ unsigned int off)
+{
+ switch (proto) {
+ int err;
+
+ case IPPROTO_TCP:
+ err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
+ off + MAX_TCP_HDR_LEN);
+ if (!err && !skb_partial_csum_set(skb, off,
+ offsetof(struct tcphdr,
+ check)))
+ err = -EPROTO;
+ return err ? ERR_PTR(err) : &tcp_hdr(skb)->check;
+
+ case IPPROTO_UDP:
+ err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr),
+ off + sizeof(struct udphdr));
+ if (!err && !skb_partial_csum_set(skb, off,
+ offsetof(struct udphdr,
+ check)))
+ err = -EPROTO;
+ return err ? ERR_PTR(err) : &udp_hdr(skb)->check;
+ }
+
+ return ERR_PTR(-EPROTO);
+}
+
+/* This value should be large enough to cover a tagged ethernet header plus
+ * maximally sized IP and TCP or UDP headers.
+ */
+#define MAX_IP_HDR_LEN 128
+
+static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
+{
+ unsigned int off;
+ bool fragment;
+ __sum16 *csum;
+ int err;
+
+ fragment = false;
+
+ err = skb_maybe_pull_tail(skb,
+ sizeof(struct iphdr),
+ MAX_IP_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ if (ip_hdr(skb)->frag_off & htons(IP_OFFSET | IP_MF))
+ fragment = true;
+
+ off = ip_hdrlen(skb);
+
+ err = -EPROTO;
+
+ if (fragment)
+ goto out;
+
+ csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off);
+ if (IS_ERR(csum))
+ return PTR_ERR(csum);
+
+ if (recalculate)
+ *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr,
+ skb->len - off,
+ ip_hdr(skb)->protocol, 0);
+ err = 0;
+
+out:
+ return err;
+}
+
+/* This value should be large enough to cover a tagged ethernet header plus
+ * an IPv6 header, all options, and a maximal TCP or UDP header.
+ */
+#define MAX_IPV6_HDR_LEN 256
+
+#define OPT_HDR(type, skb, off) \
+ (type *)(skb_network_header(skb) + (off))
+
+static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
+{
+ int err;
+ u8 nexthdr;
+ unsigned int off;
+ unsigned int len;
+ bool fragment;
+ bool done;
+ __sum16 *csum;
+
+ fragment = false;
+ done = false;
+
+ off = sizeof(struct ipv6hdr);
+
+ err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+
+ len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
+ while (off <= len && !done) {
+ switch (nexthdr) {
+ case IPPROTO_DSTOPTS:
+ case IPPROTO_HOPOPTS:
+ case IPPROTO_ROUTING: {
+ struct ipv6_opt_hdr *hp;
+
+ err = skb_maybe_pull_tail(skb,
+ off +
+ sizeof(struct ipv6_opt_hdr),
+ MAX_IPV6_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ hp = OPT_HDR(struct ipv6_opt_hdr, skb, off);
+ nexthdr = hp->nexthdr;
+ off += ipv6_optlen(hp);
+ break;
+ }
+ case IPPROTO_AH: {
+ struct ip_auth_hdr *hp;
+
+ err = skb_maybe_pull_tail(skb,
+ off +
+ sizeof(struct ip_auth_hdr),
+ MAX_IPV6_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ hp = OPT_HDR(struct ip_auth_hdr, skb, off);
+ nexthdr = hp->nexthdr;
+ off += ipv6_authlen(hp);
+ break;
+ }
+ case IPPROTO_FRAGMENT: {
+ struct frag_hdr *hp;
+
+ err = skb_maybe_pull_tail(skb,
+ off +
+ sizeof(struct frag_hdr),
+ MAX_IPV6_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ hp = OPT_HDR(struct frag_hdr, skb, off);
+
+ if (hp->frag_off & htons(IP6_OFFSET | IP6_MF))
+ fragment = true;
+
+ nexthdr = hp->nexthdr;
+ off += sizeof(struct frag_hdr);
+ break;
+ }
+ default:
+ done = true;
+ break;
+ }
+ }
+
+ err = -EPROTO;
+
+ if (!done || fragment)
+ goto out;
+
+ csum = skb_checksum_setup_ip(skb, nexthdr, off);
+ if (IS_ERR(csum))
+ return PTR_ERR(csum);
+
+ if (recalculate)
+ *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr,
+ skb->len - off, nexthdr, 0);
+ err = 0;
+
+out:
+ return err;
+}
+
+/**
+ * skb_checksum_setup - set up partial checksum offset
+ * @skb: the skb to set up
+ * @recalculate: if true the pseudo-header checksum will be recalculated
+ */
+int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
+{
+ int err;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ err = skb_checksum_setup_ipv4(skb, recalculate);
+ break;
+
+ case htons(ETH_P_IPV6):
+ err = skb_checksum_setup_ipv6(skb, recalculate);
+ break;
+
+ default:
+ err = -EPROTO;
+ break;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(skb_checksum_setup);
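+
+/* Editor's note: illustrative sketch, not part of the original patch.
+ * A backend driver receiving packets with an externally supplied partial
+ * checksum (xen-netback style) would call skb_checksum_setup() before
+ * handing them to the stack; the function name and 'csum_blank' flag are
+ * assumptions for this example.
+ */
+static int __maybe_unused example_fixup_guest_csum(struct sk_buff *skb,
+ bool csum_blank)
+{
+ int err = skb_checksum_setup(skb, csum_blank);
+
+ if (err)
+ kfree_skb(skb); /* malformed or unsupported protocol */
+ return err;
+}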
+
+/**
+ * skb_checksum_maybe_trim - maybe trims the given skb
+ * @skb: the skb to check
+ * @transport_len: the data length beyond the network header
+ *
+ * Checks whether the given skb has data beyond the given transport length.
+ * If so, returns a cloned skb trimmed to this transport length.
+ * Otherwise returns the provided skb. Returns NULL in error cases
+ * (e.g. transport_len exceeds skb length or out-of-memory).
+ *
+ * Caller needs to set the skb transport header and free any returned skb if it
+ * differs from the provided skb.
+ */
+static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb,
+ unsigned int transport_len)
+{
+ struct sk_buff *skb_chk;
+ unsigned int len = skb_transport_offset(skb) + transport_len;
+ int ret;
+
+ if (skb->len < len)
+ return NULL;
+ else if (skb->len == len)
+ return skb;
+
+ skb_chk = skb_clone(skb, GFP_ATOMIC);
+ if (!skb_chk)
+ return NULL;
+
+ ret = pskb_trim_rcsum(skb_chk, len);
+ if (ret) {
+ kfree_skb(skb_chk);
+ return NULL;
+ }
+
+ return skb_chk;
+}
+
+/**
+ * skb_checksum_trimmed - validate checksum of an skb
+ * @skb: the skb to check
+ * @transport_len: the data length beyond the network header
+ * @skb_chkf: checksum function to use
+ *
+ * Applies the given checksum function skb_chkf to the provided skb.
+ * Returns a checked and maybe trimmed skb. Returns NULL on error.
+ *
+ * If the skb has data beyond the given transport length, then a
+ * trimmed & cloned skb is checked and returned.
+ *
+ * Caller needs to set the skb transport header and free any returned skb if it
+ * differs from the provided skb.
+ */
+struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
+ unsigned int transport_len,
+ __sum16(*skb_chkf)(struct sk_buff *skb))
+{
+ struct sk_buff *skb_chk;
+ unsigned int offset = skb_transport_offset(skb);
+ __sum16 ret;
+
+ skb_chk = skb_checksum_maybe_trim(skb, transport_len);
+ if (!skb_chk)
+ goto err;
+
+ if (!pskb_may_pull(skb_chk, offset))
+ goto err;
+
+ skb_pull_rcsum(skb_chk, offset);
+ ret = skb_chkf(skb_chk);
+ skb_push_rcsum(skb_chk, offset);
+
+ if (ret)
+ goto err;
+
+ return skb_chk;
+
+err:
+ if (skb_chk && skb_chk != skb)
+ kfree_skb(skb_chk);
+
+ return NULL;
+
+}
+EXPORT_SYMBOL(skb_checksum_trimmed);
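+
+/* Editor's note: illustrative sketch, not part of the original patch.
+ * IGMP/MLD-style validation: the caller supplies a checksum callback, has
+ * already set the transport header, and must free the returned skb if it is
+ * a trimmed clone. The callback below is a stand-in, not a real helper.
+ */
+static __sum16 example_chkf(struct sk_buff *skb)
+{
+ return 0; /* pretend the checksum always verifies */
+}
+
+static int __maybe_unused example_validate(struct sk_buff *skb,
+ unsigned int transport_len)
+{
+ struct sk_buff *skb_chk;
+
+ skb_chk = skb_checksum_trimmed(skb, transport_len, example_chkf);
+ if (!skb_chk)
+ return -EINVAL;
+ if (skb_chk != skb)
+ kfree_skb(skb_chk); /* only the validation result was needed */
+ return 0;
+}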
+
+void __skb_warn_lro_forwarding(const struct sk_buff *skb)
+{
+ net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
+ skb->dev->name);
+}
+EXPORT_SYMBOL(__skb_warn_lro_forwarding);
+
+void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
+{
+ if (head_stolen) {
+ skb_release_head_state(skb);
+ kmem_cache_free(skbuff_head_cache, skb);
+ } else {
+ __kfree_skb(skb);
+ }
+}
+EXPORT_SYMBOL(kfree_skb_partial);
+
+/**
+ * skb_try_coalesce - try to merge skb to prior one
+ * @to: prior buffer
+ * @from: buffer to add
+ * @fragstolen: pointer to boolean
+ * @delta_truesize: how much more was allocated than was requested
+ */
+bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
+ bool *fragstolen, int *delta_truesize)
+{
+ struct skb_shared_info *to_shinfo, *from_shinfo;
+ int i, delta, len = from->len;
+
+ *fragstolen = false;
+
+ if (skb_cloned(to))
+ return false;
+
+ if (len <= skb_tailroom(to)) {
+ if (len)
+ BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
+ *delta_truesize = 0;
+ return true;
+ }
+
+ to_shinfo = skb_shinfo(to);
+ from_shinfo = skb_shinfo(from);
+ if (to_shinfo->frag_list || from_shinfo->frag_list)
+ return false;
+ if (skb_zcopy(to) || skb_zcopy(from))
+ return false;
+
+ if (skb_headlen(from) != 0) {
+ struct page *page;
+ unsigned int offset;
+
+ if (to_shinfo->nr_frags +
+ from_shinfo->nr_frags >= MAX_SKB_FRAGS)
+ return false;
+
+ if (skb_head_is_locked(from))
+ return false;
+
+ delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+ page = virt_to_head_page(from->head);
+ offset = from->data - (unsigned char *)page_address(page);
+
+ skb_fill_page_desc(to, to_shinfo->nr_frags,
+ page, offset, skb_headlen(from));
+ *fragstolen = true;
+ } else {
+ if (to_shinfo->nr_frags +
+ from_shinfo->nr_frags > MAX_SKB_FRAGS)
+ return false;
+
+ delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
+ }
+
+ WARN_ON_ONCE(delta < len);
+
+ memcpy(to_shinfo->frags + to_shinfo->nr_frags,
+ from_shinfo->frags,
+ from_shinfo->nr_frags * sizeof(skb_frag_t));
+ to_shinfo->nr_frags += from_shinfo->nr_frags;
+
+ if (!skb_cloned(from))
+ from_shinfo->nr_frags = 0;
+
+ /* if the skb is not cloned this does nothing
+ * since we set nr_frags to 0.
+ */
+ for (i = 0; i < from_shinfo->nr_frags; i++)
+ __skb_frag_ref(&from_shinfo->frags[i]);
+
+ to->truesize += delta;
+ to->len += len;
+ to->data_len += len;
+
+ *delta_truesize = delta;
+ return true;
+}
+EXPORT_SYMBOL(skb_try_coalesce);
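+
+/* Editor's note: illustrative sketch, not part of the original patch.
+ * Receive-queue style coalescing: try to merge 'from' into the previous
+ * buffer and, on success, release it cheaply the way TCP's coalescing
+ * callers do. The socket memory accounting for 'delta' is left out.
+ */
+static bool __maybe_unused example_coalesce(struct sk_buff *to,
+ struct sk_buff *from)
+{
+ bool fragstolen;
+ int delta;
+
+ if (!skb_try_coalesce(to, from, &fragstolen, &delta))
+ return false;
+
+ kfree_skb_partial(from, fragstolen);
+ return true;
+}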
+
+/**
+ * skb_scrub_packet - scrub an skb
+ *
+ * @skb: buffer to clean
+ * @xnet: packet is crossing netns
+ *
+ * skb_scrub_packet can be used after encapsulating or decapsulating a packet
+ * into/from a tunnel. Some information has to be cleared during these
+ * operations.
+ * skb_scrub_packet can also be used to clean an skb before injecting it into
+ * another namespace (@xnet == true). We have to clear all information in the
+ * skb that could impact namespace isolation.
+ */
+void skb_scrub_packet(struct sk_buff *skb, bool xnet)
+{
+ skb->pkt_type = PACKET_HOST;
+ skb->skb_iif = 0;
+ skb->ignore_df = 0;
+ skb_dst_drop(skb);
+ secpath_reset(skb);
+ nf_reset(skb);
+ nf_reset_trace(skb);
+
+#ifdef CONFIG_NET_SWITCHDEV
+ skb->offload_fwd_mark = 0;
+ skb->offload_mr_fwd_mark = 0;
+#endif
+
+ if (!xnet)
+ return;
+
+ ipvs_reset(skb);
+ skb->mark = 0;
+ skb->tstamp = 0;
+}
+EXPORT_SYMBOL_GPL(skb_scrub_packet);
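+
+/* Editor's note: illustrative sketch, not part of the original patch.
+ * Tunnel-style use: after decapsulation, scrub the packet, passing
+ * xnet == true only when the inner packet is handed to a device in a
+ * different network namespace. The function name is hypothetical.
+ */
+static void __maybe_unused example_decap_scrub(struct sk_buff *skb,
+ struct net_device *in_dev,
+ struct net_device *out_dev)
+{
+ bool xnet = !net_eq(dev_net(in_dev), dev_net(out_dev));
+
+ skb_scrub_packet(skb, xnet);
+ skb->dev = out_dev;
+}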
+
+/**
+ * skb_gso_transport_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_transport_seglen is used to determine the real size of the
+ * individual segments, including Layer4 headers (TCP/UDP).
+ *
+ * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
+ */
+static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
+{
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+ unsigned int thlen = 0;
+
+ if (skb->encapsulation) {
+ thlen = skb_inner_transport_header(skb) -
+ skb_transport_header(skb);
+
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+ thlen += inner_tcp_hdrlen(skb);
+ } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
+ thlen = tcp_hdrlen(skb);
+ } else if (unlikely(skb_is_gso_sctp(skb))) {
+ thlen = sizeof(struct sctphdr);
+ } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
+ thlen = sizeof(struct udphdr);
+ }
+ /* UFO sets gso_size to the size of the fragmentation
+ * payload, i.e. the size of the L4 (UDP) header is already
+ * accounted for.
+ */
+ return thlen + shinfo->gso_size;
+}
+
+/**
+ * skb_gso_network_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_network_seglen is used to determine the real size of the
+ * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
+ *
+ * The MAC/L2 header is not accounted for.
+ */
+static unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
+{
+ unsigned int hdr_len = skb_transport_header(skb) -
+ skb_network_header(skb);
+
+ return hdr_len + skb_gso_transport_seglen(skb);
+}
+
+/**
+ * skb_gso_mac_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_mac_seglen is used to determine the real size of the
+ * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4
+ * headers (TCP/UDP).
+ */
+static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
+{
+ unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+
+ return hdr_len + skb_gso_transport_seglen(skb);
+}
+
+/**
+ * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS
+ *
+ * There are a couple of instances where we have a GSO skb, and we
+ * want to determine what size it would be after it is segmented.
+ *
+ * We might want to check:
+ * - L3+L4+payload size (e.g. IP forwarding)
+ * - L2+L3+L4+payload size (e.g. sanity check before passing to driver)
+ *
+ * This is a helper to do that correctly considering GSO_BY_FRAGS.
+ *
+ * @seg_len: The segmented length (from skb_gso_*_seglen). In the
+ * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS].
+ *
+ * @max_len: The maximum permissible length.
+ *
+ * Returns true if the segmented length <= max length.
+ */
+static inline bool skb_gso_size_check(const struct sk_buff *skb,
+ unsigned int seg_len,
+ unsigned int max_len) {
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+ const struct sk_buff *iter;
+
+ if (shinfo->gso_size != GSO_BY_FRAGS)
+ return seg_len <= max_len;
+
+ /* Undo this so we can re-use header sizes */
+ seg_len -= GSO_BY_FRAGS;
+
+ skb_walk_frags(skb, iter) {
+ if (seg_len + skb_headlen(iter) > max_len)
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU?
+ *
+ * @skb: GSO skb
+ * @mtu: MTU to validate against
+ *
+ * skb_gso_validate_network_len validates if a given skb will fit a
+ * wanted MTU once split. It considers L3 headers, L4 headers, and the
+ * payload.
+ */
+bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu)
+{
+ return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu);
+}
+EXPORT_SYMBOL_GPL(skb_gso_validate_network_len);
+
+/**
+ * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length?
+ *
+ * @skb: GSO skb
+ * @len: length to validate against
+ *
+ * skb_gso_validate_mac_len validates if a given skb will fit a wanted
+ * length once split, including L2, L3 and L4 headers and the payload.
+ */
+bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len)
+{
+ return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len);
+}
+EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len);
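+
+/* Editor's note: illustrative sketch, not part of the original patch.
+ * IP-forwarding style check: a GSO packet may exceed the egress MTU as a
+ * whole, as long as each segment produced later will fit. The helper name
+ * is made up for the example.
+ */
+static bool __maybe_unused example_fits_mtu(const struct sk_buff *skb,
+ unsigned int mtu)
+{
+ if (skb->len <= mtu)
+ return true;
+ if (skb_is_gso(skb))
+ return skb_gso_validate_network_len(skb, mtu);
+ return false;
+}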
+
+static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
+{
+ int mac_len, meta_len;
+ void *meta;
+
+ if (skb_cow(skb, skb_headroom(skb)) < 0) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ mac_len = skb->data - skb_mac_header(skb);
+ if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) {
+ memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb),
+ mac_len - VLAN_HLEN - ETH_TLEN);
+ }
+
+ meta_len = skb_metadata_len(skb);
+ if (meta_len) {
+ meta = skb_metadata_end(skb) - meta_len;
+ memmove(meta + VLAN_HLEN, meta, meta_len);
+ }
+
+ skb->mac_header += VLAN_HLEN;
+ return skb;
+}
+
+struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
+{
+ struct vlan_hdr *vhdr;
+ u16 vlan_tci;
+
+ if (unlikely(skb_vlan_tag_present(skb))) {
+ /* vlan_tci is already set-up so leave this for another time */
+ return skb;
+ }
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (unlikely(!skb))
+ goto err_free;
+ /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */
+ if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short))))
+ goto err_free;
+
+ vhdr = (struct vlan_hdr *)skb->data;
+ vlan_tci = ntohs(vhdr->h_vlan_TCI);
+ __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);
+
+ skb_pull_rcsum(skb, VLAN_HLEN);
+ vlan_set_encap_proto(skb, vhdr);
+
+ skb = skb_reorder_vlan_header(skb);
+ if (unlikely(!skb))
+ goto err_free;
+
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb_reset_mac_len(skb);
+
+ return skb;
+
+err_free:
+ kfree_skb(skb);
+ return NULL;
+}
+EXPORT_SYMBOL(skb_vlan_untag);
+
+int skb_ensure_writable(struct sk_buff *skb, int write_len)
+{
+ if (!pskb_may_pull(skb, write_len))
+ return -ENOMEM;
+
+ if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
+ return 0;
+
+ return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(skb_ensure_writable);
+
+/* Remove the VLAN header from the packet and update the csum accordingly.
+ * Expects an skb without skb_vlan_tag_present set, carrying a VLAN tag in
+ * its payload.
+ */
+int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
+{
+ struct vlan_hdr *vhdr;
+ int offset = skb->data - skb_mac_header(skb);
+ int err;
+
+ if (WARN_ONCE(offset,
+ "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n",
+ offset)) {
+ return -EINVAL;
+ }
+
+ err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
+ if (unlikely(err))
+ return err;
+
+ skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
+
+ vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
+ *vlan_tci = ntohs(vhdr->h_vlan_TCI);
+
+ memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
+ __skb_pull(skb, VLAN_HLEN);
+
+ vlan_set_encap_proto(skb, vhdr);
+ skb->mac_header += VLAN_HLEN;
+
+ if (skb_network_offset(skb) < ETH_HLEN)
+ skb_set_network_header(skb, ETH_HLEN);
+
+ skb_reset_mac_len(skb);
+
+ return err;
+}
+EXPORT_SYMBOL(__skb_vlan_pop);
+
+/* Pop a vlan tag either from hwaccel or from payload.
+ * Expects skb->data at mac header.
+ */
+int skb_vlan_pop(struct sk_buff *skb)
+{
+ u16 vlan_tci;
+ __be16 vlan_proto;
+ int err;
+
+ if (likely(skb_vlan_tag_present(skb))) {
+ skb->vlan_tci = 0;
+ } else {
+ if (unlikely(!eth_type_vlan(skb->protocol)))
+ return 0;
+
+ err = __skb_vlan_pop(skb, &vlan_tci);
+ if (err)
+ return err;
+ }
+ /* move next vlan tag to hw accel tag */
+ if (likely(!eth_type_vlan(skb->protocol)))
+ return 0;
+
+ vlan_proto = skb->protocol;
+ err = __skb_vlan_pop(skb, &vlan_tci);
+ if (unlikely(err))
+ return err;
+
+ __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
+ return 0;
+}
+EXPORT_SYMBOL(skb_vlan_pop);
+
+/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present).
+ * Expects skb->data at mac header.
+ */
+int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
+{
+ if (skb_vlan_tag_present(skb)) {
+ int offset = skb->data - skb_mac_header(skb);
+ int err;
+
+ if (WARN_ONCE(offset,
+ "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n",
+ offset)) {
+ return -EINVAL;
+ }
+
+ err = __vlan_insert_tag(skb, skb->vlan_proto,
+ skb_vlan_tag_get(skb));
+ if (err)
+ return err;
+
+ skb->protocol = skb->vlan_proto;
+ skb->mac_len += VLAN_HLEN;
+
+ skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
+ }
+ __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
+ return 0;
+}
+EXPORT_SYMBOL(skb_vlan_push);
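+
+/* Editor's note: illustrative sketch, not part of the original patch.
+ * tc/OVS-style VLAN rewrite: pop whatever tag is present (hwaccel or in the
+ * payload), then push a fresh 802.1Q tag. Both helpers expect skb->data at
+ * the mac header. The function name is hypothetical.
+ */
+static int __maybe_unused example_rewrite_vlan(struct sk_buff *skb, u16 new_vid)
+{
+ int err = skb_vlan_pop(skb);
+
+ if (err)
+ return err;
+ return skb_vlan_push(skb, htons(ETH_P_8021Q), new_vid);
+}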
+
+/**
+ * alloc_skb_with_frags - allocate skb with page frags
+ *
+ * @header_len: size of linear part
+ * @data_len: needed length in frags
+ * @max_page_order: max page order desired.
+ * @errcode: pointer to error code if any
+ * @gfp_mask: allocation mask
+ *
+ * This can be used to allocate a paged skb, given a maximal order for frags.
+ */
+struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
+ unsigned long data_len,
+ int max_page_order,
+ int *errcode,
+ gfp_t gfp_mask)
+{
+ int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+ unsigned long chunk;
+ struct sk_buff *skb;
+ struct page *page;
+ int i;
+
+ *errcode = -EMSGSIZE;
+ /* Note this test could be relaxed, if we succeed in allocating
+ * high-order pages...
+ */
+ if (npages > MAX_SKB_FRAGS)
+ return NULL;
+
+ *errcode = -ENOBUFS;
+ skb = alloc_skb(header_len, gfp_mask);
+ if (!skb)
+ return NULL;
+
+ skb->truesize += npages << PAGE_SHIFT;
+
+ for (i = 0; npages > 0; i++) {
+ int order = max_page_order;
+
+ while (order) {
+ if (npages >= 1 << order) {
+ page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
+ __GFP_COMP |
+ __GFP_NOWARN,
+ order);
+ if (page)
+ goto fill_page;
+ /* Do not retry other high order allocations */
+ order = 1;
+ max_page_order = 0;
+ }
+ order--;
+ }
+ page = alloc_page(gfp_mask);
+ if (!page)
+ goto failure;
+fill_page:
+ chunk = min_t(unsigned long, data_len,
+ PAGE_SIZE << order);
+ skb_fill_page_desc(skb, i, page, 0, chunk);
+ data_len -= chunk;
+ npages -= 1 << order;
+ }
+ return skb;
+
+failure:
+ kfree_skb(skb);
+ return NULL;
+}
+EXPORT_SYMBOL(alloc_skb_with_frags);
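+
+/* Editor's note: illustrative sketch, not part of the original patch.
+ * Allocate an skb with a small linear header and the payload spread over
+ * page frags of at most order 3, roughly as the generic socket send path
+ * does. Note that alloc_skb_with_frags() attaches the frags but leaves
+ * skb->len/data_len for the caller to account once data is copied in.
+ */
+static __maybe_unused struct sk_buff *example_alloc_paged(unsigned long header,
+ unsigned long payload)
+{
+ struct sk_buff *skb;
+ int errcode;
+
+ skb = alloc_skb_with_frags(header, payload, 3, &errcode, GFP_KERNEL);
+ if (!skb)
+ return ERR_PTR(errcode);
+ return skb;
+}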
+
+/* carve out the first off bytes from skb when off < headlen */
+static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
+ const int headlen, gfp_t gfp_mask)
+{
+ int i;
+ int size = skb_end_offset(skb);
+ int new_hlen = headlen - off;
+ u8 *data;
+
+ size = SKB_DATA_ALIGN(size);
+
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+ data = kmalloc_reserve(size +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+ gfp_mask, NUMA_NO_NODE, NULL);
+ if (!data)
+ return -ENOMEM;
+
+ size = SKB_WITH_OVERHEAD(ksize(data));
+
+ /* Copy real data, and all frags */
+ skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
+ skb->len -= off;
+
+ memcpy((struct skb_shared_info *)(data + size),
+ skb_shinfo(skb),
+ offsetof(struct skb_shared_info,
+ frags[skb_shinfo(skb)->nr_frags]));
+ if (skb_cloned(skb)) {
+ /* drop the old head gracefully */
+ if (skb_orphan_frags(skb, gfp_mask)) {
+ kfree(data);
+ return -ENOMEM;
+ }
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ skb_frag_ref(skb, i);
+ if (skb_has_frag_list(skb))
+ skb_clone_fraglist(skb);
+ skb_release_data(skb);
+ } else {
+ /* we can reuse the existing refcount - all we did was
+ * relocate values
+ */
+ skb_free_head(skb);
+ }
+
+ skb->head = data;
+ skb->data = data;
+ skb->head_frag = 0;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ skb->end = size;
+#else
+ skb->end = skb->head + size;
+#endif
+ skb_set_tail_pointer(skb, skb_headlen(skb));
+ skb_headers_offset_update(skb, 0);
+ skb->cloned = 0;
+ skb->hdr_len = 0;
+ skb->nohdr = 0;
+ atomic_set(&skb_shinfo(skb)->dataref, 1);
+
+ return 0;
+}
+
+static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);
+
+/* carve out the first eat bytes from skb's frag_list. May recurse into
+ * pskb_carve()
+ */
+static int pskb_carve_frag_list(struct sk_buff *skb,
+ struct skb_shared_info *shinfo, int eat,
+ gfp_t gfp_mask)
+{
+ struct sk_buff *list = shinfo->frag_list;
+ struct sk_buff *clone = NULL;
+ struct sk_buff *insp = NULL;
+
+ do {
+ if (!list) {
+ pr_err("Not enough bytes to eat. Want %d\n", eat);
+ return -EFAULT;
+ }
+ if (list->len <= eat) {
+ /* Eaten as whole. */
+ eat -= list->len;
+ list = list->next;
+ insp = list;
+ } else {
+ /* Eaten partially. */
+ if (skb_shared(list)) {
+ clone = skb_clone(list, gfp_mask);
+ if (!clone)
+ return -ENOMEM;
+ insp = list->next;
+ list = clone;
+ } else {
+ /* This may be pulled without problems. */
+ insp = list;
+ }
+ if (pskb_carve(list, eat, gfp_mask) < 0) {
+ kfree_skb(clone);
+ return -ENOMEM;
+ }
+ break;
+ }
+ } while (eat);
+
+ /* Free pulled out fragments. */
+ while ((list = shinfo->frag_list) != insp) {
+ shinfo->frag_list = list->next;
+ consume_skb(list);
+ }
+ /* And insert new clone at head. */
+ if (clone) {
+ clone->next = list;
+ shinfo->frag_list = clone;
+ }
+ return 0;
+}
+
+/* carve off first len bytes from skb. Split line (off) is in the
+ * non-linear part of skb
+ */
+static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
+ int pos, gfp_t gfp_mask)
+{
+ int i, k = 0;
+ int size = skb_end_offset(skb);
+ u8 *data;
+ const int nfrags = skb_shinfo(skb)->nr_frags;
+ struct skb_shared_info *shinfo;
+
+ size = SKB_DATA_ALIGN(size);
+
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+ data = kmalloc_reserve(size +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+ gfp_mask, NUMA_NO_NODE, NULL);
+ if (!data)
+ return -ENOMEM;
+
+ size = SKB_WITH_OVERHEAD(ksize(data));
+
+ memcpy((struct skb_shared_info *)(data + size),
+ skb_shinfo(skb), offsetof(struct skb_shared_info,
+ frags[skb_shinfo(skb)->nr_frags]));
+ if (skb_orphan_frags(skb, gfp_mask)) {
+ kfree(data);
+ return -ENOMEM;
+ }
+ shinfo = (struct skb_shared_info *)(data + size);
+ for (i = 0; i < nfrags; i++) {
+ int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (pos + fsize > off) {
+ shinfo->frags[k] = skb_shinfo(skb)->frags[i];
+
+ if (pos < off) {
+ /* Split frag.
+ * We have two variants in this case:
+ * 1. Move the whole frag to the second
+ * part, if possible. E.g.
+ * this approach is mandatory for TUX,
+ * where splitting is expensive.
+ * 2. Split accurately. This is what we do.
+ */
+ shinfo->frags[0].page_offset += off - pos;
+ skb_frag_size_sub(&shinfo->frags[0], off - pos);
+ }
+ skb_frag_ref(skb, i);
+ k++;
+ }
+ pos += fsize;
+ }
+ shinfo->nr_frags = k;
+ if (skb_has_frag_list(skb))
+ skb_clone_fraglist(skb);
+
+ /* split line is in frag list */
+ if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) {
+ /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
+ if (skb_has_frag_list(skb))
+ kfree_skb_list(skb_shinfo(skb)->frag_list);
+ kfree(data);
+ return -ENOMEM;
+ }
+ skb_release_data(skb);
+
+ skb->head = data;
+ skb->head_frag = 0;
+ skb->data = data;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ skb->end = size;
+#else
+ skb->end = skb->head + size;
+#endif
+ skb_reset_tail_pointer(skb);
+ skb_headers_offset_update(skb, 0);
+ skb->cloned = 0;
+ skb->hdr_len = 0;
+ skb->nohdr = 0;
+ skb->len -= off;
+ skb->data_len = skb->len;
+ atomic_set(&skb_shinfo(skb)->dataref, 1);
+ return 0;
+}
+
+/* remove len bytes from the beginning of the skb */
+static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
+{
+ int headlen = skb_headlen(skb);
+
+ if (len < headlen)
+ return pskb_carve_inside_header(skb, len, headlen, gfp);
+ else
+ return pskb_carve_inside_nonlinear(skb, len, headlen, gfp);
+}
+
+/* Extract to_copy bytes starting at off from skb, and return this in
+ * a new skb
+ */
+struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
+ int to_copy, gfp_t gfp)
+{
+ struct sk_buff *clone = skb_clone(skb, gfp);
+
+ if (!clone)
+ return NULL;
+
+ if (pskb_carve(clone, off, gfp) < 0 ||
+ pskb_trim(clone, to_copy)) {
+ kfree_skb(clone);
+ return NULL;
+ }
+ return clone;
+}
+EXPORT_SYMBOL(pskb_extract);
+
+/**
+ * skb_condense - try to get rid of fragments/frag_list if possible
+ * @skb: buffer
+ *
+ * Can be used to save memory before skb is added to a busy queue.
+ * If packet has bytes in frags and enough tail room in skb->head,
+ * pull all of them, so that we can free the frags right now and adjust
+ * truesize.
+ * Notes:
+ * We do not reallocate skb->head, thus this cannot fail.
+ * Caller must re-evaluate skb->truesize if needed.
+ */
+void skb_condense(struct sk_buff *skb)
+{
+ if (skb->data_len) {
+ if (skb->data_len > skb->end - skb->tail ||
+ skb_cloned(skb))
+ return;
+
+ /* Nice, we can free page frag(s) right now */
+ __pskb_pull_tail(skb, skb->data_len);
+ }
+ /* At this point, skb->truesize might be overestimated,
+ * because the skb had a fragment, and fragments do not tell
+ * their truesize.
+ * When we pulled its content into skb->head, the fragment
+ * was freed, but __pskb_pull_tail() could not possibly
+ * adjust skb->truesize, not knowing the frag truesize.
+ */
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+}
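+
+/* Editor's note: illustrative sketch, not part of the original patch.
+ * Before parking an skb on a potentially long queue (e.g. an out-of-order
+ * queue), condense it and let the caller re-charge socket memory from the
+ * possibly reduced truesize, as the note above requires.
+ */
+static unsigned int __maybe_unused example_condense_for_queue(struct sk_buff *skb)
+{
+ skb_condense(skb);
+ return skb->truesize; /* caller re-evaluates the rmem charge */
+}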
diff --git a/net/core/sock.c b/net/core/sock.c
new file mode 100644
index 000000000..79f085df5
--- /dev/null
+++ b/net/core/sock.c
@@ -0,0 +1,3534 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Generic socket support routines. Memory allocators, socket lock/release
+ * handler for protocols to use and generic option handler.
+ *
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Alan Cox, <A.Cox@swansea.ac.uk>
+ *
+ * Fixes:
+ * Alan Cox : Numerous verify_area() problems
+ * Alan Cox : Connecting on a connecting socket
+ * now returns an error for tcp.
+ * Alan Cox : sock->protocol is set correctly.
+ * and is not sometimes left as 0.
+ * Alan Cox : connect handles icmp errors on a
+ * connect properly. Unfortunately there
+ * is a restart syscall nasty there. I
+ * can't match BSD without hacking the C
+ * library. Ideas urgently sought!
+ * Alan Cox : Disallow bind() to addresses that are
+ * not ours - especially broadcast ones!!
+ * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
+ * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
+ * instead they leave that for the DESTROY timer.
+ * Alan Cox : Clean up error flag in accept
+ * Alan Cox : TCP ack handling is buggy, the DESTROY timer
+ * was buggy. Put a remove_sock() in the handler
+ * for memory when we hit 0. Also altered the timer
+ * code. The ACK stuff can wait and needs major
+ * TCP layer surgery.
+ * Alan Cox : Fixed TCP ack bug, removed remove sock
+ * and fixed timer/inet_bh race.
+ * Alan Cox : Added zapped flag for TCP
+ * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
+ * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
+ * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
+ * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
+ * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
+ * Rick Sladkey : Relaxed UDP rules for matching packets.
+ * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
+ * Pauline Middelink : identd support
+ * Alan Cox : Fixed connect() taking signals I think.
+ * Alan Cox : SO_LINGER supported
+ * Alan Cox : Error reporting fixes
+ * Anonymous : inet_create tidied up (sk->reuse setting)
+ * Alan Cox : inet sockets don't set sk->type!
+ * Alan Cox : Split socket option code
+ * Alan Cox : Callbacks
+ * Alan Cox : Nagle flag for Charles & Johannes stuff
+ * Alex : Removed restriction on inet fioctl
+ * Alan Cox : Splitting INET from NET core
+ * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
+ * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
+ * Alan Cox : Split IP from generic code
+ * Alan Cox : New kfree_skbmem()
+ * Alan Cox : Make SO_DEBUG superuser only.
+ * Alan Cox : Allow anyone to clear SO_DEBUG
+ * (compatibility fix)
+ * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
+ * Alan Cox : Allocator for a socket is settable.
+ * Alan Cox : SO_ERROR includes soft errors.
+ * Alan Cox : Allow NULL arguments on some SO_ opts
+ * Alan Cox : Generic socket allocation to make hooks
+ * easier (suggested by Craig Metz).
+ * Michael Pall : SO_ERROR returns positive errno again
+ * Steve Whitehouse: Added default destructor to free
+ * protocol private data.
+ * Steve Whitehouse: Added various other default routines
+ * common to several socket families.
+ * Chris Evans : Call suser() check last on F_SETOWN
+ * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
+ * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
+ * Andi Kleen : Fix write_space callback
+ * Chris Evans : Security fixes - signedness again
+ * Arnaldo C. Melo : cleanups, use skb_queue_purge
+ *
+ * To Fix:
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <asm/unaligned.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/errqueue.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/poll.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/user_namespace.h>
+#include <linux/static_key.h>
+#include <linux/memcontrol.h>
+#include <linux/prefetch.h>
+
+#include <linux/uaccess.h>
+
+#include <linux/netdevice.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/net_namespace.h>
+#include <net/request_sock.h>
+#include <net/sock.h>
+#include <linux/net_tstamp.h>
+#include <net/xfrm.h>
+#include <linux/ipsec.h>
+#include <net/cls_cgroup.h>
+#include <net/netprio_cgroup.h>
+#include <linux/sock_diag.h>
+
+#include <linux/filter.h>
+#include <net/sock_reuseport.h>
+
+#include <trace/events/sock.h>
+
+#include <net/tcp.h>
+#include <net/busy_poll.h>
+
+static DEFINE_MUTEX(proto_list_mutex);
+static LIST_HEAD(proto_list);
+
+static void sock_inuse_add(struct net *net, int val);
+
+/**
+ * sk_ns_capable - General socket capability test
+ * @sk: Socket to use a capability on or through
+ * @user_ns: The user namespace of the capability to use
+ * @cap: The capability to use
+ *
+ * Test to see if the opener of the socket had the capability @cap when the
+ * socket was created and if the current process has it in the user
+ * namespace @user_ns.
+ */
+bool sk_ns_capable(const struct sock *sk,
+ struct user_namespace *user_ns, int cap)
+{
+ return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
+ ns_capable(user_ns, cap);
+}
+EXPORT_SYMBOL(sk_ns_capable);
+
+/**
+ * sk_capable - Socket global capability test
+ * @sk: Socket to use a capability on or through
+ * @cap: The global capability to use
+ *
+ * Test to see if the opener of the socket had the capability @cap when the
+ * socket was created and if the current process has it in all user
+ * namespaces.
+ */
+bool sk_capable(const struct sock *sk, int cap)
+{
+ return sk_ns_capable(sk, &init_user_ns, cap);
+}
+EXPORT_SYMBOL(sk_capable);
+
+/**
+ * sk_net_capable - Network namespace socket capability test
+ * @sk: Socket to use a capability on or through
+ * @cap: The capability to use
+ *
+ * Test to see if the opener of the socket had the capability @cap when the
+ * socket was created and if the current process has it over the network
+ * namespace the socket is a member of.
+ */
+bool sk_net_capable(const struct sock *sk, int cap)
+{
+ return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
+}
+EXPORT_SYMBOL(sk_net_capable);
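+
+/* Editor's note: illustrative sketch, not part of the original patch.
+ * Typical privileged-option check: require that the socket's opener had,
+ * and the current task has, CAP_NET_ADMIN in the socket's network
+ * namespace. The function name is hypothetical.
+ */
+static int __maybe_unused example_priv_check(struct sock *sk)
+{
+ if (!sk_net_capable(sk, CAP_NET_ADMIN))
+ return -EPERM;
+ return 0;
+}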
+
+/*
+ * Each address family might have different locking rules, so we have
+ * one slock key per address family and separate keys for internal and
+ * userspace sockets.
+ */
+static struct lock_class_key af_family_keys[AF_MAX];
+static struct lock_class_key af_family_kern_keys[AF_MAX];
+static struct lock_class_key af_family_slock_keys[AF_MAX];
+static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
+
+/*
+ * Make lock validator output more readable. (we pre-construct these
+ * strings build-time, so that runtime initialization of socket
+ * locks is fast):
+ */
+
+#define _sock_locks(x) \
+ x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
+ x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
+ x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
+ x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
+ x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
+ x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
+ x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
+ x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
+ x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
+ x "27" , x "28" , x "AF_CAN" , \
+ x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
+ x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
+ x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
+ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
+ x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
+ x "AF_MAX"
+
+static const char *const af_family_key_strings[AF_MAX+1] = {
+ _sock_locks("sk_lock-")
+};
+static const char *const af_family_slock_key_strings[AF_MAX+1] = {
+ _sock_locks("slock-")
+};
+static const char *const af_family_clock_key_strings[AF_MAX+1] = {
+ _sock_locks("clock-")
+};
+
+static const char *const af_family_kern_key_strings[AF_MAX+1] = {
+ _sock_locks("k-sk_lock-")
+};
+static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
+ _sock_locks("k-slock-")
+};
+static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
+ _sock_locks("k-clock-")
+};
+static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
+ _sock_locks("rlock-")
+};
+static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
+ _sock_locks("wlock-")
+};
+static const char *const af_family_elock_key_strings[AF_MAX+1] = {
+ _sock_locks("elock-")
+};
+
+/*
+ * sk_callback_lock and sk queues locking rules are per-address-family,
+ * so split the lock classes by using a per-AF key:
+ */
+static struct lock_class_key af_callback_keys[AF_MAX];
+static struct lock_class_key af_rlock_keys[AF_MAX];
+static struct lock_class_key af_wlock_keys[AF_MAX];
+static struct lock_class_key af_elock_keys[AF_MAX];
+static struct lock_class_key af_kern_callback_keys[AF_MAX];
+
+/* Run time adjustable parameters. */
+__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
+EXPORT_SYMBOL(sysctl_wmem_max);
+__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
+EXPORT_SYMBOL(sysctl_rmem_max);
+__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
+__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
+
+/* Maximal space eaten by iovec or ancillary data plus some space */
+int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
+EXPORT_SYMBOL(sysctl_optmem_max);
+
+int sysctl_tstamp_allow_data __read_mostly = 1;
+
+DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
+EXPORT_SYMBOL_GPL(memalloc_socks_key);
+
+/**
+ * sk_set_memalloc - sets %SOCK_MEMALLOC
+ * @sk: socket to set it on
+ *
+ * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
+ * It's the responsibility of the admin to adjust min_free_kbytes
+ * to meet the requirements.
+ */
+void sk_set_memalloc(struct sock *sk)
+{
+ sock_set_flag(sk, SOCK_MEMALLOC);
+ sk->sk_allocation |= __GFP_MEMALLOC;
+ static_branch_inc(&memalloc_socks_key);
+}
+EXPORT_SYMBOL_GPL(sk_set_memalloc);
+
+void sk_clear_memalloc(struct sock *sk)
+{
+ sock_reset_flag(sk, SOCK_MEMALLOC);
+ sk->sk_allocation &= ~__GFP_MEMALLOC;
+ static_branch_dec(&memalloc_socks_key);
+
+ /*
+ * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
+ * progress of swapping. SOCK_MEMALLOC may be cleared while
+ * it has rmem allocations due to the last swapfile being deactivated
+ * but there is a risk that the socket is unusable due to exceeding
+ * the rmem limits. Reclaim the reserves and obey rmem limits again.
+ */
+ sk_mem_reclaim(sk);
+}
+EXPORT_SYMBOL_GPL(sk_clear_memalloc);
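+
+/* Editor's note: illustrative sketch, not part of the original patch.
+ * A swap-over-network style user would flag its transport socket so it can
+ * dip into the emergency reserves, and clear the flag again when the last
+ * swapfile goes away. The function name is hypothetical.
+ */
+static void __maybe_unused example_mark_swap_socket(struct sock *sk, bool enable)
+{
+ if (enable)
+ sk_set_memalloc(sk);
+ else
+ sk_clear_memalloc(sk);
+}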
+
+int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+ unsigned int noreclaim_flag;
+
+ /* these should have been dropped before queueing */
+ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
+
+ noreclaim_flag = memalloc_noreclaim_save();
+ ret = sk->sk_backlog_rcv(sk, skb);
+ memalloc_noreclaim_restore(noreclaim_flag);
+
+ return ret;
+}
+EXPORT_SYMBOL(__sk_backlog_rcv);
+
+static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
+{
+ struct timeval tv;
+
+ if (optlen < sizeof(tv))
+ return -EINVAL;
+ if (copy_from_user(&tv, optval, sizeof(tv)))
+ return -EFAULT;
+ if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
+ return -EDOM;
+
+ if (tv.tv_sec < 0) {
+ static int warned __read_mostly;
+
+ *timeo_p = 0;
+ if (warned < 10 && net_ratelimit()) {
+ warned++;
+ pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
+ __func__, current->comm, task_pid_nr(current));
+ }
+ return 0;
+ }
+ *timeo_p = MAX_SCHEDULE_TIMEOUT;
+ if (tv.tv_sec == 0 && tv.tv_usec == 0)
+ return 0;
+ if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
+ *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
+ return 0;
+}
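+
+/* Worked example for the conversion above (assuming HZ == 1000, i.e. one
+ * jiffy per millisecond): a struct timeval of { .tv_sec = 1, .tv_usec = 500000 }
+ * yields *timeo_p = 1 * 1000 + DIV_ROUND_UP(500000, 1000) = 1500 jiffies,
+ * while { 0, 0 } means "wait forever" and leaves MAX_SCHEDULE_TIMEOUT in place.
+ */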
+
+static void sock_warn_obsolete_bsdism(const char *name)
+{
+ static int warned;
+ static char warncomm[TASK_COMM_LEN];
+ if (strcmp(warncomm, current->comm) && warned < 5) {
+ strcpy(warncomm, current->comm);
+ pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
+ warncomm, name);
+ warned++;
+ }
+}
+
+static bool sock_needs_netstamp(const struct sock *sk)
+{
+ switch (sk->sk_family) {
+ case AF_UNSPEC:
+ case AF_UNIX:
+ return false;
+ default:
+ return true;
+ }
+}
+
+static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
+{
+ if (sk->sk_flags & flags) {
+ sk->sk_flags &= ~flags;
+ if (sock_needs_netstamp(sk) &&
+ !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
+ net_disable_timestamp();
+ }
+}
+
+
+int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ unsigned long flags;
+ struct sk_buff_head *list = &sk->sk_receive_queue;
+
+ if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
+ atomic_inc(&sk->sk_drops);
+ trace_sock_rcvqueue_full(sk, skb);
+ return -ENOMEM;
+ }
+
+ if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
+ atomic_inc(&sk->sk_drops);
+ return -ENOBUFS;
+ }
+
+ skb->dev = NULL;
+ skb_set_owner_r(skb, sk);
+
+	/* We escape from the RCU-protected region here, so make sure
+	 * we don't leak an unrefcounted dst.
+	 */
+ skb_dst_force(skb);
+
+ spin_lock_irqsave(&list->lock, flags);
+ sock_skb_set_dropcount(sk, skb);
+ __skb_queue_tail(list, skb);
+ spin_unlock_irqrestore(&list->lock, flags);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+ return 0;
+}
+EXPORT_SYMBOL(__sock_queue_rcv_skb);
+
+int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ int err;
+
+ err = sk_filter(sk, skb);
+ if (err)
+ return err;
+
+ return __sock_queue_rcv_skb(sk, skb);
+}
+EXPORT_SYMBOL(sock_queue_rcv_skb);
+
+int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
+ const int nested, unsigned int trim_cap, bool refcounted)
+{
+ int rc = NET_RX_SUCCESS;
+
+ if (sk_filter_trim_cap(sk, skb, trim_cap))
+ goto discard_and_relse;
+
+ skb->dev = NULL;
+
+ if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
+ atomic_inc(&sk->sk_drops);
+ goto discard_and_relse;
+ }
+ if (nested)
+ bh_lock_sock_nested(sk);
+ else
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ /*
+ * trylock + unlock semantics:
+ */
+ mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
+
+ rc = sk_backlog_rcv(sk, skb);
+
+ mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
+ } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
+ bh_unlock_sock(sk);
+ atomic_inc(&sk->sk_drops);
+ goto discard_and_relse;
+ }
+
+ bh_unlock_sock(sk);
+out:
+ if (refcounted)
+ sock_put(sk);
+ return rc;
+discard_and_relse:
+ kfree_skb(skb);
+ goto out;
+}
+EXPORT_SYMBOL(__sk_receive_skb);
+
+struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+ sk_tx_queue_clear(sk);
+ sk->sk_dst_pending_confirm = 0;
+ RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
+ dst_release(dst);
+ return NULL;
+ }
+
+ return dst;
+}
+EXPORT_SYMBOL(__sk_dst_check);
+
+struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
+{
+ struct dst_entry *dst = sk_dst_get(sk);
+
+ if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+ sk_dst_reset(sk);
+ dst_release(dst);
+ return NULL;
+ }
+
+ return dst;
+}
+EXPORT_SYMBOL(sk_dst_check);
+
+static int sock_setbindtodevice(struct sock *sk, char __user *optval,
+ int optlen)
+{
+ int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+ struct net *net = sock_net(sk);
+ char devname[IFNAMSIZ];
+ int index;
+
+ /* Sorry... */
+ ret = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_RAW))
+ goto out;
+
+ ret = -EINVAL;
+ if (optlen < 0)
+ goto out;
+
+ /* Bind this socket to a particular device like "eth0",
+ * as specified in the passed interface name. If the
+ * name is "" or the option length is zero the socket
+ * is not bound.
+ */
+ if (optlen > IFNAMSIZ - 1)
+ optlen = IFNAMSIZ - 1;
+ memset(devname, 0, sizeof(devname));
+
+ ret = -EFAULT;
+ if (copy_from_user(devname, optval, optlen))
+ goto out;
+
+ index = 0;
+ if (devname[0] != '\0') {
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_name_rcu(net, devname);
+ if (dev)
+ index = dev->ifindex;
+ rcu_read_unlock();
+ ret = -ENODEV;
+ if (!dev)
+ goto out;
+ }
+
+ lock_sock(sk);
+ sk->sk_bound_dev_if = index;
+ sk_dst_reset(sk);
+ release_sock(sk);
+
+ ret = 0;
+
+out:
+#endif
+
+ return ret;
+}
+
+static int sock_getbindtodevice(struct sock *sk, char __user *optval,
+ int __user *optlen, int len)
+{
+ int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+ struct net *net = sock_net(sk);
+ char devname[IFNAMSIZ];
+
+ if (sk->sk_bound_dev_if == 0) {
+ len = 0;
+ goto zero;
+ }
+
+ ret = -EINVAL;
+ if (len < IFNAMSIZ)
+ goto out;
+
+ ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
+ if (ret)
+ goto out;
+
+ len = strlen(devname) + 1;
+
+ ret = -EFAULT;
+ if (copy_to_user(optval, devname, len))
+ goto out;
+
+zero:
+ ret = -EFAULT;
+ if (put_user(len, optlen))
+ goto out;
+
+ ret = 0;
+
+out:
+#endif
+
+ return ret;
+}
+
+static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
+{
+ if (valbool)
+ sock_set_flag(sk, bit);
+ else
+ sock_reset_flag(sk, bit);
+}
+
+bool sk_mc_loop(struct sock *sk)
+{
+ if (dev_recursion_level())
+ return false;
+ if (!sk)
+ return true;
+ switch (sk->sk_family) {
+ case AF_INET:
+ return inet_sk(sk)->mc_loop;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ return inet6_sk(sk)->mc_loop;
+#endif
+ }
+ WARN_ON_ONCE(1);
+ return true;
+}
+EXPORT_SYMBOL(sk_mc_loop);
+
+/*
+ * This is meant for all protocols to use and covers goings on
+ * at the socket level. Everything here is generic.
+ */
+
+int sock_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock_txtime sk_txtime;
+ struct sock *sk = sock->sk;
+ int val;
+ int valbool;
+ struct linger ling;
+ int ret = 0;
+
+ /*
+ * Options without arguments
+ */
+
+ if (optname == SO_BINDTODEVICE)
+ return sock_setbindtodevice(sk, optval, optlen);
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ valbool = val ? 1 : 0;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case SO_DEBUG:
+ if (val && !capable(CAP_NET_ADMIN))
+ ret = -EACCES;
+ else
+ sock_valbool_flag(sk, SOCK_DBG, valbool);
+ break;
+ case SO_REUSEADDR:
+ sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
+ break;
+ case SO_REUSEPORT:
+ sk->sk_reuseport = valbool;
+ break;
+ case SO_TYPE:
+ case SO_PROTOCOL:
+ case SO_DOMAIN:
+ case SO_ERROR:
+ ret = -ENOPROTOOPT;
+ break;
+ case SO_DONTROUTE:
+ sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
+ sk_dst_reset(sk);
+ break;
+ case SO_BROADCAST:
+ sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
+ break;
+ case SO_SNDBUF:
+		/* Don't error on this; BSD doesn't, and if you think
+		 * about it this is right. Otherwise apps have to
+		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
+		 * are treated in BSD as hints.
+		 */
+ val = min_t(u32, val, sysctl_wmem_max);
+set_sndbuf:
+ sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
+ /* Wake up sending tasks if we upped the value. */
+ sk->sk_write_space(sk);
+ break;
+
+ case SO_SNDBUFFORCE:
+ if (!capable(CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
+ }
+ goto set_sndbuf;
+
+ case SO_RCVBUF:
+		/* Don't error on this; BSD doesn't, and if you think
+		 * about it this is right. Otherwise apps have to
+		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
+		 * are treated in BSD as hints.
+		 */
+ val = min_t(u32, val, sysctl_rmem_max);
+set_rcvbuf:
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ /*
+ * We double it on the way in to account for
+ * "struct sk_buff" etc. overhead. Applications
+ * assume that the SO_RCVBUF setting they make will
+ * allow that much actual data to be received on that
+ * socket.
+ *
+ * Applications are unaware that "struct sk_buff" and
+ * other overheads allocate from the receive buffer
+ * during socket buffer allocation.
+ *
+ * And after considering the possible alternatives,
+ * returning the value we actually used in getsockopt
+ * is the most desirable behavior.
+ */
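+		/* Illustrative example (assuming 65536 is below sysctl_rmem_max):
+		 * setsockopt(SO_RCVBUF, 65536) stores sk_rcvbuf = 131072, and a
+		 * later getsockopt(SO_RCVBUF) reports 131072 as well.
+		 */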
+ sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
+ break;
+
+ case SO_RCVBUFFORCE:
+ if (!capable(CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
+ }
+ goto set_rcvbuf;
+
+ case SO_KEEPALIVE:
+ if (sk->sk_prot->keepalive)
+ sk->sk_prot->keepalive(sk, valbool);
+ sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
+ break;
+
+ case SO_OOBINLINE:
+ sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
+ break;
+
+ case SO_NO_CHECK:
+ sk->sk_no_check_tx = valbool;
+ break;
+
+ case SO_PRIORITY:
+ if ((val >= 0 && val <= 6) ||
+ ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ sk->sk_priority = val;
+ else
+ ret = -EPERM;
+ break;
+
+ case SO_LINGER:
+ if (optlen < sizeof(ling)) {
+ ret = -EINVAL; /* 1003.1g */
+ break;
+ }
+ if (copy_from_user(&ling, optval, sizeof(ling))) {
+ ret = -EFAULT;
+ break;
+ }
+ if (!ling.l_onoff)
+ sock_reset_flag(sk, SOCK_LINGER);
+ else {
+#if (BITS_PER_LONG == 32)
+ if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
+ sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
+ else
+#endif
+ sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
+ sock_set_flag(sk, SOCK_LINGER);
+ }
+ break;
+
+ case SO_BSDCOMPAT:
+ sock_warn_obsolete_bsdism("setsockopt");
+ break;
+
+ case SO_PASSCRED:
+ if (valbool)
+ set_bit(SOCK_PASSCRED, &sock->flags);
+ else
+ clear_bit(SOCK_PASSCRED, &sock->flags);
+ break;
+
+ case SO_TIMESTAMP:
+ case SO_TIMESTAMPNS:
+ if (valbool) {
+ if (optname == SO_TIMESTAMP)
+ sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
+ else
+ sock_set_flag(sk, SOCK_RCVTSTAMPNS);
+ sock_set_flag(sk, SOCK_RCVTSTAMP);
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+ } else {
+ sock_reset_flag(sk, SOCK_RCVTSTAMP);
+ sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
+ }
+ break;
+
+ case SO_TIMESTAMPING:
+ if (val & ~SOF_TIMESTAMPING_MASK) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (val & SOF_TIMESTAMPING_OPT_ID &&
+ !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
+ if (sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_type == SOCK_STREAM) {
+ if ((1 << sk->sk_state) &
+ (TCPF_CLOSE | TCPF_LISTEN)) {
+ ret = -EINVAL;
+ break;
+ }
+ sk->sk_tskey = tcp_sk(sk)->snd_una;
+ } else {
+ sk->sk_tskey = 0;
+ }
+ }
+
+ if (val & SOF_TIMESTAMPING_OPT_STATS &&
+ !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ sk->sk_tsflags = val;
+ if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
+ sock_enable_timestamp(sk,
+ SOCK_TIMESTAMPING_RX_SOFTWARE);
+ else
+ sock_disable_timestamp(sk,
+ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
+ break;
+
+ case SO_RCVLOWAT:
+ if (val < 0)
+ val = INT_MAX;
+ if (sock->ops->set_rcvlowat)
+ ret = sock->ops->set_rcvlowat(sk, val);
+ else
+ sk->sk_rcvlowat = val ? : 1;
+ break;
+
+ case SO_RCVTIMEO:
+ ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
+ break;
+
+ case SO_SNDTIMEO:
+ ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
+ break;
+
+ case SO_ATTACH_FILTER:
+ ret = -EINVAL;
+ if (optlen == sizeof(struct sock_fprog)) {
+ struct sock_fprog fprog;
+
+ ret = -EFAULT;
+ if (copy_from_user(&fprog, optval, sizeof(fprog)))
+ break;
+
+ ret = sk_attach_filter(&fprog, sk);
+ }
+ break;
+
+ case SO_ATTACH_BPF:
+ ret = -EINVAL;
+ if (optlen == sizeof(u32)) {
+ u32 ufd;
+
+ ret = -EFAULT;
+ if (copy_from_user(&ufd, optval, sizeof(ufd)))
+ break;
+
+ ret = sk_attach_bpf(ufd, sk);
+ }
+ break;
+
+ case SO_ATTACH_REUSEPORT_CBPF:
+ ret = -EINVAL;
+ if (optlen == sizeof(struct sock_fprog)) {
+ struct sock_fprog fprog;
+
+ ret = -EFAULT;
+ if (copy_from_user(&fprog, optval, sizeof(fprog)))
+ break;
+
+ ret = sk_reuseport_attach_filter(&fprog, sk);
+ }
+ break;
+
+ case SO_ATTACH_REUSEPORT_EBPF:
+ ret = -EINVAL;
+ if (optlen == sizeof(u32)) {
+ u32 ufd;
+
+ ret = -EFAULT;
+ if (copy_from_user(&ufd, optval, sizeof(ufd)))
+ break;
+
+ ret = sk_reuseport_attach_bpf(ufd, sk);
+ }
+ break;
+
+ case SO_DETACH_FILTER:
+ ret = sk_detach_filter(sk);
+ break;
+
+ case SO_LOCK_FILTER:
+ if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
+ ret = -EPERM;
+ else
+ sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
+ break;
+
+ case SO_PASSSEC:
+ if (valbool)
+ set_bit(SOCK_PASSSEC, &sock->flags);
+ else
+ clear_bit(SOCK_PASSSEC, &sock->flags);
+ break;
+ case SO_MARK:
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ ret = -EPERM;
+ else
+ sk->sk_mark = val;
+ break;
+
+ case SO_RXQ_OVFL:
+ sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
+ break;
+
+ case SO_WIFI_STATUS:
+ sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
+ break;
+
+ case SO_PEEK_OFF:
+ if (sock->ops->set_peek_off)
+ ret = sock->ops->set_peek_off(sk, val);
+ else
+ ret = -EOPNOTSUPP;
+ break;
+
+ case SO_NOFCS:
+ sock_valbool_flag(sk, SOCK_NOFCS, valbool);
+ break;
+
+ case SO_SELECT_ERR_QUEUE:
+ sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
+ break;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ case SO_BUSY_POLL:
+ /* allow unprivileged users to decrease the value */
+ if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
+ ret = -EPERM;
+ else {
+ if (val < 0)
+ ret = -EINVAL;
+ else
+ WRITE_ONCE(sk->sk_ll_usec, val);
+ }
+ break;
+#endif
+
+ case SO_MAX_PACING_RATE:
+ if (val != ~0U)
+ cmpxchg(&sk->sk_pacing_status,
+ SK_PACING_NONE,
+ SK_PACING_NEEDED);
+ sk->sk_max_pacing_rate = val;
+ sk->sk_pacing_rate = min(sk->sk_pacing_rate,
+ sk->sk_max_pacing_rate);
+ break;
+
+ case SO_INCOMING_CPU:
+ WRITE_ONCE(sk->sk_incoming_cpu, val);
+ break;
+
+ case SO_CNX_ADVICE:
+ if (val == 1)
+ dst_negative_advice(sk);
+ break;
+
+ case SO_ZEROCOPY:
+ if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
+ if (sk->sk_protocol != IPPROTO_TCP)
+ ret = -ENOTSUPP;
+ } else if (sk->sk_family != PF_RDS) {
+ ret = -ENOTSUPP;
+ }
+ if (!ret) {
+ if (val < 0 || val > 1)
+ ret = -EINVAL;
+ else
+ sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
+ }
+ break;
+
+ case SO_TXTIME:
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ } else if (optlen != sizeof(struct sock_txtime)) {
+ ret = -EINVAL;
+ } else if (copy_from_user(&sk_txtime, optval,
+ sizeof(struct sock_txtime))) {
+ ret = -EFAULT;
+ } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
+ ret = -EINVAL;
+ } else {
+ sock_valbool_flag(sk, SOCK_TXTIME, true);
+ sk->sk_clockid = sk_txtime.clockid;
+ sk->sk_txtime_deadline_mode =
+ !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
+ sk->sk_txtime_report_errors =
+ !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
+ }
+ break;
+
+ default:
+ ret = -ENOPROTOOPT;
+ break;
+ }
+ release_sock(sk);
+ return ret;
+}
+EXPORT_SYMBOL(sock_setsockopt);
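+
+/* Illustrative user-space counterpart (not part of this file): most of the
+ * integer-valued options above are reached through a plain setsockopt() call,
+ * e.g.
+ *
+ *	int one = 1;
+ *	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
+ *
+ * which arrives here with optname == SO_REUSEADDR, optlen == sizeof(int)
+ * and valbool == 1.
+ */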
+
+static const struct cred *sk_get_peer_cred(struct sock *sk)
+{
+ const struct cred *cred;
+
+ spin_lock(&sk->sk_peer_lock);
+ cred = get_cred(sk->sk_peer_cred);
+ spin_unlock(&sk->sk_peer_lock);
+
+ return cred;
+}
+
+static void cred_to_ucred(struct pid *pid, const struct cred *cred,
+ struct ucred *ucred)
+{
+ ucred->pid = pid_vnr(pid);
+ ucred->uid = ucred->gid = -1;
+ if (cred) {
+ struct user_namespace *current_ns = current_user_ns();
+
+ ucred->uid = from_kuid_munged(current_ns, cred->euid);
+ ucred->gid = from_kgid_munged(current_ns, cred->egid);
+ }
+}
+
+static int groups_to_user(gid_t __user *dst, const struct group_info *src)
+{
+ struct user_namespace *user_ns = current_user_ns();
+ int i;
+
+ for (i = 0; i < src->ngroups; i++)
+ if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
+ return -EFAULT;
+
+ return 0;
+}
+
+int sock_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+
+ union {
+ int val;
+ u64 val64;
+ struct linger ling;
+ struct timeval tm;
+ struct sock_txtime txtime;
+ } v;
+
+ int lv = sizeof(int);
+ int len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < 0)
+ return -EINVAL;
+
+ memset(&v, 0, sizeof(v));
+
+ switch (optname) {
+ case SO_DEBUG:
+ v.val = sock_flag(sk, SOCK_DBG);
+ break;
+
+ case SO_DONTROUTE:
+ v.val = sock_flag(sk, SOCK_LOCALROUTE);
+ break;
+
+ case SO_BROADCAST:
+ v.val = sock_flag(sk, SOCK_BROADCAST);
+ break;
+
+ case SO_SNDBUF:
+ v.val = sk->sk_sndbuf;
+ break;
+
+ case SO_RCVBUF:
+ v.val = sk->sk_rcvbuf;
+ break;
+
+ case SO_REUSEADDR:
+ v.val = sk->sk_reuse;
+ break;
+
+ case SO_REUSEPORT:
+ v.val = sk->sk_reuseport;
+ break;
+
+ case SO_KEEPALIVE:
+ v.val = sock_flag(sk, SOCK_KEEPOPEN);
+ break;
+
+ case SO_TYPE:
+ v.val = sk->sk_type;
+ break;
+
+ case SO_PROTOCOL:
+ v.val = sk->sk_protocol;
+ break;
+
+ case SO_DOMAIN:
+ v.val = sk->sk_family;
+ break;
+
+ case SO_ERROR:
+ v.val = -sock_error(sk);
+ if (v.val == 0)
+ v.val = xchg(&sk->sk_err_soft, 0);
+ break;
+
+ case SO_OOBINLINE:
+ v.val = sock_flag(sk, SOCK_URGINLINE);
+ break;
+
+ case SO_NO_CHECK:
+ v.val = sk->sk_no_check_tx;
+ break;
+
+ case SO_PRIORITY:
+ v.val = sk->sk_priority;
+ break;
+
+ case SO_LINGER:
+ lv = sizeof(v.ling);
+ v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
+ v.ling.l_linger = sk->sk_lingertime / HZ;
+ break;
+
+ case SO_BSDCOMPAT:
+ sock_warn_obsolete_bsdism("getsockopt");
+ break;
+
+ case SO_TIMESTAMP:
+ v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
+ !sock_flag(sk, SOCK_RCVTSTAMPNS);
+ break;
+
+ case SO_TIMESTAMPNS:
+ v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
+ break;
+
+ case SO_TIMESTAMPING:
+ v.val = sk->sk_tsflags;
+ break;
+
+ case SO_RCVTIMEO:
+ lv = sizeof(struct timeval);
+ if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
+ v.tm.tv_sec = 0;
+ v.tm.tv_usec = 0;
+ } else {
+ v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
+ v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
+ }
+ break;
+
+ case SO_SNDTIMEO:
+ lv = sizeof(struct timeval);
+ if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
+ v.tm.tv_sec = 0;
+ v.tm.tv_usec = 0;
+ } else {
+ v.tm.tv_sec = sk->sk_sndtimeo / HZ;
+ v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
+ }
+ break;
+
+ case SO_RCVLOWAT:
+ v.val = sk->sk_rcvlowat;
+ break;
+
+ case SO_SNDLOWAT:
+ v.val = 1;
+ break;
+
+ case SO_PASSCRED:
+ v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
+ break;
+
+ case SO_PEERCRED:
+ {
+ struct ucred peercred;
+ if (len > sizeof(peercred))
+ len = sizeof(peercred);
+
+ spin_lock(&sk->sk_peer_lock);
+ cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
+ spin_unlock(&sk->sk_peer_lock);
+
+ if (copy_to_user(optval, &peercred, len))
+ return -EFAULT;
+ goto lenout;
+ }
+
+ case SO_PEERGROUPS:
+ {
+ const struct cred *cred;
+ int ret, n;
+
+ cred = sk_get_peer_cred(sk);
+ if (!cred)
+ return -ENODATA;
+
+ n = cred->group_info->ngroups;
+ if (len < n * sizeof(gid_t)) {
+ len = n * sizeof(gid_t);
+ put_cred(cred);
+ return put_user(len, optlen) ? -EFAULT : -ERANGE;
+ }
+ len = n * sizeof(gid_t);
+
+ ret = groups_to_user((gid_t __user *)optval, cred->group_info);
+ put_cred(cred);
+ if (ret)
+ return ret;
+ goto lenout;
+ }
+
+ case SO_PEERNAME:
+ {
+ char address[128];
+
+ lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
+ if (lv < 0)
+ return -ENOTCONN;
+ if (lv < len)
+ return -EINVAL;
+ if (copy_to_user(optval, address, len))
+ return -EFAULT;
+ goto lenout;
+ }
+
+ /* Dubious BSD thing... Probably nobody even uses it, but
+ * the UNIX standard wants it for whatever reason... -DaveM
+ */
+ case SO_ACCEPTCONN:
+ v.val = sk->sk_state == TCP_LISTEN;
+ break;
+
+ case SO_PASSSEC:
+ v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
+ break;
+
+ case SO_PEERSEC:
+ return security_socket_getpeersec_stream(sock, optval, optlen, len);
+
+ case SO_MARK:
+ v.val = sk->sk_mark;
+ break;
+
+ case SO_RXQ_OVFL:
+ v.val = sock_flag(sk, SOCK_RXQ_OVFL);
+ break;
+
+ case SO_WIFI_STATUS:
+ v.val = sock_flag(sk, SOCK_WIFI_STATUS);
+ break;
+
+ case SO_PEEK_OFF:
+ if (!sock->ops->set_peek_off)
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_peek_off;
+ break;
+ case SO_NOFCS:
+ v.val = sock_flag(sk, SOCK_NOFCS);
+ break;
+
+ case SO_BINDTODEVICE:
+ return sock_getbindtodevice(sk, optval, optlen, len);
+
+ case SO_GET_FILTER:
+ len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
+ if (len < 0)
+ return len;
+
+ goto lenout;
+
+ case SO_LOCK_FILTER:
+ v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
+ break;
+
+ case SO_BPF_EXTENSIONS:
+ v.val = bpf_tell_extensions();
+ break;
+
+ case SO_SELECT_ERR_QUEUE:
+ v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
+ break;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ case SO_BUSY_POLL:
+ v.val = sk->sk_ll_usec;
+ break;
+#endif
+
+ case SO_MAX_PACING_RATE:
+ v.val = sk->sk_max_pacing_rate;
+ break;
+
+ case SO_INCOMING_CPU:
+ v.val = READ_ONCE(sk->sk_incoming_cpu);
+ break;
+
+ case SO_MEMINFO:
+ {
+ u32 meminfo[SK_MEMINFO_VARS];
+
+ sk_get_meminfo(sk, meminfo);
+
+ len = min_t(unsigned int, len, sizeof(meminfo));
+ if (copy_to_user(optval, &meminfo, len))
+ return -EFAULT;
+
+ goto lenout;
+ }
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ case SO_INCOMING_NAPI_ID:
+ v.val = READ_ONCE(sk->sk_napi_id);
+
+ /* aggregate non-NAPI IDs down to 0 */
+ if (v.val < MIN_NAPI_ID)
+ v.val = 0;
+
+ break;
+#endif
+
+ case SO_COOKIE:
+ lv = sizeof(u64);
+ if (len < lv)
+ return -EINVAL;
+ v.val64 = sock_gen_cookie(sk);
+ break;
+
+ case SO_ZEROCOPY:
+ v.val = sock_flag(sk, SOCK_ZEROCOPY);
+ break;
+
+ case SO_TXTIME:
+ lv = sizeof(v.txtime);
+ v.txtime.clockid = sk->sk_clockid;
+ v.txtime.flags |= sk->sk_txtime_deadline_mode ?
+ SOF_TXTIME_DEADLINE_MODE : 0;
+ v.txtime.flags |= sk->sk_txtime_report_errors ?
+ SOF_TXTIME_REPORT_ERRORS : 0;
+ break;
+
+ default:
+ /* We implement the SO_SNDLOWAT etc to not be settable
+ * (1003.1g 7).
+ */
+ return -ENOPROTOOPT;
+ }
+
+ if (len > lv)
+ len = lv;
+ if (copy_to_user(optval, &v, len))
+ return -EFAULT;
+lenout:
+ if (put_user(len, optlen))
+ return -EFAULT;
+ return 0;
+}
+
+/*
+ * Initialize an sk_lock.
+ *
+ * (We also register the sk_lock with the lock validator.)
+ */
+static inline void sock_lock_init(struct sock *sk)
+{
+ if (sk->sk_kern_sock)
+ sock_lock_init_class_and_name(
+ sk,
+ af_family_kern_slock_key_strings[sk->sk_family],
+ af_family_kern_slock_keys + sk->sk_family,
+ af_family_kern_key_strings[sk->sk_family],
+ af_family_kern_keys + sk->sk_family);
+ else
+ sock_lock_init_class_and_name(
+ sk,
+ af_family_slock_key_strings[sk->sk_family],
+ af_family_slock_keys + sk->sk_family,
+ af_family_key_strings[sk->sk_family],
+ af_family_keys + sk->sk_family);
+}
+
+/*
+ * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
+ * even temporarily, because of RCU lookups. sk_node should also be left as is.
+ * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
+ */
+static void sock_copy(struct sock *nsk, const struct sock *osk)
+{
+#ifdef CONFIG_SECURITY_NETWORK
+ void *sptr = nsk->sk_security;
+#endif
+ memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
+
+ memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
+ osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
+
+#ifdef CONFIG_SECURITY_NETWORK
+ nsk->sk_security = sptr;
+ security_sk_clone(osk, nsk);
+#endif
+}
+
+static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
+ int family)
+{
+ struct sock *sk;
+ struct kmem_cache *slab;
+
+ slab = prot->slab;
+ if (slab != NULL) {
+ sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
+ if (!sk)
+ return sk;
+ if (priority & __GFP_ZERO)
+ sk_prot_clear_nulls(sk, prot->obj_size);
+ } else
+ sk = kmalloc(prot->obj_size, priority);
+
+ if (sk != NULL) {
+ if (security_sk_alloc(sk, family, priority))
+ goto out_free;
+
+ if (!try_module_get(prot->owner))
+ goto out_free_sec;
+ sk_tx_queue_clear(sk);
+ }
+
+ return sk;
+
+out_free_sec:
+ security_sk_free(sk);
+out_free:
+ if (slab != NULL)
+ kmem_cache_free(slab, sk);
+ else
+ kfree(sk);
+ return NULL;
+}
+
+static void sk_prot_free(struct proto *prot, struct sock *sk)
+{
+ struct kmem_cache *slab;
+ struct module *owner;
+
+ owner = prot->owner;
+ slab = prot->slab;
+
+ cgroup_sk_free(&sk->sk_cgrp_data);
+ mem_cgroup_sk_free(sk);
+ security_sk_free(sk);
+ if (slab != NULL)
+ kmem_cache_free(slab, sk);
+ else
+ kfree(sk);
+ module_put(owner);
+}
+
+/**
+ * sk_alloc - All socket objects are allocated here
+ * @net: the applicable net namespace
+ * @family: protocol family
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ * @prot: struct proto associated with this new sock instance
+ * @kern: is this to be a kernel socket?
+ */
+struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
+ struct proto *prot, int kern)
+{
+ struct sock *sk;
+
+ sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
+ if (sk) {
+ sk->sk_family = family;
+ /*
+ * See comment in struct sock definition to understand
+ * why we need sk_prot_creator -acme
+ */
+ sk->sk_prot = sk->sk_prot_creator = prot;
+ sk->sk_kern_sock = kern;
+ sock_lock_init(sk);
+ sk->sk_net_refcnt = kern ? 0 : 1;
+ if (likely(sk->sk_net_refcnt)) {
+ get_net(net);
+ sock_inuse_add(net, 1);
+ }
+
+ sock_net_set(sk, net);
+ refcount_set(&sk->sk_wmem_alloc, 1);
+
+ mem_cgroup_sk_alloc(sk);
+ cgroup_sk_alloc(&sk->sk_cgrp_data);
+ sock_update_classid(&sk->sk_cgrp_data);
+ sock_update_netprioidx(&sk->sk_cgrp_data);
+ sk_tx_queue_clear(sk);
+ }
+
+ return sk;
+}
+EXPORT_SYMBOL(sk_alloc);
+
+/* Sockets having SOCK_RCU_FREE will call this function after one RCU
+ * grace period. This is the case for UDP sockets and TCP listeners.
+ */
+static void __sk_destruct(struct rcu_head *head)
+{
+ struct sock *sk = container_of(head, struct sock, sk_rcu);
+ struct sk_filter *filter;
+
+ if (sk->sk_destruct)
+ sk->sk_destruct(sk);
+
+ filter = rcu_dereference_check(sk->sk_filter,
+ refcount_read(&sk->sk_wmem_alloc) == 0);
+ if (filter) {
+ sk_filter_uncharge(sk, filter);
+ RCU_INIT_POINTER(sk->sk_filter, NULL);
+ }
+
+ sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
+
+ if (atomic_read(&sk->sk_omem_alloc))
+ pr_debug("%s: optmem leakage (%d bytes) detected\n",
+ __func__, atomic_read(&sk->sk_omem_alloc));
+
+ if (sk->sk_frag.page) {
+ put_page(sk->sk_frag.page);
+ sk->sk_frag.page = NULL;
+ }
+
+ /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
+ put_cred(sk->sk_peer_cred);
+ put_pid(sk->sk_peer_pid);
+
+ if (likely(sk->sk_net_refcnt))
+ put_net(sock_net(sk));
+ sk_prot_free(sk->sk_prot_creator, sk);
+}
+
+void sk_destruct(struct sock *sk)
+{
+ bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
+
+ if (rcu_access_pointer(sk->sk_reuseport_cb)) {
+ reuseport_detach_sock(sk);
+ use_call_rcu = true;
+ }
+
+ if (use_call_rcu)
+ call_rcu(&sk->sk_rcu, __sk_destruct);
+ else
+ __sk_destruct(&sk->sk_rcu);
+}
+
+static void __sk_free(struct sock *sk)
+{
+ if (likely(sk->sk_net_refcnt))
+ sock_inuse_add(sock_net(sk), -1);
+
+ if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
+ sock_diag_broadcast_destroy(sk);
+ else
+ sk_destruct(sk);
+}
+
+void sk_free(struct sock *sk)
+{
+ /*
+	 * We subtract one from sk_wmem_alloc so we can tell whether
+	 * some packets are still in a tx queue.
+	 * If the count is not yet zero, sock_wfree() will call __sk_free(sk) later.
+ */
+ if (refcount_dec_and_test(&sk->sk_wmem_alloc))
+ __sk_free(sk);
+}
+EXPORT_SYMBOL(sk_free);
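+
+/* Sketch of the sk_wmem_alloc lifecycle described above (an informal
+ * walk-through, not an extra rule): sk_alloc() starts the counter at 1;
+ * every queued skb adds its truesize in skb_set_owner_w(); sk_free() drops
+ * the initial 1. Whichever of sk_free() or the last sock_wfree() brings the
+ * counter to zero ends up calling __sk_free(), so the sock outlives all of
+ * its in-flight transmit packets.
+ */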
+
+static void sk_init_common(struct sock *sk)
+{
+ skb_queue_head_init(&sk->sk_receive_queue);
+ skb_queue_head_init(&sk->sk_write_queue);
+ skb_queue_head_init(&sk->sk_error_queue);
+
+ rwlock_init(&sk->sk_callback_lock);
+ lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
+ af_rlock_keys + sk->sk_family,
+ af_family_rlock_key_strings[sk->sk_family]);
+ lockdep_set_class_and_name(&sk->sk_write_queue.lock,
+ af_wlock_keys + sk->sk_family,
+ af_family_wlock_key_strings[sk->sk_family]);
+ lockdep_set_class_and_name(&sk->sk_error_queue.lock,
+ af_elock_keys + sk->sk_family,
+ af_family_elock_key_strings[sk->sk_family]);
+ lockdep_set_class_and_name(&sk->sk_callback_lock,
+ af_callback_keys + sk->sk_family,
+ af_family_clock_key_strings[sk->sk_family]);
+}
+
+/**
+ * sk_clone_lock - clone a socket, and lock its clone
+ * @sk: the socket to clone
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ */
+struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
+{
+ struct sock *newsk;
+ bool is_charged = true;
+
+ newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
+ if (newsk != NULL) {
+ struct sk_filter *filter;
+
+ sock_copy(newsk, sk);
+
+ newsk->sk_prot_creator = sk->sk_prot;
+
+ /* SANITY */
+ if (likely(newsk->sk_net_refcnt))
+ get_net(sock_net(newsk));
+ sk_node_init(&newsk->sk_node);
+ sock_lock_init(newsk);
+ bh_lock_sock(newsk);
+ newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
+ newsk->sk_backlog.len = 0;
+
+ atomic_set(&newsk->sk_rmem_alloc, 0);
+ /*
+ * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
+ */
+ refcount_set(&newsk->sk_wmem_alloc, 1);
+ atomic_set(&newsk->sk_omem_alloc, 0);
+ sk_init_common(newsk);
+
+ newsk->sk_dst_cache = NULL;
+ newsk->sk_dst_pending_confirm = 0;
+ newsk->sk_wmem_queued = 0;
+ newsk->sk_forward_alloc = 0;
+ atomic_set(&newsk->sk_drops, 0);
+ newsk->sk_send_head = NULL;
+ newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+ atomic_set(&newsk->sk_zckey, 0);
+
+ sock_reset_flag(newsk, SOCK_DONE);
+
+ /* sk->sk_memcg will be populated at accept() time */
+ newsk->sk_memcg = NULL;
+
+ cgroup_sk_clone(&newsk->sk_cgrp_data);
+
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
+ if (filter != NULL)
+			/* Though it's an empty new sock, the charging may fail
+			 * if sysctl_optmem_max was changed between creation of
+			 * the original socket and cloning.
+ */
+ is_charged = sk_filter_charge(newsk, filter);
+ RCU_INIT_POINTER(newsk->sk_filter, filter);
+ rcu_read_unlock();
+
+ if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
+ /* We need to make sure that we don't uncharge the new
+ * socket if we couldn't charge it in the first place
+ * as otherwise we uncharge the parent's filter.
+ */
+ if (!is_charged)
+ RCU_INIT_POINTER(newsk->sk_filter, NULL);
+ sk_free_unlock_clone(newsk);
+ newsk = NULL;
+ goto out;
+ }
+ RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
+
+ newsk->sk_err = 0;
+ newsk->sk_err_soft = 0;
+ newsk->sk_priority = 0;
+ newsk->sk_incoming_cpu = raw_smp_processor_id();
+ atomic64_set(&newsk->sk_cookie, 0);
+ if (likely(newsk->sk_net_refcnt))
+ sock_inuse_add(sock_net(newsk), 1);
+
+ /*
+ * Before updating sk_refcnt, we must commit prior changes to memory
+ * (Documentation/RCU/rculist_nulls.txt for details)
+ */
+ smp_wmb();
+ refcount_set(&newsk->sk_refcnt, 2);
+
+ /*
+ * Increment the counter in the same struct proto as the master
+ * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
+ * is the same as sk->sk_prot->socks, as this field was copied
+ * with memcpy).
+ *
+ * This _changes_ the previous behaviour, where
+ * tcp_create_openreq_child always was incrementing the
+	 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
+ * to be taken into account in all callers. -acme
+ */
+ sk_refcnt_debug_inc(newsk);
+ sk_set_socket(newsk, NULL);
+ sk_tx_queue_clear(newsk);
+ newsk->sk_wq = NULL;
+
+ if (newsk->sk_prot->sockets_allocated)
+ sk_sockets_allocated_inc(newsk);
+
+ if (sock_needs_netstamp(sk) &&
+ newsk->sk_flags & SK_FLAGS_TIMESTAMP)
+ net_enable_timestamp();
+ }
+out:
+ return newsk;
+}
+EXPORT_SYMBOL_GPL(sk_clone_lock);
+
+void sk_free_unlock_clone(struct sock *sk)
+{
+	/* It is still a raw copy of the parent, so invalidate
+	 * the destructor and do a plain sk_free() */
+ sk->sk_destruct = NULL;
+ bh_unlock_sock(sk);
+ sk_free(sk);
+}
+EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
+
+void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
+{
+ u32 max_segs = 1;
+
+ sk_dst_set(sk, dst);
+ sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
+ if (sk->sk_route_caps & NETIF_F_GSO)
+ sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
+ sk->sk_route_caps &= ~sk->sk_route_nocaps;
+ if (sk_can_gso(sk)) {
+ if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
+ sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+ } else {
+ sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
+ sk->sk_gso_max_size = dst->dev->gso_max_size;
+ max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
+ }
+ }
+ sk->sk_gso_max_segs = max_segs;
+}
+EXPORT_SYMBOL_GPL(sk_setup_caps);
+
+/*
+ * Simple resource managers for sockets.
+ */
+
+
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ */
+void sock_wfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ unsigned int len = skb->truesize;
+
+ if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
+ /*
+		 * Keep a reference on sk_wmem_alloc; it will be released
+		 * after the sk_write_space() call.
+ */
+ WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
+ sk->sk_write_space(sk);
+ len = 1;
+ }
+ /*
+ * if sk_wmem_alloc reaches 0, we must finish what sk_free()
+ * could not do because of in-flight packets
+ */
+ if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
+ __sk_free(sk);
+}
+EXPORT_SYMBOL(sock_wfree);
+
+/* This variant of sock_wfree() is used by TCP,
+ * since it sets SOCK_USE_WRITE_QUEUE.
+ */
+void __sock_wfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+ if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
+ __sk_free(sk);
+}
+
+void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
+{
+ skb_orphan(skb);
+ skb->sk = sk;
+#ifdef CONFIG_INET
+ if (unlikely(!sk_fullsock(sk))) {
+ skb->destructor = sock_edemux;
+ sock_hold(sk);
+ return;
+ }
+#endif
+ skb->destructor = sock_wfree;
+ skb_set_hash_from_sk(skb, sk);
+ /*
+	 * We used to take a refcount on sk, but the following operation
+	 * is enough to guarantee that sk_free() won't free this sock until
+	 * all in-flight packets are completed.
+ */
+ refcount_add(skb->truesize, &sk->sk_wmem_alloc);
+}
+EXPORT_SYMBOL(skb_set_owner_w);
+
+/* This helper is used by netem, as it can hold packets in its
+ * delay queue. We want to allow the owner socket to send more
+ * packets, as if they were already TX completed by a typical driver.
+ * But we also want to keep skb->sk set because some packet schedulers
+ * rely on it (sch_fq for example).
+ */
+void skb_orphan_partial(struct sk_buff *skb)
+{
+ if (skb_is_tcp_pure_ack(skb))
+ return;
+
+ if (skb->destructor == sock_wfree
+#ifdef CONFIG_INET
+ || skb->destructor == tcp_wfree
+#endif
+ ) {
+ struct sock *sk = skb->sk;
+
+ if (refcount_inc_not_zero(&sk->sk_refcnt)) {
+ WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
+ skb->destructor = sock_efree;
+ }
+ } else {
+ skb_orphan(skb);
+ }
+}
+EXPORT_SYMBOL(skb_orphan_partial);
+
+/*
+ * Read buffer destructor automatically called from kfree_skb.
+ */
+void sock_rfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ unsigned int len = skb->truesize;
+
+ atomic_sub(len, &sk->sk_rmem_alloc);
+ sk_mem_uncharge(sk, len);
+}
+EXPORT_SYMBOL(sock_rfree);
+
+/*
+ * Buffer destructor for skbs that are not used directly in read or write
+ * path, e.g. for error handler skbs. Automatically called from kfree_skb.
+ */
+void sock_efree(struct sk_buff *skb)
+{
+ sock_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_efree);
+
+kuid_t sock_i_uid(struct sock *sk)
+{
+ kuid_t uid;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
+ read_unlock_bh(&sk->sk_callback_lock);
+ return uid;
+}
+EXPORT_SYMBOL(sock_i_uid);
+
+unsigned long sock_i_ino(struct sock *sk)
+{
+ unsigned long ino;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
+ read_unlock_bh(&sk->sk_callback_lock);
+ return ino;
+}
+EXPORT_SYMBOL(sock_i_ino);
+
+/*
+ * Allocate a skb from the socket's send buffer.
+ */
+struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
+ gfp_t priority)
+{
+ if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
+ struct sk_buff *skb = alloc_skb(size, priority);
+ if (skb) {
+ skb_set_owner_w(skb, sk);
+ return skb;
+ }
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(sock_wmalloc);
+
+static void sock_ofree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+ atomic_sub(skb->truesize, &sk->sk_omem_alloc);
+}
+
+struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
+ gfp_t priority)
+{
+ struct sk_buff *skb;
+
+ /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
+ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
+ sysctl_optmem_max)
+ return NULL;
+
+ skb = alloc_skb(size, priority);
+ if (!skb)
+ return NULL;
+
+ atomic_add(skb->truesize, &sk->sk_omem_alloc);
+ skb->sk = sk;
+ skb->destructor = sock_ofree;
+ return skb;
+}
+
+/*
+ * Allocate a memory block from the socket's option memory buffer.
+ */
+void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
+{
+ if ((unsigned int)size <= sysctl_optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
+ void *mem;
+ /* First do the add, to avoid the race if kmalloc
+ * might sleep.
+ */
+ atomic_add(size, &sk->sk_omem_alloc);
+ mem = kmalloc(size, priority);
+ if (mem)
+ return mem;
+ atomic_sub(size, &sk->sk_omem_alloc);
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(sock_kmalloc);
+
+/* Free an option memory block. Note, we actually want the inline
+ * here as this allows gcc to detect the nullify and fold away the
+ * condition entirely.
+ */
+static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
+ const bool nullify)
+{
+ if (WARN_ON_ONCE(!mem))
+ return;
+ if (nullify)
+ kzfree(mem);
+ else
+ kfree(mem);
+ atomic_sub(size, &sk->sk_omem_alloc);
+}
+
+void sock_kfree_s(struct sock *sk, void *mem, int size)
+{
+ __sock_kfree_s(sk, mem, size, false);
+}
+EXPORT_SYMBOL(sock_kfree_s);
+
+void sock_kzfree_s(struct sock *sk, void *mem, int size)
+{
+ __sock_kfree_s(sk, mem, size, true);
+}
+EXPORT_SYMBOL(sock_kzfree_s);
+
+/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
+   I think these locks should be removed for datagram sockets.
+ */
+static long sock_wait_for_wmem(struct sock *sk, long timeo)
+{
+ DEFINE_WAIT(wait);
+
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+ for (;;) {
+ if (!timeo)
+ break;
+ if (signal_pending(current))
+ break;
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
+ break;
+ if (sk->sk_shutdown & SEND_SHUTDOWN)
+ break;
+ if (sk->sk_err)
+ break;
+ timeo = schedule_timeout(timeo);
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ return timeo;
+}
+
+
+/*
+ * Generic send/receive buffer handlers
+ */
+
+struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
+ unsigned long data_len, int noblock,
+ int *errcode, int max_page_order)
+{
+ struct sk_buff *skb;
+ long timeo;
+ int err;
+
+ timeo = sock_sndtimeo(sk, noblock);
+ for (;;) {
+ err = sock_error(sk);
+ if (err != 0)
+ goto failure;
+
+ err = -EPIPE;
+ if (sk->sk_shutdown & SEND_SHUTDOWN)
+ goto failure;
+
+ if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
+ break;
+
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ err = -EAGAIN;
+ if (!timeo)
+ goto failure;
+ if (signal_pending(current))
+ goto interrupted;
+ timeo = sock_wait_for_wmem(sk, timeo);
+ }
+ skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
+ errcode, sk->sk_allocation);
+ if (skb)
+ skb_set_owner_w(skb, sk);
+ return skb;
+
+interrupted:
+ err = sock_intr_errno(timeo);
+failure:
+ *errcode = err;
+ return NULL;
+}
+EXPORT_SYMBOL(sock_alloc_send_pskb);
+
+struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
+ int noblock, int *errcode)
+{
+ return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
+}
+EXPORT_SYMBOL(sock_alloc_send_skb);
+
+int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
+ struct sockcm_cookie *sockc)
+{
+ u32 tsflags;
+
+ switch (cmsg->cmsg_type) {
+ case SO_MARK:
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ sockc->mark = *(u32 *)CMSG_DATA(cmsg);
+ break;
+ case SO_TIMESTAMPING:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+
+ tsflags = *(u32 *)CMSG_DATA(cmsg);
+ if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
+ return -EINVAL;
+
+ sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
+ sockc->tsflags |= tsflags;
+ break;
+ case SCM_TXTIME:
+ if (!sock_flag(sk, SOCK_TXTIME))
+ return -EINVAL;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
+ return -EINVAL;
+ sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
+ break;
+ /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
+ case SCM_RIGHTS:
+ case SCM_CREDENTIALS:
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(__sock_cmsg_send);
+
+int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
+ struct sockcm_cookie *sockc)
+{
+ struct cmsghdr *cmsg;
+ int ret;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+ if (cmsg->cmsg_level != SOL_SOCKET)
+ continue;
+ ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(sock_cmsg_send);
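+
+/* Illustrative user-space counterpart (not part of this file, sketch only):
+ * a privileged sender could pass a per-packet mark as SOL_SOCKET/SO_MARK
+ * ancillary data, which the loop above routes into __sock_cmsg_send():
+ *
+ *	char buf[CMSG_SPACE(sizeof(uint32_t))];
+ *	struct msghdr msg = { .msg_control = buf, .msg_controllen = sizeof(buf) };
+ *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ *	cmsg->cmsg_level = SOL_SOCKET;
+ *	cmsg->cmsg_type = SO_MARK;
+ *	cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t));
+ *	*(uint32_t *)CMSG_DATA(cmsg) = 42;	// requires CAP_NET_ADMIN
+ */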
+
+static void sk_enter_memory_pressure(struct sock *sk)
+{
+ if (!sk->sk_prot->enter_memory_pressure)
+ return;
+
+ sk->sk_prot->enter_memory_pressure(sk);
+}
+
+static void sk_leave_memory_pressure(struct sock *sk)
+{
+ if (sk->sk_prot->leave_memory_pressure) {
+ sk->sk_prot->leave_memory_pressure(sk);
+ } else {
+ unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
+
+ if (memory_pressure && READ_ONCE(*memory_pressure))
+ WRITE_ONCE(*memory_pressure, 0);
+ }
+}
+
+/**
+ * skb_page_frag_refill - check that a page_frag contains enough room
+ * @sz: minimum size of the fragment we want to get
+ * @pfrag: pointer to page_frag
+ * @gfp: priority for memory allocation
+ *
+ * Note: While this allocator tries to use high order pages, there is
+ * no guarantee that allocations succeed. Therefore, @sz MUST be
+ * less than or equal to PAGE_SIZE.
+ */
+bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
+{
+ if (pfrag->page) {
+ if (page_ref_count(pfrag->page) == 1) {
+ pfrag->offset = 0;
+ return true;
+ }
+ if (pfrag->offset + sz <= pfrag->size)
+ return true;
+ put_page(pfrag->page);
+ }
+
+ pfrag->offset = 0;
+ if (SKB_FRAG_PAGE_ORDER) {
+ /* Avoid direct reclaim but allow kswapd to wake */
+ pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
+ __GFP_COMP | __GFP_NOWARN |
+ __GFP_NORETRY,
+ SKB_FRAG_PAGE_ORDER);
+ if (likely(pfrag->page)) {
+ pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
+ return true;
+ }
+ }
+ pfrag->page = alloc_page(gfp);
+ if (likely(pfrag->page)) {
+ pfrag->size = PAGE_SIZE;
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL(skb_page_frag_refill);
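+
+/* Minimal caller sketch under the constraint documented above (the copy size
+ * must not exceed PAGE_SIZE); "copy" and "from" are hypothetical locals of
+ * the caller, not names used elsewhere in this file:
+ *
+ *	if (!skb_page_frag_refill(copy, pfrag, sk->sk_allocation))
+ *		goto wait_for_memory;
+ *	memcpy(page_address(pfrag->page) + pfrag->offset, from, copy);
+ *	pfrag->offset += copy;
+ */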
+
+bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
+{
+ if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
+ return true;
+
+ sk_enter_memory_pressure(sk);
+ sk_stream_moderate_sndbuf(sk);
+ return false;
+}
+EXPORT_SYMBOL(sk_page_frag_refill);
+
+int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
+ int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
+ int first_coalesce)
+{
+ int sg_curr = *sg_curr_index, use = 0, rc = 0;
+ unsigned int size = *sg_curr_size;
+ struct page_frag *pfrag;
+ struct scatterlist *sge;
+
+ len -= size;
+ pfrag = sk_page_frag(sk);
+
+ while (len > 0) {
+ unsigned int orig_offset;
+
+ if (!sk_page_frag_refill(sk, pfrag)) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ use = min_t(int, len, pfrag->size - pfrag->offset);
+
+ if (!sk_wmem_schedule(sk, use)) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ sk_mem_charge(sk, use);
+ size += use;
+ orig_offset = pfrag->offset;
+ pfrag->offset += use;
+
+ sge = sg + sg_curr - 1;
+ if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
+ sge->offset + sge->length == orig_offset) {
+ sge->length += use;
+ } else {
+ sge = sg + sg_curr;
+ sg_unmark_end(sge);
+ sg_set_page(sge, pfrag->page, use, orig_offset);
+ get_page(pfrag->page);
+ sg_curr++;
+
+ if (sg_curr == MAX_SKB_FRAGS)
+ sg_curr = 0;
+
+ if (sg_curr == sg_start) {
+ rc = -ENOSPC;
+ break;
+ }
+ }
+
+ len -= use;
+ }
+out:
+ *sg_curr_size = size;
+ *sg_curr_index = sg_curr;
+ return rc;
+}
+EXPORT_SYMBOL(sk_alloc_sg);
+
+static void __lock_sock(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
+ __acquires(&sk->sk_lock.slock)
+{
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock_bh(&sk->sk_lock.slock);
+ schedule();
+ spin_lock_bh(&sk->sk_lock.slock);
+ if (!sock_owned_by_user(sk))
+ break;
+ }
+ finish_wait(&sk->sk_lock.wq, &wait);
+}
+
+void __release_sock(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
+ __acquires(&sk->sk_lock.slock)
+{
+ struct sk_buff *skb, *next;
+
+ while ((skb = sk->sk_backlog.head) != NULL) {
+ sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
+
+ spin_unlock_bh(&sk->sk_lock.slock);
+
+ do {
+ next = skb->next;
+ prefetch(next);
+ WARN_ON_ONCE(skb_dst_is_noref(skb));
+ skb->next = NULL;
+ sk_backlog_rcv(sk, skb);
+
+ cond_resched();
+
+ skb = next;
+ } while (skb != NULL);
+
+ spin_lock_bh(&sk->sk_lock.slock);
+ }
+
+ /*
+	 * Doing the zeroing here guarantees we cannot loop forever
+ * while a wild producer attempts to flood us.
+ */
+ sk->sk_backlog.len = 0;
+}
+
+void __sk_flush_backlog(struct sock *sk)
+{
+ spin_lock_bh(&sk->sk_lock.slock);
+ __release_sock(sk);
+ spin_unlock_bh(&sk->sk_lock.slock);
+}
+
+/**
+ * sk_wait_data - wait for data to arrive at sk_receive_queue
+ * @sk: sock to wait on
+ * @timeo: for how long
+ * @skb: last skb seen on sk_receive_queue
+ *
+ * Now socket state including sk->sk_err is changed only under lock,
+ * hence we may omit checks after joining wait queue.
+ * We check the receive queue before schedule() only as an optimization;
+ * it is very likely that release_sock() added new data.
+ */
+int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ int rc;
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return rc;
+}
+EXPORT_SYMBOL(sk_wait_data);
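+
+/* Informal caller sketch (assuming a datagram-style recvmsg implementation;
+ * error, shutdown and signal handling omitted, locals hypothetical): the
+ * protocol holds the socket lock, notices an empty receive queue and parks
+ * here until data or a timeout arrives:
+ *
+ *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
+ *		if (!timeo)
+ *			return -EAGAIN;
+ *		sk_wait_data(sk, &timeo, NULL);
+ *	}
+ */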
+
+/**
+ * __sk_mem_raise_allocated - increase memory_allocated
+ * @sk: socket
+ * @size: memory size to allocate
+ * @amt: pages to allocate
+ * @kind: allocation type
+ *
+ * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
+ */
+int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
+{
+ struct proto *prot = sk->sk_prot;
+ long allocated = sk_memory_allocated_add(sk, amt);
+ bool charged = true;
+
+ if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
+ !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
+ goto suppress_allocation;
+
+ /* Under limit. */
+ if (allocated <= sk_prot_mem_limits(sk, 0)) {
+ sk_leave_memory_pressure(sk);
+ return 1;
+ }
+
+ /* Under pressure. */
+ if (allocated > sk_prot_mem_limits(sk, 1))
+ sk_enter_memory_pressure(sk);
+
+ /* Over hard limit. */
+ if (allocated > sk_prot_mem_limits(sk, 2))
+ goto suppress_allocation;
+
+ /* guarantee minimum buffer size under pressure */
+ if (kind == SK_MEM_RECV) {
+ if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
+ return 1;
+
+ } else { /* SK_MEM_SEND */
+ int wmem0 = sk_get_wmem0(sk, prot);
+
+ if (sk->sk_type == SOCK_STREAM) {
+ if (sk->sk_wmem_queued < wmem0)
+ return 1;
+ } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
+ return 1;
+ }
+ }
+
+ if (sk_has_memory_pressure(sk)) {
+ u64 alloc;
+
+ if (!sk_under_memory_pressure(sk))
+ return 1;
+ alloc = sk_sockets_allocated_read_positive(sk);
+ if (sk_prot_mem_limits(sk, 2) > alloc *
+ sk_mem_pages(sk->sk_wmem_queued +
+ atomic_read(&sk->sk_rmem_alloc) +
+ sk->sk_forward_alloc))
+ return 1;
+ }
+
+suppress_allocation:
+
+ if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
+ sk_stream_moderate_sndbuf(sk);
+
+ /* Fail only if socket is _under_ its sndbuf.
+		 * In this case we cannot block, so we have to fail.
+ */
+ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
+ return 1;
+ }
+
+ if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
+ trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
+
+ sk_memory_allocated_sub(sk, amt);
+
+ if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+ mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
+
+ return 0;
+}
+EXPORT_SYMBOL(__sk_mem_raise_allocated);
+
+/**
+ * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ * @sk: socket
+ * @size: memory size to allocate
+ * @kind: allocation type
+ *
+ * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
+ * rmem allocation. This function assumes that protocols which have
+ * memory_pressure use sk_wmem_queued as write buffer accounting.
+ */
+int __sk_mem_schedule(struct sock *sk, int size, int kind)
+{
+ int ret, amt = sk_mem_pages(size);
+
+ sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
+ ret = __sk_mem_raise_allocated(sk, size, amt, kind);
+ if (!ret)
+ sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
+ return ret;
+}
+EXPORT_SYMBOL(__sk_mem_schedule);
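+
+/* Worked example for the accounting above (assuming SK_MEM_QUANTUM is one
+ * 4 KiB page): __sk_mem_schedule(sk, 3000, SK_MEM_SEND) computes
+ * amt = sk_mem_pages(3000) = 1, optimistically bumps sk_forward_alloc by
+ * 1 << SK_MEM_QUANTUM_SHIFT = 4096 bytes, and only rolls that back if
+ * __sk_mem_raise_allocated() refuses the extra quantum.
+ */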
+
+/**
+ * __sk_mem_reduce_allocated - reclaim memory_allocated
+ * @sk: socket
+ * @amount: number of quanta
+ *
+ * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
+ */
+void __sk_mem_reduce_allocated(struct sock *sk, int amount)
+{
+ sk_memory_allocated_sub(sk, amount);
+
+ if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+ mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
+
+ if (sk_under_memory_pressure(sk) &&
+ (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
+ sk_leave_memory_pressure(sk);
+}
+EXPORT_SYMBOL(__sk_mem_reduce_allocated);
+
+/**
+ * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
+ * @sk: socket
+ * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
+ */
+void __sk_mem_reclaim(struct sock *sk, int amount)
+{
+ amount >>= SK_MEM_QUANTUM_SHIFT;
+ sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+ __sk_mem_reduce_allocated(sk, amount);
+}
+EXPORT_SYMBOL(__sk_mem_reclaim);
+
+int sk_set_peek_off(struct sock *sk, int val)
+{
+ sk->sk_peek_off = val;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sk_set_peek_off);
+
+/*
+ * Set of default routines for initialising struct proto_ops when
+ * the protocol does not support a particular function. In certain
+ * cases where it makes no sense for a protocol to have a "do nothing"
+ * function, some default processing is provided.
+ */
+
+int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_bind);
+
+int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
+ int len, int flags)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_connect);
+
+int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_socketpair);
+
+int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
+ bool kern)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_accept);
+
+int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
+ int peer)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_getname);
+
+int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_ioctl);
+
+int sock_no_listen(struct socket *sock, int backlog)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_listen);
+
+int sock_no_shutdown(struct socket *sock, int how)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_shutdown);
+
+int sock_no_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_setsockopt);
+
+int sock_no_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_getsockopt);
+
+int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_sendmsg);
+
+int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_sendmsg_locked);
+
+int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
+ int flags)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_recvmsg);
+
+int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
+{
+ /* Mirror missing mmap method error code */
+ return -ENODEV;
+}
+EXPORT_SYMBOL(sock_no_mmap);
+
+/*
+ * When a file is received (via SCM_RIGHTS, etc), we must bump the
+ * various sock-based usage counts.
+ */
+void __receive_sock(struct file *file)
+{
+ struct socket *sock;
+ int error;
+
+ /*
+ * The resulting value of "error" is ignored here since we only
+ * need to take action when the file is a socket and testing
+ * "sock" for NULL is sufficient.
+ */
+ sock = sock_from_file(file, &error);
+ if (sock) {
+ sock_update_netprioidx(&sock->sk->sk_cgrp_data);
+ sock_update_classid(&sock->sk->sk_cgrp_data);
+ }
+}
+
+ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
+{
+ ssize_t res;
+ struct msghdr msg = {.msg_flags = flags};
+ struct kvec iov;
+ char *kaddr = kmap(page);
+ iov.iov_base = kaddr + offset;
+ iov.iov_len = size;
+ res = kernel_sendmsg(sock, &msg, &iov, 1, size);
+ kunmap(page);
+ return res;
+}
+EXPORT_SYMBOL(sock_no_sendpage);
+
+ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
+ int offset, size_t size, int flags)
+{
+ ssize_t res;
+ struct msghdr msg = {.msg_flags = flags};
+ struct kvec iov;
+ char *kaddr = kmap(page);
+
+ iov.iov_base = kaddr + offset;
+ iov.iov_len = size;
+ res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
+ kunmap(page);
+ return res;
+}
+EXPORT_SYMBOL(sock_no_sendpage_locked);
+
+/*
+ * Default Socket Callbacks
+ */
+
+static void sock_def_wakeup(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_all(&wq->wait);
+ rcu_read_unlock();
+}
+
+static void sock_def_error_report(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_poll(&wq->wait, EPOLLERR);
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
+ rcu_read_unlock();
+}
+
+static void sock_def_readable(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
+ EPOLLRDNORM | EPOLLRDBAND);
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ rcu_read_unlock();
+}
+
+static void sock_def_write_space(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+
+ /* Do not wake up a writer until he can make "significant"
+ * progress. --DaveM
+ */
+ if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
+ EPOLLWRNORM | EPOLLWRBAND);
+
+ /* Should agree with poll, otherwise some programs break */
+ if (sock_writeable(sk))
+ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ }
+
+ rcu_read_unlock();
+}
+
+static void sock_def_destruct(struct sock *sk)
+{
+}
+
+void sk_send_sigurg(struct sock *sk)
+{
+ if (sk->sk_socket && sk->sk_socket->file)
+ if (send_sigurg(&sk->sk_socket->file->f_owner))
+ sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
+}
+EXPORT_SYMBOL(sk_send_sigurg);
+
+void sk_reset_timer(struct sock *sk, struct timer_list* timer,
+ unsigned long expires)
+{
+ if (!mod_timer(timer, expires))
+ sock_hold(sk);
+}
+EXPORT_SYMBOL(sk_reset_timer);
+
+void sk_stop_timer(struct sock *sk, struct timer_list* timer)
+{
+ if (del_timer(timer))
+ __sock_put(sk);
+}
+EXPORT_SYMBOL(sk_stop_timer);
+
+void sock_init_data(struct socket *sock, struct sock *sk)
+{
+ sk_init_common(sk);
+ sk->sk_send_head = NULL;
+
+ timer_setup(&sk->sk_timer, NULL, 0);
+
+ sk->sk_allocation = GFP_KERNEL;
+ sk->sk_rcvbuf = sysctl_rmem_default;
+ sk->sk_sndbuf = sysctl_wmem_default;
+ sk->sk_state = TCP_CLOSE;
+ sk_set_socket(sk, sock);
+
+ sock_set_flag(sk, SOCK_ZAPPED);
+
+ if (sock) {
+ sk->sk_type = sock->type;
+ sk->sk_wq = sock->wq;
+ sock->sk = sk;
+ sk->sk_uid = SOCK_INODE(sock)->i_uid;
+ } else {
+ sk->sk_wq = NULL;
+ sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
+ }
+
+ rwlock_init(&sk->sk_callback_lock);
+ if (sk->sk_kern_sock)
+ lockdep_set_class_and_name(
+ &sk->sk_callback_lock,
+ af_kern_callback_keys + sk->sk_family,
+ af_family_kern_clock_key_strings[sk->sk_family]);
+ else
+ lockdep_set_class_and_name(
+ &sk->sk_callback_lock,
+ af_callback_keys + sk->sk_family,
+ af_family_clock_key_strings[sk->sk_family]);
+
+ sk->sk_state_change = sock_def_wakeup;
+ sk->sk_data_ready = sock_def_readable;
+ sk->sk_write_space = sock_def_write_space;
+ sk->sk_error_report = sock_def_error_report;
+ sk->sk_destruct = sock_def_destruct;
+
+ sk->sk_frag.page = NULL;
+ sk->sk_frag.offset = 0;
+ sk->sk_peek_off = -1;
+
+ sk->sk_peer_pid = NULL;
+ sk->sk_peer_cred = NULL;
+ spin_lock_init(&sk->sk_peer_lock);
+
+ sk->sk_write_pending = 0;
+ sk->sk_rcvlowat = 1;
+ sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+ sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+
+ sk->sk_stamp = SK_DEFAULT_STAMP;
+#if BITS_PER_LONG==32
+ seqlock_init(&sk->sk_stamp_seq);
+#endif
+ atomic_set(&sk->sk_zckey, 0);
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ sk->sk_napi_id = 0;
+ sk->sk_ll_usec = sysctl_net_busy_read;
+#endif
+
+ sk->sk_max_pacing_rate = ~0U;
+ sk->sk_pacing_rate = ~0U;
+ sk->sk_pacing_shift = 10;
+ sk->sk_incoming_cpu = -1;
+
+ sk_rx_queue_clear(sk);
+ /*
+ * Before updating sk_refcnt, we must commit prior changes to memory
+ * (Documentation/RCU/rculist_nulls.txt for details)
+ */
+ smp_wmb();
+ refcount_set(&sk->sk_refcnt, 1);
+ atomic_set(&sk->sk_drops, 0);
+}
+EXPORT_SYMBOL(sock_init_data);
+
+void lock_sock_nested(struct sock *sk, int subclass)
+{
+ might_sleep();
+ spin_lock_bh(&sk->sk_lock.slock);
+ if (sk->sk_lock.owned)
+ __lock_sock(sk);
+ sk->sk_lock.owned = 1;
+ spin_unlock(&sk->sk_lock.slock);
+ /*
+ * The sk_lock has mutex_lock() semantics here:
+ */
+ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
+ local_bh_enable();
+}
+EXPORT_SYMBOL(lock_sock_nested);
+
+void release_sock(struct sock *sk)
+{
+ spin_lock_bh(&sk->sk_lock.slock);
+ if (sk->sk_backlog.tail)
+ __release_sock(sk);
+
+	/* Warning: release_cb() might need to release sk ownership,
+	 * i.e. call sock_release_ownership(sk) before us.
+	 */
+ if (sk->sk_prot->release_cb)
+ sk->sk_prot->release_cb(sk);
+
+ sock_release_ownership(sk);
+ if (waitqueue_active(&sk->sk_lock.wq))
+ wake_up(&sk->sk_lock.wq);
+ spin_unlock_bh(&sk->sk_lock.slock);
+}
+EXPORT_SYMBOL(release_sock);
+
+/**
+ * lock_sock_fast - fast version of lock_sock
+ * @sk: socket
+ *
+ * This version should be used for very small sections, where the process
+ * won't block.
+ *
+ * Returns false if the fast path is taken:
+ *
+ *   sk_lock.slock locked, owned = 0, BH disabled
+ *
+ * Returns true if the slow path is taken:
+ *
+ * sk_lock.slock unlocked, owned = 1, BH enabled
+ */
+bool lock_sock_fast(struct sock *sk)
+{
+ might_sleep();
+ spin_lock_bh(&sk->sk_lock.slock);
+
+ if (!sk->sk_lock.owned)
+		/*
+		 * Fast path: return with BH still disabled and slock held;
+		 * the caller must pair this with unlock_sock_fast().
+		 */
+ return false;
+
+ __lock_sock(sk);
+ sk->sk_lock.owned = 1;
+ spin_unlock(&sk->sk_lock.slock);
+ /*
+ * The sk_lock has mutex_lock() semantics here:
+ */
+ mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
+ local_bh_enable();
+ return true;
+}
+EXPORT_SYMBOL(lock_sock_fast);
+
+int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
+{
+ struct timeval tv;
+
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+ tv = ktime_to_timeval(sock_read_timestamp(sk));
+ if (tv.tv_sec == -1)
+ return -ENOENT;
+ if (tv.tv_sec == 0) {
+ ktime_t kt = ktime_get_real();
+ sock_write_timestamp(sk, kt);
+ tv = ktime_to_timeval(kt);
+ }
+ return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
+}
+EXPORT_SYMBOL(sock_get_timestamp);
+
+int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
+{
+ struct timespec ts;
+
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+ ts = ktime_to_timespec(sock_read_timestamp(sk));
+ if (ts.tv_sec == -1)
+ return -ENOENT;
+ if (ts.tv_sec == 0) {
+ ktime_t kt = ktime_get_real();
+ sock_write_timestamp(sk, kt);
+ ts = ktime_to_timespec(sk->sk_stamp);
+ }
+ return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
+}
+EXPORT_SYMBOL(sock_get_timestampns);
+
+void sock_enable_timestamp(struct sock *sk, int flag)
+{
+ if (!sock_flag(sk, flag)) {
+ unsigned long previous_flags = sk->sk_flags;
+
+ sock_set_flag(sk, flag);
+ /*
+ * we just set one of the two flags which require net
+ * time stamping, but time stamping might have been on
+ * already because of the other one
+ */
+ if (sock_needs_netstamp(sk) &&
+ !(previous_flags & SK_FLAGS_TIMESTAMP))
+ net_enable_timestamp();
+ }
+}
+
+int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
+ int level, int type)
+{
+ struct sock_exterr_skb *serr;
+ struct sk_buff *skb;
+ int copied, err;
+
+ err = -EAGAIN;
+ skb = sock_dequeue_err_skb(sk);
+ if (skb == NULL)
+ goto out;
+
+ copied = skb->len;
+ if (copied > len) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto out_free_skb;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ serr = SKB_EXT_ERR(skb);
+ put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
+
+ msg->msg_flags |= MSG_ERRQUEUE;
+ err = copied;
+
+out_free_skb:
+ kfree_skb(skb);
+out:
+ return err;
+}
+EXPORT_SYMBOL(sock_recv_errqueue);
+
+/*
+ *	Get a socket option on a socket.
+ *
+ *	FIX: POSIX 1003.1g is very ambiguous here. It states that
+ *	asynchronous errors should be reported by getsockopt. We assume
+ *	this means if you specify SO_ERROR (otherwise what's the point of it).
+ */
+int sock_common_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+
+ return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(sock_common_getsockopt);
+
+#ifdef CONFIG_COMPAT
+int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk->sk_prot->compat_getsockopt != NULL)
+ return sk->sk_prot->compat_getsockopt(sk, level, optname,
+ optval, optlen);
+ return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(compat_sock_common_getsockopt);
+#endif
+
+int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ int addr_len = 0;
+ int err;
+
+ err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
+ flags & ~MSG_DONTWAIT, &addr_len);
+ if (err >= 0)
+ msg->msg_namelen = addr_len;
+ return err;
+}
+EXPORT_SYMBOL(sock_common_recvmsg);
+
+/*
+ * Set socket options on an inet socket.
+ */
+int sock_common_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+
+ return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(sock_common_setsockopt);
+
+#ifdef CONFIG_COMPAT
+int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk->sk_prot->compat_setsockopt != NULL)
+ return sk->sk_prot->compat_setsockopt(sk, level, optname,
+ optval, optlen);
+ return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(compat_sock_common_setsockopt);
+#endif
+
+void sk_common_release(struct sock *sk)
+{
+ if (sk->sk_prot->destroy)
+ sk->sk_prot->destroy(sk);
+
+ /*
+	 * Observation: when sk_common_release() is called, processes have
+	 * no access to the socket, but the network stack still does.
+	 * Step one, detach it from networking:
+ *
+ * A. Remove from hash tables.
+ */
+
+ sk->sk_prot->unhash(sk);
+
+ /*
+	 * At this point the socket cannot receive new packets, but it is
+	 * possible that some packets are still in flight because another
+	 * CPU ran the receiver and did the hash table lookup before we
+	 * unhashed the socket. Those packets will reach the receive queue
+	 * and be purged by the socket destructor.
+	 *
+	 * We may also still have packets pending on the receive queue and,
+	 * probably, our own packets waiting in device queues. sock_destroy
+	 * will drain the receive queue, but transmitted packets will delay
+	 * socket destruction until the last reference is released.
+ */
+
+ sock_orphan(sk);
+
+ xfrm_sk_free_policy(sk);
+
+ sk_refcnt_debug_release(sk);
+
+ sock_put(sk);
+}
+EXPORT_SYMBOL(sk_common_release);
+
+void sk_get_meminfo(const struct sock *sk, u32 *mem)
+{
+ memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
+
+ mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
+ mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
+ mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
+ mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
+ mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
+ mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
+ mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
+ mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
+ mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
+}
+
+#ifdef CONFIG_PROC_FS
+#define PROTO_INUSE_NR 64 /* should be enough for the first time */
+struct prot_inuse {
+ int val[PROTO_INUSE_NR];
+};
+
+static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
+
+void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
+{
+ __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
+
+int sock_prot_inuse_get(struct net *net, struct proto *prot)
+{
+ int cpu, idx = prot->inuse_idx;
+ int res = 0;
+
+ for_each_possible_cpu(cpu)
+ res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
+
+ return res >= 0 ? res : 0;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
+
+static void sock_inuse_add(struct net *net, int val)
+{
+ this_cpu_add(*net->core.sock_inuse, val);
+}
+
+int sock_inuse_get(struct net *net)
+{
+ int cpu, res = 0;
+
+ for_each_possible_cpu(cpu)
+ res += *per_cpu_ptr(net->core.sock_inuse, cpu);
+
+ return res;
+}
+EXPORT_SYMBOL_GPL(sock_inuse_get);
+
+static int __net_init sock_inuse_init_net(struct net *net)
+{
+ net->core.prot_inuse = alloc_percpu(struct prot_inuse);
+ if (net->core.prot_inuse == NULL)
+ return -ENOMEM;
+
+ net->core.sock_inuse = alloc_percpu(int);
+ if (net->core.sock_inuse == NULL)
+ goto out;
+
+ return 0;
+
+out:
+ free_percpu(net->core.prot_inuse);
+ return -ENOMEM;
+}
+
+static void __net_exit sock_inuse_exit_net(struct net *net)
+{
+ free_percpu(net->core.prot_inuse);
+ free_percpu(net->core.sock_inuse);
+}
+
+static struct pernet_operations net_inuse_ops = {
+ .init = sock_inuse_init_net,
+ .exit = sock_inuse_exit_net,
+};
+
+static __init int net_inuse_init(void)
+{
+ if (register_pernet_subsys(&net_inuse_ops))
+ panic("Cannot initialize net inuse counters");
+
+ return 0;
+}
+
+core_initcall(net_inuse_init);
+
+static void assign_proto_idx(struct proto *prot)
+{
+ prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
+
+ if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
+ pr_err("PROTO_INUSE_NR exhausted\n");
+ return;
+ }
+
+ set_bit(prot->inuse_idx, proto_inuse_idx);
+}
+
+static void release_proto_idx(struct proto *prot)
+{
+ if (prot->inuse_idx != PROTO_INUSE_NR - 1)
+ clear_bit(prot->inuse_idx, proto_inuse_idx);
+}
+#else
+static inline void assign_proto_idx(struct proto *prot)
+{
+}
+
+static inline void release_proto_idx(struct proto *prot)
+{
+}
+
+static void sock_inuse_add(struct net *net, int val)
+{
+}
+#endif
+
+static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
+{
+ if (!rsk_prot)
+ return;
+ kfree(rsk_prot->slab_name);
+ rsk_prot->slab_name = NULL;
+ kmem_cache_destroy(rsk_prot->slab);
+ rsk_prot->slab = NULL;
+}
+
+static int req_prot_init(const struct proto *prot)
+{
+ struct request_sock_ops *rsk_prot = prot->rsk_prot;
+
+ if (!rsk_prot)
+ return 0;
+
+ rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
+ prot->name);
+ if (!rsk_prot->slab_name)
+ return -ENOMEM;
+
+ rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
+ rsk_prot->obj_size, 0,
+ SLAB_ACCOUNT | prot->slab_flags,
+ NULL);
+
+ if (!rsk_prot->slab) {
+ pr_crit("%s: Can't create request sock SLAB cache!\n",
+ prot->name);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+int proto_register(struct proto *prot, int alloc_slab)
+{
+ if (alloc_slab) {
+ prot->slab = kmem_cache_create_usercopy(prot->name,
+ prot->obj_size, 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
+ prot->slab_flags,
+ prot->useroffset, prot->usersize,
+ NULL);
+
+ if (prot->slab == NULL) {
+ pr_crit("%s: Can't create sock SLAB cache!\n",
+ prot->name);
+ goto out;
+ }
+
+ if (req_prot_init(prot))
+ goto out_free_request_sock_slab;
+
+ if (prot->twsk_prot != NULL) {
+ prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
+
+ if (prot->twsk_prot->twsk_slab_name == NULL)
+ goto out_free_request_sock_slab;
+
+ prot->twsk_prot->twsk_slab =
+ kmem_cache_create(prot->twsk_prot->twsk_slab_name,
+ prot->twsk_prot->twsk_obj_size,
+ 0,
+ SLAB_ACCOUNT |
+ prot->slab_flags,
+ NULL);
+ if (prot->twsk_prot->twsk_slab == NULL)
+ goto out_free_timewait_sock_slab_name;
+ }
+ }
+
+ mutex_lock(&proto_list_mutex);
+ list_add(&prot->node, &proto_list);
+ assign_proto_idx(prot);
+ mutex_unlock(&proto_list_mutex);
+ return 0;
+
+out_free_timewait_sock_slab_name:
+ kfree(prot->twsk_prot->twsk_slab_name);
+out_free_request_sock_slab:
+ req_prot_cleanup(prot->rsk_prot);
+
+ kmem_cache_destroy(prot->slab);
+ prot->slab = NULL;
+out:
+ return -ENOBUFS;
+}
+EXPORT_SYMBOL(proto_register);
+
+void proto_unregister(struct proto *prot)
+{
+ mutex_lock(&proto_list_mutex);
+ release_proto_idx(prot);
+ list_del(&prot->node);
+ mutex_unlock(&proto_list_mutex);
+
+ kmem_cache_destroy(prot->slab);
+ prot->slab = NULL;
+
+ req_prot_cleanup(prot->rsk_prot);
+
+ if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
+ kmem_cache_destroy(prot->twsk_prot->twsk_slab);
+ kfree(prot->twsk_prot->twsk_slab_name);
+ prot->twsk_prot->twsk_slab = NULL;
+ }
+}
+EXPORT_SYMBOL(proto_unregister);
+
+int sock_load_diag_module(int family, int protocol)
+{
+ if (!protocol) {
+ if (!sock_is_registered(family))
+ return -ENOENT;
+
+ return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, family);
+ }
+
+#ifdef CONFIG_INET
+ if (family == AF_INET &&
+ protocol != IPPROTO_RAW &&
+ !rcu_access_pointer(inet_protos[protocol]))
+ return -ENOENT;
+#endif
+
+ return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, family, protocol);
+}
+EXPORT_SYMBOL(sock_load_diag_module);
+
+#ifdef CONFIG_PROC_FS
+static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(proto_list_mutex)
+{
+ mutex_lock(&proto_list_mutex);
+ return seq_list_start_head(&proto_list, *pos);
+}
+
+static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &proto_list, pos);
+}
+
+static void proto_seq_stop(struct seq_file *seq, void *v)
+ __releases(proto_list_mutex)
+{
+ mutex_unlock(&proto_list_mutex);
+}
+
+static char proto_method_implemented(const void *method)
+{
+ return method == NULL ? 'n' : 'y';
+}
+static long sock_prot_memory_allocated(struct proto *proto)
+{
+ return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
+}
+
+static char *sock_prot_memory_pressure(struct proto *proto)
+{
+ return proto->memory_pressure != NULL ?
+ proto_memory_pressure(proto) ? "yes" : "no" : "NI";
+}
+
+static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
+{
+ seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
+ "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
+ proto->name,
+ proto->obj_size,
+ sock_prot_inuse_get(seq_file_net(seq), proto),
+ sock_prot_memory_allocated(proto),
+ sock_prot_memory_pressure(proto),
+ proto->max_header,
+ proto->slab == NULL ? "no" : "yes",
+ module_name(proto->owner),
+ proto_method_implemented(proto->close),
+ proto_method_implemented(proto->connect),
+ proto_method_implemented(proto->disconnect),
+ proto_method_implemented(proto->accept),
+ proto_method_implemented(proto->ioctl),
+ proto_method_implemented(proto->init),
+ proto_method_implemented(proto->destroy),
+ proto_method_implemented(proto->shutdown),
+ proto_method_implemented(proto->setsockopt),
+ proto_method_implemented(proto->getsockopt),
+ proto_method_implemented(proto->sendmsg),
+ proto_method_implemented(proto->recvmsg),
+ proto_method_implemented(proto->sendpage),
+ proto_method_implemented(proto->bind),
+ proto_method_implemented(proto->backlog_rcv),
+ proto_method_implemented(proto->hash),
+ proto_method_implemented(proto->unhash),
+ proto_method_implemented(proto->get_port),
+ proto_method_implemented(proto->enter_memory_pressure));
+}
+
+static int proto_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == &proto_list)
+ seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
+ "protocol",
+ "size",
+ "sockets",
+ "memory",
+ "press",
+ "maxhdr",
+ "slab",
+ "module",
+ "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
+ else
+ proto_seq_printf(seq, list_entry(v, struct proto, node));
+ return 0;
+}
+
+static const struct seq_operations proto_seq_ops = {
+ .start = proto_seq_start,
+ .next = proto_seq_next,
+ .stop = proto_seq_stop,
+ .show = proto_seq_show,
+};
+
+static __net_init int proto_init_net(struct net *net)
+{
+ if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
+ sizeof(struct seq_net_private)))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static __net_exit void proto_exit_net(struct net *net)
+{
+ remove_proc_entry("protocols", net->proc_net);
+}
+
+static __net_initdata struct pernet_operations proto_net_ops = {
+ .init = proto_init_net,
+ .exit = proto_exit_net,
+};
+
+static int __init proto_init(void)
+{
+ return register_pernet_subsys(&proto_net_ops);
+}
+
+subsys_initcall(proto_init);
+
+#endif /* PROC_FS */
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+bool sk_busy_loop_end(void *p, unsigned long start_time)
+{
+ struct sock *sk = p;
+
+ return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
+ sk_busy_loop_timeout(sk, start_time);
+}
+EXPORT_SYMBOL(sk_busy_loop_end);
+#endif /* CONFIG_NET_RX_BUSY_POLL */
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
new file mode 100644
index 000000000..3312a5849
--- /dev/null
+++ b/net/core/sock_diag.c
@@ -0,0 +1,337 @@
+/* License: GPL */
+
+#include <linux/mutex.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <linux/module.h>
+#include <net/sock.h>
+#include <linux/kernel.h>
+#include <linux/tcp.h>
+#include <linux/workqueue.h>
+#include <linux/nospec.h>
+
+#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
+
+static const struct sock_diag_handler *sock_diag_handlers[AF_MAX];
+static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
+static DEFINE_MUTEX(sock_diag_table_mutex);
+static struct workqueue_struct *broadcast_wq;
+
+u64 sock_gen_cookie(struct sock *sk)
+{
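+	/* Lazily assign a stable 64-bit cookie: if one is already set,
+	 * return it; otherwise generate the next per-netns value and try to
+	 * install it with cmpxchg, retrying if we lose the race.
+	 */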
+ while (1) {
+ u64 res = atomic64_read(&sk->sk_cookie);
+
+ if (res)
+ return res;
+ res = atomic64_inc_return(&sock_net(sk)->cookie_gen);
+ atomic64_cmpxchg(&sk->sk_cookie, 0, res);
+ }
+}
+
+int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie)
+{
+ u64 res;
+
+ if (cookie[0] == INET_DIAG_NOCOOKIE && cookie[1] == INET_DIAG_NOCOOKIE)
+ return 0;
+
+ res = sock_gen_cookie(sk);
+ if ((u32)res != cookie[0] || (u32)(res >> 32) != cookie[1])
+ return -ESTALE;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sock_diag_check_cookie);
+
+void sock_diag_save_cookie(struct sock *sk, __u32 *cookie)
+{
+ u64 res = sock_gen_cookie(sk);
+
+ cookie[0] = (u32)res;
+ cookie[1] = (u32)(res >> 32);
+}
+EXPORT_SYMBOL_GPL(sock_diag_save_cookie);
+
+int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype)
+{
+ u32 mem[SK_MEMINFO_VARS];
+
+ sk_get_meminfo(sk, mem);
+
+ return nla_put(skb, attrtype, sizeof(mem), &mem);
+}
+EXPORT_SYMBOL_GPL(sock_diag_put_meminfo);
+
+int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk,
+ struct sk_buff *skb, int attrtype)
+{
+ struct sock_fprog_kern *fprog;
+ struct sk_filter *filter;
+ struct nlattr *attr;
+ unsigned int flen;
+ int err = 0;
+
+ if (!may_report_filterinfo) {
+ nla_reserve(skb, attrtype, 0);
+ return 0;
+ }
+
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
+ if (!filter)
+ goto out;
+
+ fprog = filter->prog->orig_prog;
+ if (!fprog)
+ goto out;
+
+ flen = bpf_classic_proglen(fprog);
+
+ attr = nla_reserve(skb, attrtype, flen);
+ if (attr == NULL) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ memcpy(nla_data(attr), fprog->filter, flen);
+out:
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(sock_diag_put_filterinfo);
+
+struct broadcast_sk {
+ struct sock *sk;
+ struct work_struct work;
+};
+
+static size_t sock_diag_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct inet_diag_msg)
+ + nla_total_size(sizeof(u8)) /* INET_DIAG_PROTOCOL */
+ + nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */
+}
+
+static void sock_diag_broadcast_destroy_work(struct work_struct *work)
+{
+ struct broadcast_sk *bsk =
+ container_of(work, struct broadcast_sk, work);
+ struct sock *sk = bsk->sk;
+ const struct sock_diag_handler *hndl;
+ struct sk_buff *skb;
+ const enum sknetlink_groups group = sock_diag_destroy_group(sk);
+ int err = -1;
+
+ WARN_ON(group == SKNLGRP_NONE);
+
+ skb = nlmsg_new(sock_diag_nlmsg_size(), GFP_KERNEL);
+ if (!skb)
+ goto out;
+
+ mutex_lock(&sock_diag_table_mutex);
+ hndl = sock_diag_handlers[sk->sk_family];
+ if (hndl && hndl->get_info)
+ err = hndl->get_info(skb, sk);
+ mutex_unlock(&sock_diag_table_mutex);
+
+ if (!err)
+ nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group,
+ GFP_KERNEL);
+ else
+ kfree_skb(skb);
+out:
+ sk_destruct(sk);
+ kfree(bsk);
+}
+
+void sock_diag_broadcast_destroy(struct sock *sk)
+{
+ /* Note, this function is often called from an interrupt context. */
+ struct broadcast_sk *bsk =
+ kmalloc(sizeof(struct broadcast_sk), GFP_ATOMIC);
+ if (!bsk)
+ return sk_destruct(sk);
+ bsk->sk = sk;
+ INIT_WORK(&bsk->work, sock_diag_broadcast_destroy_work);
+ queue_work(broadcast_wq, &bsk->work);
+}
+
+void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
+{
+ mutex_lock(&sock_diag_table_mutex);
+ inet_rcv_compat = fn;
+ mutex_unlock(&sock_diag_table_mutex);
+}
+EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat);
+
+void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
+{
+ mutex_lock(&sock_diag_table_mutex);
+ inet_rcv_compat = NULL;
+ mutex_unlock(&sock_diag_table_mutex);
+}
+EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat);
+
+int sock_diag_register(const struct sock_diag_handler *hndl)
+{
+ int err = 0;
+
+ if (hndl->family >= AF_MAX)
+ return -EINVAL;
+
+ mutex_lock(&sock_diag_table_mutex);
+ if (sock_diag_handlers[hndl->family])
+ err = -EBUSY;
+ else
+ sock_diag_handlers[hndl->family] = hndl;
+ mutex_unlock(&sock_diag_table_mutex);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sock_diag_register);
+
+void sock_diag_unregister(const struct sock_diag_handler *hnld)
+{
+ int family = hnld->family;
+
+ if (family >= AF_MAX)
+ return;
+
+ mutex_lock(&sock_diag_table_mutex);
+ BUG_ON(sock_diag_handlers[family] != hnld);
+ sock_diag_handlers[family] = NULL;
+ mutex_unlock(&sock_diag_table_mutex);
+}
+EXPORT_SYMBOL_GPL(sock_diag_unregister);
+
+static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int err;
+ struct sock_diag_req *req = nlmsg_data(nlh);
+ const struct sock_diag_handler *hndl;
+
+ if (nlmsg_len(nlh) < sizeof(*req))
+ return -EINVAL;
+
+ if (req->sdiag_family >= AF_MAX)
+ return -EINVAL;
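+	/* Clamp the array index under speculation (Spectre v1). */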
+ req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX);
+
+ if (sock_diag_handlers[req->sdiag_family] == NULL)
+ sock_load_diag_module(req->sdiag_family, 0);
+
+ mutex_lock(&sock_diag_table_mutex);
+ hndl = sock_diag_handlers[req->sdiag_family];
+ if (hndl == NULL)
+ err = -ENOENT;
+ else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY)
+ err = hndl->dump(skb, nlh);
+ else if (nlh->nlmsg_type == SOCK_DESTROY && hndl->destroy)
+ err = hndl->destroy(skb, nlh);
+ else
+ err = -EOPNOTSUPP;
+ mutex_unlock(&sock_diag_table_mutex);
+
+ return err;
+}
+
+static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ switch (nlh->nlmsg_type) {
+ case TCPDIAG_GETSOCK:
+ case DCCPDIAG_GETSOCK:
+ if (inet_rcv_compat == NULL)
+ sock_load_diag_module(AF_INET, 0);
+
+ mutex_lock(&sock_diag_table_mutex);
+ if (inet_rcv_compat != NULL)
+ ret = inet_rcv_compat(skb, nlh);
+ else
+ ret = -EOPNOTSUPP;
+ mutex_unlock(&sock_diag_table_mutex);
+
+ return ret;
+ case SOCK_DIAG_BY_FAMILY:
+ case SOCK_DESTROY:
+ return __sock_diag_cmd(skb, nlh);
+ default:
+ return -EINVAL;
+ }
+}
+
+static DEFINE_MUTEX(sock_diag_mutex);
+
+static void sock_diag_rcv(struct sk_buff *skb)
+{
+ mutex_lock(&sock_diag_mutex);
+ netlink_rcv_skb(skb, &sock_diag_rcv_msg);
+ mutex_unlock(&sock_diag_mutex);
+}
+
+static int sock_diag_bind(struct net *net, int group)
+{
+ switch (group) {
+ case SKNLGRP_INET_TCP_DESTROY:
+ case SKNLGRP_INET_UDP_DESTROY:
+ if (!sock_diag_handlers[AF_INET])
+ sock_load_diag_module(AF_INET, 0);
+ break;
+ case SKNLGRP_INET6_TCP_DESTROY:
+ case SKNLGRP_INET6_UDP_DESTROY:
+ if (!sock_diag_handlers[AF_INET6])
+ sock_load_diag_module(AF_INET6, 0);
+ break;
+ }
+ return 0;
+}
+
+int sock_diag_destroy(struct sock *sk, int err)
+{
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!sk->sk_prot->diag_destroy)
+ return -EOPNOTSUPP;
+
+ return sk->sk_prot->diag_destroy(sk, err);
+}
+EXPORT_SYMBOL_GPL(sock_diag_destroy);
+
+static int __net_init diag_net_init(struct net *net)
+{
+ struct netlink_kernel_cfg cfg = {
+ .groups = SKNLGRP_MAX,
+ .input = sock_diag_rcv,
+ .bind = sock_diag_bind,
+ .flags = NL_CFG_F_NONROOT_RECV,
+ };
+
+ net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg);
+ return net->diag_nlsk == NULL ? -ENOMEM : 0;
+}
+
+static void __net_exit diag_net_exit(struct net *net)
+{
+ netlink_kernel_release(net->diag_nlsk);
+ net->diag_nlsk = NULL;
+}
+
+static struct pernet_operations diag_net_ops = {
+ .init = diag_net_init,
+ .exit = diag_net_exit,
+};
+
+static int __init sock_diag_init(void)
+{
+ broadcast_wq = alloc_workqueue("sock_diag_events", 0, 0);
+ BUG_ON(!broadcast_wq);
+ return register_pernet_subsys(&diag_net_ops);
+}
+device_initcall(sock_diag_init);
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
new file mode 100644
index 000000000..375a3bbe6
--- /dev/null
+++ b/net/core/sock_reuseport.c
@@ -0,0 +1,343 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * To speed up listener socket lookup, create an array to store all sockets
+ * listening on the same port. This allows a decision to be made after finding
+ * the first socket. An optional BPF program can also be configured for
+ * selecting the socket index from the array of available sockets.
+ */
+
+#include <net/sock_reuseport.h>
+#include <linux/bpf.h>
+#include <linux/idr.h>
+#include <linux/filter.h>
+#include <linux/rcupdate.h>
+
+#define INIT_SOCKS 128
+
+DEFINE_SPINLOCK(reuseport_lock);
+
+#define REUSEPORT_MIN_ID 1
+static DEFINE_IDA(reuseport_ida);
+
+int reuseport_get_id(struct sock_reuseport *reuse)
+{
+ int id;
+
+ if (reuse->reuseport_id)
+ return reuse->reuseport_id;
+
+ id = ida_simple_get(&reuseport_ida, REUSEPORT_MIN_ID, 0,
+ /* Called under reuseport_lock */
+ GFP_ATOMIC);
+ if (id < 0)
+ return id;
+
+ reuse->reuseport_id = id;
+
+ return reuse->reuseport_id;
+}
+
+static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
+{
+ unsigned int size = sizeof(struct sock_reuseport) +
+ sizeof(struct sock *) * max_socks;
+ struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);
+
+ if (!reuse)
+ return NULL;
+
+ reuse->max_socks = max_socks;
+
+ RCU_INIT_POINTER(reuse->prog, NULL);
+ return reuse;
+}
+
+int reuseport_alloc(struct sock *sk, bool bind_inany)
+{
+ struct sock_reuseport *reuse;
+
+	/* The bh lock is used since this call may precede the hlist lock,
+	 * either in the soft irq of the receive path or in setsockopt()
+	 * from process context.
+	 */
+ spin_lock_bh(&reuseport_lock);
+
+ /* Allocation attempts can occur concurrently via the setsockopt path
+ * and the bind/hash path. Nothing to do when we lose the race.
+ */
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ if (reuse) {
+ /* Only set reuse->bind_inany if the bind_inany is true.
+ * Otherwise, it will overwrite the reuse->bind_inany
+ * which was set by the bind/hash path.
+ */
+ if (bind_inany)
+ reuse->bind_inany = bind_inany;
+ goto out;
+ }
+
+ reuse = __reuseport_alloc(INIT_SOCKS);
+ if (!reuse) {
+ spin_unlock_bh(&reuseport_lock);
+ return -ENOMEM;
+ }
+
+ reuse->socks[0] = sk;
+ reuse->num_socks = 1;
+ reuse->bind_inany = bind_inany;
+ rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
+
+out:
+ spin_unlock_bh(&reuseport_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(reuseport_alloc);
+
+static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
+{
+ struct sock_reuseport *more_reuse;
+ u32 more_socks_size, i;
+
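+	/* Double the socket array; a reuseport group is capped at U16_MAX
+	 * sockets.
+	 */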
+ more_socks_size = reuse->max_socks * 2U;
+ if (more_socks_size > U16_MAX)
+ return NULL;
+
+ more_reuse = __reuseport_alloc(more_socks_size);
+ if (!more_reuse)
+ return NULL;
+
+ more_reuse->max_socks = more_socks_size;
+ more_reuse->num_socks = reuse->num_socks;
+ more_reuse->prog = reuse->prog;
+ more_reuse->reuseport_id = reuse->reuseport_id;
+ more_reuse->bind_inany = reuse->bind_inany;
+ more_reuse->has_conns = reuse->has_conns;
+
+ memcpy(more_reuse->socks, reuse->socks,
+ reuse->num_socks * sizeof(struct sock *));
+ more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
+
+ for (i = 0; i < reuse->num_socks; ++i)
+ rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
+ more_reuse);
+
+ /* Note: we use kfree_rcu here instead of reuseport_free_rcu so
+ * that reuse and more_reuse can temporarily share a reference
+ * to prog.
+ */
+ kfree_rcu(reuse, rcu);
+ return more_reuse;
+}
+
+static void reuseport_free_rcu(struct rcu_head *head)
+{
+ struct sock_reuseport *reuse;
+
+ reuse = container_of(head, struct sock_reuseport, rcu);
+ sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
+ if (reuse->reuseport_id)
+ ida_simple_remove(&reuseport_ida, reuse->reuseport_id);
+ kfree(reuse);
+}
+
+/**
+ * reuseport_add_sock - Add a socket to the reuseport group of another.
+ * @sk: New socket to add to the group.
+ *  @sk2: Socket belonging to the existing reuseport group.
+ *  @bind_inany: Whether the socket is bound to a wildcard (INADDR_ANY-style) address.
+ *  May return ENOMEM and not add the socket to the group under memory pressure.
+ */
+int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
+{
+ struct sock_reuseport *old_reuse, *reuse;
+
+ if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
+ int err = reuseport_alloc(sk2, bind_inany);
+
+ if (err)
+ return err;
+ }
+
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ if (old_reuse && old_reuse->num_socks != 1) {
+ spin_unlock_bh(&reuseport_lock);
+ return -EBUSY;
+ }
+
+ if (reuse->num_socks == reuse->max_socks) {
+ reuse = reuseport_grow(reuse);
+ if (!reuse) {
+ spin_unlock_bh(&reuseport_lock);
+ return -ENOMEM;
+ }
+ }
+
+ reuse->socks[reuse->num_socks] = sk;
+ /* paired with smp_rmb() in reuseport_select_sock() */
+ smp_wmb();
+ reuse->num_socks++;
+ rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
+
+ spin_unlock_bh(&reuseport_lock);
+
+ if (old_reuse)
+ call_rcu(&old_reuse->rcu, reuseport_free_rcu);
+ return 0;
+}
+
+void reuseport_detach_sock(struct sock *sk)
+{
+ struct sock_reuseport *reuse;
+ int i;
+
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+
+	/* At least one of the sockets in this reuseport group has been
+	 * added to a bpf map. Notify the bpf side; the bpf map logic will
+	 * remove the sk if it was indeed added to a map.
+	 */
+ if (reuse->reuseport_id)
+ bpf_sk_reuseport_detach(sk);
+
+ rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
+
+ for (i = 0; i < reuse->num_socks; i++) {
+ if (reuse->socks[i] == sk) {
+ reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
+ reuse->num_socks--;
+ if (reuse->num_socks == 0)
+ call_rcu(&reuse->rcu, reuseport_free_rcu);
+ break;
+ }
+ }
+ spin_unlock_bh(&reuseport_lock);
+}
+EXPORT_SYMBOL(reuseport_detach_sock);
+
+static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
+ struct bpf_prog *prog, struct sk_buff *skb,
+ int hdr_len)
+{
+ struct sk_buff *nskb = NULL;
+ u32 index;
+
+ if (skb_shared(skb)) {
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+ skb = nskb;
+ }
+
+ /* temporarily advance data past protocol header */
+ if (!pskb_pull(skb, hdr_len)) {
+ kfree_skb(nskb);
+ return NULL;
+ }
+ index = bpf_prog_run_save_cb(prog, skb);
+ __skb_push(skb, hdr_len);
+
+ consume_skb(nskb);
+
+ if (index >= socks)
+ return NULL;
+
+ return reuse->socks[index];
+}
+
+/**
+ * reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
+ * @sk: First socket in the group.
+ * @hash: When no BPF filter is available, use this hash to select.
+ * @skb: skb to run through BPF filter.
+ * @hdr_len: BPF filter expects skb data pointer at payload data. If
+ * the skb does not yet point at the payload, this parameter represents
+ * how far the pointer needs to advance to reach the payload.
+ * Returns a socket that should receive the packet (or NULL on error).
+ */
+struct sock *reuseport_select_sock(struct sock *sk,
+ u32 hash,
+ struct sk_buff *skb,
+ int hdr_len)
+{
+ struct sock_reuseport *reuse;
+ struct bpf_prog *prog;
+ struct sock *sk2 = NULL;
+ u16 socks;
+
+ rcu_read_lock();
+ reuse = rcu_dereference(sk->sk_reuseport_cb);
+
+ /* if memory allocation failed or add call is not yet complete */
+ if (!reuse)
+ goto out;
+
+ prog = rcu_dereference(reuse->prog);
+ socks = READ_ONCE(reuse->num_socks);
+ if (likely(socks)) {
+ /* paired with smp_wmb() in reuseport_add_sock() */
+ smp_rmb();
+
+ if (!prog || !skb)
+ goto select_by_hash;
+
+ if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
+ sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
+ else
+ sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
+
+select_by_hash:
+ /* no bpf or invalid bpf result: fall back to hash usage */
+ if (!sk2) {
+ int i, j;
+
+ i = j = reciprocal_scale(hash, socks);
+ while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
+ i++;
+ if (i >= socks)
+ i = 0;
+ if (i == j)
+ goto out;
+ }
+ sk2 = reuse->socks[i];
+ }
+ }
+
+out:
+ rcu_read_unlock();
+ return sk2;
+}
+EXPORT_SYMBOL(reuseport_select_sock);
+
+int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
+{
+ struct sock_reuseport *reuse;
+ struct bpf_prog *old_prog;
+
+ if (sk_unhashed(sk) && sk->sk_reuseport) {
+ int err = reuseport_alloc(sk, false);
+
+ if (err)
+ return err;
+ } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
+ /* The socket wasn't bound with SO_REUSEPORT */
+ return -EINVAL;
+ }
+
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ old_prog = rcu_dereference_protected(reuse->prog,
+ lockdep_is_held(&reuseport_lock));
+ rcu_assign_pointer(reuse->prog, prog);
+ spin_unlock_bh(&reuseport_lock);
+
+ sk_reuseport_prog_free(old_prog);
+ return 0;
+}
+EXPORT_SYMBOL(reuseport_attach_prog);
diff --git a/net/core/stream.c b/net/core/stream.c
new file mode 100644
index 000000000..3d98774cf
--- /dev/null
+++ b/net/core/stream.c
@@ -0,0 +1,212 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * SUCS NET3:
+ *
+ * Generic stream handling routines. These are generic for most
+ * protocols. Even IP. Tonight 8-).
+ * This is used because TCP, LLC (others too) layer all have mostly
+ * identical sendmsg() and recvmsg() code.
+ * So we (will) share it here.
+ *
+ * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ * (from old tcp.c code)
+ * Alan Cox <alan@lxorguk.ukuu.org.uk> (Borrowed comments 8-))
+ */
+
+#include <linux/module.h>
+#include <linux/sched/signal.h>
+#include <linux/net.h>
+#include <linux/signal.h>
+#include <linux/tcp.h>
+#include <linux/wait.h>
+#include <net/sock.h>
+
+/**
+ * sk_stream_write_space - stream socket write_space callback.
+ * @sk: socket
+ *
+ * FIXME: write proper description
+ */
+void sk_stream_write_space(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+ struct socket_wq *wq;
+
+ if (sk_stream_is_writeable(sk) && sock) {
+ clear_bit(SOCK_NOSPACE, &sock->flags);
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_poll(&wq->wait, EPOLLOUT |
+ EPOLLWRNORM | EPOLLWRBAND);
+ if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
+ sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
+ rcu_read_unlock();
+ }
+}
+
+/**
+ * sk_stream_wait_connect - Wait for a socket to get into the connected state
+ * @sk: sock to wait on
+ * @timeo_p: for how long to wait
+ *
+ * Must be called with the socket locked.
+ */
+int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct task_struct *tsk = current;
+ int done;
+
+ do {
+ int err = sock_error(sk);
+ if (err)
+ return err;
+ if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
+ return -EPIPE;
+ if (!*timeo_p)
+ return -EAGAIN;
+ if (signal_pending(tsk))
+ return sock_intr_errno(*timeo_p);
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk->sk_write_pending++;
+ done = sk_wait_event(sk, timeo_p,
+ !sk->sk_err &&
+ !((1 << sk->sk_state) &
+ ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ sk->sk_write_pending--;
+ } while (!done);
+ return 0;
+}
+EXPORT_SYMBOL(sk_stream_wait_connect);
+
+/**
+ * sk_stream_closing - Return 1 if we still have things to send in our buffers.
+ * @sk: socket to verify
+ */
+static inline int sk_stream_closing(struct sock *sk)
+{
+ return (1 << sk->sk_state) &
+ (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
+}
+
+void sk_stream_wait_close(struct sock *sk, long timeout)
+{
+ if (timeout) {
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ add_wait_queue(sk_sleep(sk), &wait);
+
+ do {
+ if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk), &wait))
+ break;
+ } while (!signal_pending(current) && timeout);
+
+ remove_wait_queue(sk_sleep(sk), &wait);
+ }
+}
+EXPORT_SYMBOL(sk_stream_wait_close);
+
+/**
+ * sk_stream_wait_memory - Wait for more memory for a socket
+ * @sk: socket to wait for memory
+ * @timeo_p: for how long
+ */
+int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
+{
+ int err = 0;
+ long vm_wait = 0;
+ long current_timeo = *timeo_p;
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ if (sk_stream_memory_free(sk))
+ current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
+
+ add_wait_queue(sk_sleep(sk), &wait);
+
+ while (1) {
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+ goto do_error;
+ if (!*timeo_p)
+ goto do_eagain;
+ if (signal_pending(current))
+ goto do_interrupted;
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+ if (sk_stream_memory_free(sk) && !vm_wait)
+ break;
+
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ sk->sk_write_pending++;
+ sk_wait_event(sk, &current_timeo, sk->sk_err ||
+ (sk->sk_shutdown & SEND_SHUTDOWN) ||
+ (sk_stream_memory_free(sk) &&
+ !vm_wait), &wait);
+ sk->sk_write_pending--;
+
+ if (vm_wait) {
+ vm_wait -= current_timeo;
+ current_timeo = *timeo_p;
+ if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
+ (current_timeo -= vm_wait) < 0)
+ current_timeo = 0;
+ vm_wait = 0;
+ }
+ *timeo_p = current_timeo;
+ }
+out:
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return err;
+
+do_error:
+ err = -EPIPE;
+ goto out;
+do_eagain:
+ /* Make sure that whenever EAGAIN is returned, EPOLLOUT event can
+ * be generated later.
+ * When TCP receives ACK packets that make room, tcp_check_space()
+ * only calls tcp_new_space() if SOCK_NOSPACE is set.
+ */
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ err = -EAGAIN;
+ goto out;
+do_interrupted:
+ err = sock_intr_errno(*timeo_p);
+ goto out;
+}
+EXPORT_SYMBOL(sk_stream_wait_memory);
+
+int sk_stream_error(struct sock *sk, int flags, int err)
+{
+ if (err == -EPIPE)
+ err = sock_error(sk) ? : -EPIPE;
+ if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
+ send_sig(SIGPIPE, current, 0);
+ return err;
+}
+EXPORT_SYMBOL(sk_stream_error);
+
+void sk_stream_kill_queues(struct sock *sk)
+{
+ /* First the read buffer. */
+ __skb_queue_purge(&sk->sk_receive_queue);
+
+ /* Next, the write queue. */
+ WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
+
+ /* Account for returned memory. */
+ sk_mem_reclaim(sk);
+
+ WARN_ON(sk->sk_wmem_queued);
+ WARN_ON(sk->sk_forward_alloc);
+
+	/* It is _impossible_ for the backlog to contain anything
+	 * when we get here.  All user references to this socket
+	 * have gone away; only the net layer can still touch it.
+	 */
+}
+EXPORT_SYMBOL(sk_stream_kill_queues);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
new file mode 100644
index 000000000..0a0bf8062
--- /dev/null
+++ b/net/core/sysctl_net_core.c
@@ -0,0 +1,616 @@
+// SPDX-License-Identifier: GPL-2.0
+/* -*- linux-c -*-
+ * sysctl_net_core.c: sysctl interface to net core subsystem.
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net/core directory entry (empty =) ). [MS]
+ */
+
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/ratelimit.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+
+#include <net/ip.h>
+#include <net/sock.h>
+#include <net/net_ratelimit.h>
+#include <net/busy_poll.h>
+#include <net/pkt_sched.h>
+
+static int zero = 0;
+static int one = 1;
+static int two __maybe_unused = 2;
+static int min_sndbuf = SOCK_MIN_SNDBUF;
+static int min_rcvbuf = SOCK_MIN_RCVBUF;
+static int max_skb_frags = MAX_SKB_FRAGS;
+static long long_one __maybe_unused = 1;
+static long long_max __maybe_unused = LONG_MAX;
+
+static int net_msg_warn; /* Unused, but still a sysctl */
+
+int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0;
+EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);
+
+#ifdef CONFIG_RPS
+static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ unsigned int orig_size, size;
+ int ret, i;
+ struct ctl_table tmp = {
+ .data = &size,
+ .maxlen = sizeof(size),
+ .mode = table->mode
+ };
+ struct rps_sock_flow_table *orig_sock_table, *sock_table;
+ static DEFINE_MUTEX(sock_flow_mutex);
+
+ mutex_lock(&sock_flow_mutex);
+
+ orig_sock_table = rcu_dereference_protected(rps_sock_flow_table,
+ lockdep_is_held(&sock_flow_mutex));
+ size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
+
+ ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+
+ if (write) {
+ if (size) {
+ if (size > 1<<29) {
+ /* Enforce limit to prevent overflow */
+ mutex_unlock(&sock_flow_mutex);
+ return -EINVAL;
+ }
+ size = roundup_pow_of_two(size);
+ if (size != orig_size) {
+ sock_table =
+ vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
+ if (!sock_table) {
+ mutex_unlock(&sock_flow_mutex);
+ return -ENOMEM;
+ }
+ rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1;
+ sock_table->mask = size - 1;
+ } else
+ sock_table = orig_sock_table;
+
+ for (i = 0; i < size; i++)
+ sock_table->ents[i] = RPS_NO_CPU;
+ } else
+ sock_table = NULL;
+
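+		/* Publish the resized table with RCU; free the old one only
+		 * after a grace period so concurrent readers stay safe.
+		 */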
+ if (sock_table != orig_sock_table) {
+ rcu_assign_pointer(rps_sock_flow_table, sock_table);
+ if (sock_table) {
+ static_key_slow_inc(&rps_needed);
+ static_key_slow_inc(&rfs_needed);
+ }
+ if (orig_sock_table) {
+ static_key_slow_dec(&rps_needed);
+ static_key_slow_dec(&rfs_needed);
+ synchronize_rcu();
+ vfree(orig_sock_table);
+ }
+ }
+ }
+
+ mutex_unlock(&sock_flow_mutex);
+
+ return ret;
+}
+#endif /* CONFIG_RPS */
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+static DEFINE_MUTEX(flow_limit_update_mutex);
+
+static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct sd_flow_limit *cur;
+ struct softnet_data *sd;
+ cpumask_var_t mask;
+ int i, len, ret = 0;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (write) {
+ ret = cpumask_parse_user(buffer, *lenp, mask);
+ if (ret)
+ goto done;
+
+ mutex_lock(&flow_limit_update_mutex);
+ len = sizeof(*cur) + netdev_flow_limit_table_len;
+ for_each_possible_cpu(i) {
+ sd = &per_cpu(softnet_data, i);
+ cur = rcu_dereference_protected(sd->flow_limit,
+ lockdep_is_held(&flow_limit_update_mutex));
+ if (cur && !cpumask_test_cpu(i, mask)) {
+ RCU_INIT_POINTER(sd->flow_limit, NULL);
+ synchronize_rcu();
+ kfree(cur);
+ } else if (!cur && cpumask_test_cpu(i, mask)) {
+ cur = kzalloc_node(len, GFP_KERNEL,
+ cpu_to_node(i));
+ if (!cur) {
+ /* not unwinding previous changes */
+ ret = -ENOMEM;
+ goto write_unlock;
+ }
+ cur->num_buckets = netdev_flow_limit_table_len;
+ rcu_assign_pointer(sd->flow_limit, cur);
+ }
+ }
+write_unlock:
+ mutex_unlock(&flow_limit_update_mutex);
+ } else {
+ char kbuf[128];
+
+ if (*ppos || !*lenp) {
+ *lenp = 0;
+ goto done;
+ }
+
+ cpumask_clear(mask);
+ rcu_read_lock();
+ for_each_possible_cpu(i) {
+ sd = &per_cpu(softnet_data, i);
+ if (rcu_dereference(sd->flow_limit))
+ cpumask_set_cpu(i, mask);
+ }
+ rcu_read_unlock();
+
+ len = min(sizeof(kbuf) - 1, *lenp);
+ len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask));
+ if (!len) {
+ *lenp = 0;
+ goto done;
+ }
+ if (len < *lenp)
+ kbuf[len++] = '\n';
+ if (copy_to_user(buffer, kbuf, len)) {
+ ret = -EFAULT;
+ goto done;
+ }
+ *lenp = len;
+ *ppos += len;
+ }
+
+done:
+ free_cpumask_var(mask);
+ return ret;
+}
+
+static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ unsigned int old, *ptr;
+ int ret;
+
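+	/* The per-CPU flow limit hash table length must be a power of two;
+	 * restore the previous value and fail otherwise.
+	 */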
+ mutex_lock(&flow_limit_update_mutex);
+
+ ptr = table->data;
+ old = *ptr;
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (!ret && write && !is_power_of_2(*ptr)) {
+ *ptr = old;
+ ret = -EINVAL;
+ }
+
+ mutex_unlock(&flow_limit_update_mutex);
+ return ret;
+}
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
+#ifdef CONFIG_NET_SCHED
+static int set_default_qdisc(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char id[IFNAMSIZ];
+ struct ctl_table tbl = {
+ .data = id,
+ .maxlen = IFNAMSIZ,
+ };
+ int ret;
+
+ qdisc_get_default(id, IFNAMSIZ);
+
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ ret = qdisc_set_default(id);
+ return ret;
+}
+#endif
+
+static int proc_do_dev_weight(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (ret != 0)
+ return ret;
+
+ dev_rx_weight = weight_p * dev_weight_rx_bias;
+ dev_tx_weight = weight_p * dev_weight_tx_bias;
+
+ return ret;
+}
+
+static int proc_do_rss_key(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table fake_table;
+ char buf[NETDEV_RSS_KEY_LEN * 3];
+
+ snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key);
+ fake_table.data = buf;
+ fake_table.maxlen = sizeof(buf);
+ return proc_dostring(&fake_table, write, buffer, lenp, ppos);
+}
+
+#ifdef CONFIG_BPF_JIT
+static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret, jit_enable = *(int *)table->data;
+ struct ctl_table tmp = *table;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ tmp.data = &jit_enable;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret) {
+ if (jit_enable < 2 ||
+ (jit_enable == 2 && bpf_dump_raw_ok(current_cred()))) {
+ *(int *)table->data = jit_enable;
+ if (jit_enable == 2)
+ pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n");
+ } else {
+ ret = -EPERM;
+ }
+ }
+ return ret;
+}
+
+# ifdef CONFIG_HAVE_EBPF_JIT
+static int
+proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+# endif /* CONFIG_HAVE_EBPF_JIT */
+
+static int
+proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+}
+#endif
+
+static struct ctl_table net_core_table[] = {
+#ifdef CONFIG_NET
+ {
+ .procname = "wmem_max",
+ .data = &sysctl_wmem_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_sndbuf,
+ },
+ {
+ .procname = "rmem_max",
+ .data = &sysctl_rmem_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_rcvbuf,
+ },
+ {
+ .procname = "wmem_default",
+ .data = &sysctl_wmem_default,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_sndbuf,
+ },
+ {
+ .procname = "rmem_default",
+ .data = &sysctl_rmem_default,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_rcvbuf,
+ },
+ {
+ .procname = "dev_weight",
+ .data = &weight_p,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_do_dev_weight,
+ },
+ {
+ .procname = "dev_weight_rx_bias",
+ .data = &dev_weight_rx_bias,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_do_dev_weight,
+ },
+ {
+ .procname = "dev_weight_tx_bias",
+ .data = &dev_weight_tx_bias,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_do_dev_weight,
+ },
+ {
+ .procname = "netdev_max_backlog",
+ .data = &netdev_max_backlog,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "netdev_rss_key",
+ .data = &netdev_rss_key,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_do_rss_key,
+ },
+#ifdef CONFIG_BPF_JIT
+ {
+ .procname = "bpf_jit_enable",
+ .data = &bpf_jit_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_bpf_enable,
+# ifdef CONFIG_BPF_JIT_ALWAYS_ON
+ .extra1 = &one,
+ .extra2 = &one,
+# else
+ .extra1 = &zero,
+ .extra2 = &two,
+# endif
+ },
+# ifdef CONFIG_HAVE_EBPF_JIT
+ {
+ .procname = "bpf_jit_harden",
+ .data = &bpf_jit_harden,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax_bpf_restricted,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
+ {
+ .procname = "bpf_jit_kallsyms",
+ .data = &bpf_jit_kallsyms,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax_bpf_restricted,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+# endif
+ {
+ .procname = "bpf_jit_limit",
+ .data = &bpf_jit_limit,
+ .maxlen = sizeof(long),
+ .mode = 0600,
+ .proc_handler = proc_dolongvec_minmax_bpf_restricted,
+ .extra1 = &long_one,
+ .extra2 = &bpf_jit_limit_max,
+ },
+#endif
+ {
+ .procname = "netdev_tstamp_prequeue",
+ .data = &netdev_tstamp_prequeue,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "message_cost",
+ .data = &net_ratelimit_state.interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "message_burst",
+ .data = &net_ratelimit_state.burst,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "optmem_max",
+ .data = &sysctl_optmem_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tstamp_allow_data",
+ .data = &sysctl_tstamp_allow_data,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one
+ },
+#ifdef CONFIG_RPS
+ {
+ .procname = "rps_sock_flow_entries",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = rps_sock_flow_sysctl
+ },
+#endif
+#ifdef CONFIG_NET_FLOW_LIMIT
+ {
+ .procname = "flow_limit_cpu_bitmap",
+ .mode = 0644,
+ .proc_handler = flow_limit_cpu_sysctl
+ },
+ {
+ .procname = "flow_limit_table_len",
+ .data = &netdev_flow_limit_table_len,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = flow_limit_table_len_sysctl
+ },
+#endif /* CONFIG_NET_FLOW_LIMIT */
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ {
+ .procname = "busy_poll",
+ .data = &sysctl_net_busy_poll,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "busy_read",
+ .data = &sysctl_net_busy_read,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+#endif
+#ifdef CONFIG_NET_SCHED
+ {
+ .procname = "default_qdisc",
+ .mode = 0644,
+ .maxlen = IFNAMSIZ,
+ .proc_handler = set_default_qdisc
+ },
+#endif
+#endif /* CONFIG_NET */
+ {
+ .procname = "netdev_budget",
+ .data = &netdev_budget,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "warnings",
+ .data = &net_msg_warn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "max_skb_frags",
+ .data = &sysctl_max_skb_frags,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &max_skb_frags,
+ },
+ {
+ .procname = "netdev_budget_usecs",
+ .data = &netdev_budget_usecs,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "fb_tunnels_only_for_init_net",
+ .data = &sysctl_fb_tunnels_only_for_init_net,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ { }
+};
+
+static struct ctl_table netns_core_table[] = {
+ {
+ .procname = "somaxconn",
+ .data = &init_net.core.sysctl_somaxconn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .extra1 = &zero,
+ .proc_handler = proc_dointvec_minmax
+ },
+ { }
+};
+
+static __net_init int sysctl_core_net_init(struct net *net)
+{
+ struct ctl_table *tbl;
+
+ tbl = netns_core_table;
+ if (!net_eq(net, &init_net)) {
+ tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);
+ if (tbl == NULL)
+ goto err_dup;
+
+ tbl[0].data = &net->core.sysctl_somaxconn;
+
+ /* Don't export any sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns) {
+ tbl[0].procname = NULL;
+ }
+ }
+
+ net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl);
+ if (net->core.sysctl_hdr == NULL)
+ goto err_reg;
+
+ return 0;
+
+err_reg:
+ if (tbl != netns_core_table)
+ kfree(tbl);
+err_dup:
+ return -ENOMEM;
+}
+
+static __net_exit void sysctl_core_net_exit(struct net *net)
+{
+ struct ctl_table *tbl;
+
+ tbl = net->core.sysctl_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->core.sysctl_hdr);
+ BUG_ON(tbl == netns_core_table);
+ kfree(tbl);
+}
+
+static __net_initdata struct pernet_operations sysctl_core_ops = {
+ .init = sysctl_core_net_init,
+ .exit = sysctl_core_net_exit,
+};
+
+static __init int sysctl_core_init(void)
+{
+ register_net_sysctl(&init_net, "net/core", net_core_table);
+ return register_pernet_subsys(&sysctl_core_ops);
+}
+
+fs_initcall(sysctl_core_init);
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
new file mode 100644
index 000000000..42689d5c4
--- /dev/null
+++ b/net/core/timestamping.c
@@ -0,0 +1,84 @@
+/*
+ * PTP 1588 clock support - support for timestamping in PHY devices
+ *
+ * Copyright (C) 2010 OMICRON electronics GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/errqueue.h>
+#include <linux/phy.h>
+#include <linux/ptp_classify.h>
+#include <linux/skbuff.h>
+#include <linux/export.h>
+
+static unsigned int classify(const struct sk_buff *skb)
+{
+ if (likely(skb->dev && skb->dev->phydev &&
+ skb->dev->phydev->drv))
+ return ptp_classify_raw(skb);
+ else
+ return PTP_CLASS_NONE;
+}
+
+void skb_clone_tx_timestamp(struct sk_buff *skb)
+{
+ struct phy_device *phydev;
+ struct sk_buff *clone;
+ unsigned int type;
+
+ if (!skb->sk)
+ return;
+
+ type = classify(skb);
+ if (type == PTP_CLASS_NONE)
+ return;
+
+ phydev = skb->dev->phydev;
+ if (likely(phydev->drv->txtstamp)) {
+ clone = skb_clone_sk(skb);
+ if (!clone)
+ return;
+ phydev->drv->txtstamp(phydev, clone, type);
+ }
+}
+EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);
+
+bool skb_defer_rx_timestamp(struct sk_buff *skb)
+{
+ struct phy_device *phydev;
+ unsigned int type;
+
+ if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->drv)
+ return false;
+
+ if (skb_headroom(skb) < ETH_HLEN)
+ return false;
+
+ __skb_push(skb, ETH_HLEN);
+
+ type = ptp_classify_raw(skb);
+
+ __skb_pull(skb, ETH_HLEN);
+
+ if (type == PTP_CLASS_NONE)
+ return false;
+
+ phydev = skb->dev->phydev;
+ if (likely(phydev->drv->rxtstamp))
+ return phydev->drv->rxtstamp(phydev, skb, type);
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(skb_defer_rx_timestamp);
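skb_clone_tx_timestamp() only acts on packets that have an owning socket, and the timestamped clone produced by the PHY driver is eventually queued on that socket's error queue. Below is a minimal userspace sketch of the consuming side using the standard SO_TIMESTAMPING interface; the buffer sizes and the fallback SO_TIMESTAMPING value are assumptions, and cmsg parsing is left out.

#include <linux/net_tstamp.h>	/* SOF_TIMESTAMPING_* */
#include <sys/socket.h>
#include <sys/uio.h>

#ifndef SO_TIMESTAMPING
#define SO_TIMESTAMPING 37	/* asm-generic value; assumed for older libcs */
#endif

/* Ask for hardware TX timestamps on an already-created socket. */
static int enable_tx_timestamps(int fd)
{
	int flags = SOF_TIMESTAMPING_TX_HARDWARE |
		    SOF_TIMESTAMPING_RAW_HARDWARE;

	return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
			  &flags, sizeof(flags));
}

/* After send(), the timestamped clone comes back on the error queue;
 * the timestamps arrive as an SCM_TIMESTAMPING control message
 * (struct scm_timestamping, <linux/errqueue.h>), not parsed here.
 */
static ssize_t read_tx_timestamp(int fd)
{
	char data[256], control[512];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = control, .msg_controllen = sizeof(control),
	};

	return recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT);
}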
diff --git a/net/core/tso.c b/net/core/tso.c
new file mode 100644
index 000000000..43f4eba61
--- /dev/null
+++ b/net/core/tso.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/export.h>
+#include <linux/if_vlan.h>
+#include <net/ip.h>
+#include <net/tso.h>
+#include <asm/unaligned.h>
+
+/* Calculate expected number of TX descriptors */
+int tso_count_descs(struct sk_buff *skb)
+{
+ /* The Marvell Way */
+ return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags;
+}
+EXPORT_SYMBOL(tso_count_descs);
+
+void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso,
+ int size, bool is_last)
+{
+ struct tcphdr *tcph;
+ int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ int mac_hdr_len = skb_network_offset(skb);
+
+ memcpy(hdr, skb->data, hdr_len);
+ if (!tso->ipv6) {
+ struct iphdr *iph = (void *)(hdr + mac_hdr_len);
+
+ iph->id = htons(tso->ip_id);
+ iph->tot_len = htons(size + hdr_len - mac_hdr_len);
+ tso->ip_id++;
+ } else {
+ struct ipv6hdr *iph = (void *)(hdr + mac_hdr_len);
+
+ iph->payload_len = htons(size + tcp_hdrlen(skb));
+ }
+ tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb));
+ put_unaligned_be32(tso->tcp_seq, &tcph->seq);
+
+ if (!is_last) {
+		/* Clear the special flags on every segment but the last */
+ tcph->psh = 0;
+ tcph->fin = 0;
+ tcph->rst = 0;
+ }
+}
+EXPORT_SYMBOL(tso_build_hdr);
+
+void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size)
+{
+ tso->tcp_seq += size;
+ tso->size -= size;
+ tso->data += size;
+
+ if ((tso->size == 0) &&
+ (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
+
+ /* Move to next segment */
+ tso->size = frag->size;
+ tso->data = page_address(frag->page.p) + frag->page_offset;
+ tso->next_frag_idx++;
+ }
+}
+EXPORT_SYMBOL(tso_build_data);
+
+void tso_start(struct sk_buff *skb, struct tso_t *tso)
+{
+ int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+
+ tso->ip_id = ntohs(ip_hdr(skb)->id);
+ tso->tcp_seq = ntohl(tcp_hdr(skb)->seq);
+ tso->next_frag_idx = 0;
+ tso->ipv6 = vlan_get_protocol(skb) == htons(ETH_P_IPV6);
+
+ /* Build first data */
+ tso->size = skb_headlen(skb) - hdr_len;
+ tso->data = skb->data + hdr_len;
+ if ((tso->size == 0) &&
+ (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
+
+ /* Move to next segment */
+ tso->size = frag->size;
+ tso->data = page_address(frag->page.p) + frag->page_offset;
+ tso->next_frag_idx++;
+ }
+}
+EXPORT_SYMBOL(tso_start);
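These helpers are meant to be driven from a driver xmit path: reserve descriptors with tso_count_descs(), then for each GSO-sized segment emit one rebuilt header followed by payload chunks pulled from the linear area and the frags. A hedged sketch of that loop follows; my_queue_hdr() and my_queue_data() stand in for the driver's descriptor filling and are hypothetical, as is the on-stack header scratch buffer.

#include <net/tso.h>

/* Sketch only: DMA mapping, descriptor accounting and error paths omitted. */
static void my_xmit_tso(struct sk_buff *skb)
{
	int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
	int total_len = skb->len - hdr_len;
	struct tso_t tso;

	/* tso_count_descs(skb) says how many descriptors to reserve up front */
	tso_start(skb, &tso);

	while (total_len > 0) {
		char hdr[256];	/* per-segment header scratch (hypothetical) */
		int data_left = min_t(int, skb_shinfo(skb)->gso_size, total_len);

		total_len -= data_left;

		/* Replicate and fix up the IP/TCP header for this segment */
		tso_build_hdr(skb, hdr, &tso, data_left, total_len == 0);
		my_queue_hdr(hdr, hdr_len);			/* hypothetical */

		/* Payload may span the linear area and several frags */
		while (data_left > 0) {
			int size = min_t(int, tso.size, data_left);

			my_queue_data(tso.data, size);		/* hypothetical */
			data_left -= size;
			tso_build_data(skb, &tso, size);
		}
	}
}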
diff --git a/net/core/utils.c b/net/core/utils.c
new file mode 100644
index 000000000..60045e9fe
--- /dev/null
+++ b/net/core/utils.c
@@ -0,0 +1,490 @@
+/*
+ *	Generic address resolution entity
+ *
+ * Authors:
+ * net_random Alan Cox
+ * net_ratelimit Andi Kleen
+ * in{4,6}_pton YOSHIFUJI Hideaki, Copyright (C)2006 USAGI/WIDE Project
+ *
+ * Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/inet.h>
+#include <linux/mm.h>
+#include <linux/net.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/ratelimit.h>
+#include <linux/socket.h>
+
+#include <net/sock.h>
+#include <net/net_ratelimit.h>
+#include <net/ipv6.h>
+
+#include <asm/byteorder.h>
+#include <linux/uaccess.h>
+
+DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10);
+/*
+ * All net warning printk()s should be guarded by this function.
+ */
+int net_ratelimit(void)
+{
+ return __ratelimit(&net_ratelimit_state);
+}
+EXPORT_SYMBOL(net_ratelimit);
+
+/*
+ * Convert an ASCII string to binary IP.
+ * This is outside of net/ipv4/ because various code that uses IP addresses
+ * is otherwise not dependent on the TCP/IP stack.
+ */
+
+__be32 in_aton(const char *str)
+{
+ unsigned int l;
+ unsigned int val;
+ int i;
+
+ l = 0;
+ for (i = 0; i < 4; i++) {
+ l <<= 8;
+ if (*str != '\0') {
+ val = 0;
+ while (*str != '\0' && *str != '.' && *str != '\n') {
+ val *= 10;
+ val += *str - '0';
+ str++;
+ }
+ l |= val;
+ if (*str != '\0')
+ str++;
+ }
+ }
+ return htonl(l);
+}
+EXPORT_SYMBOL(in_aton);
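Note that in_aton() performs no validation at all: it accumulates up to four decimal groups, shifts missing trailing groups in as zeros and returns the result in network byte order, so callers that need error reporting should use in4_pton() below. A tiny illustrative sketch:

	__be32 a = in_aton("192.168.1.1");	/* a == htonl(0xc0a80101) */
	__be32 b = in_aton("10.1");		/* short input: becomes 10.1.0.0 */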
+
+#define IN6PTON_XDIGIT 0x00010000
+#define IN6PTON_DIGIT 0x00020000
+#define IN6PTON_COLON_MASK 0x00700000
+#define IN6PTON_COLON_1 0x00100000 /* single : requested */
+#define IN6PTON_COLON_2 0x00200000 /* second : requested */
+#define IN6PTON_COLON_1_2 0x00400000 /* :: requested */
+#define IN6PTON_DOT 0x00800000 /* . */
+#define IN6PTON_DELIM 0x10000000
+#define IN6PTON_NULL 0x20000000 /* first/tail */
+#define IN6PTON_UNKNOWN 0x40000000
+
+static inline int xdigit2bin(char c, int delim)
+{
+ int val;
+
+ if (c == delim || c == '\0')
+ return IN6PTON_DELIM;
+ if (c == ':')
+ return IN6PTON_COLON_MASK;
+ if (c == '.')
+ return IN6PTON_DOT;
+
+ val = hex_to_bin(c);
+ if (val >= 0)
+ return val | IN6PTON_XDIGIT | (val < 10 ? IN6PTON_DIGIT : 0);
+
+ if (delim == -1)
+ return IN6PTON_DELIM;
+ return IN6PTON_UNKNOWN;
+}
+
+/**
+ * in4_pton - convert an IPv4 address from literal to binary representation
+ * @src: the start of the IPv4 address string
+ * @srclen: the length of the string, -1 means strlen(src)
+ * @dst: the binary (u8[4] array) representation of the IPv4 address
+ * @delim: the delimiter of the IPv4 address in @src, -1 means no delimiter
+ * @end: A pointer to the end of the parsed string will be placed here
+ *
+ * Returns one on success and zero when any error occurs; in either case,
+ * if @end is non-NULL it will point to the end of the parsed string.
+ *
+ */
+int in4_pton(const char *src, int srclen,
+ u8 *dst,
+ int delim, const char **end)
+{
+ const char *s;
+ u8 *d;
+ u8 dbuf[4];
+ int ret = 0;
+ int i;
+ int w = 0;
+
+ if (srclen < 0)
+ srclen = strlen(src);
+ s = src;
+ d = dbuf;
+ i = 0;
+ while (1) {
+ int c;
+ c = xdigit2bin(srclen > 0 ? *s : '\0', delim);
+ if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK))) {
+ goto out;
+ }
+ if (c & (IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
+ if (w == 0)
+ goto out;
+ *d++ = w & 0xff;
+ w = 0;
+ i++;
+ if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
+ if (i != 4)
+ goto out;
+ break;
+ }
+ goto cont;
+ }
+ w = (w * 10) + c;
+ if ((w & 0xffff) > 255) {
+ goto out;
+ }
+cont:
+ if (i >= 4)
+ goto out;
+ s++;
+ srclen--;
+ }
+ ret = 1;
+ memcpy(dst, dbuf, sizeof(dbuf));
+out:
+ if (end)
+ *end = s;
+ return ret;
+}
+EXPORT_SYMBOL(in4_pton);
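Unlike in_aton(), in4_pton() range-checks every octet and reports where parsing stopped, which lets a caller pull an address out of a larger string. A small hedged usage sketch with an illustrative input:

	u8 ip[4];
	const char *end;

	/* Parse up to the ':' so the caller can go on to parse the port */
	if (in4_pton("192.168.0.2:8080", -1, ip, ':', &end)) {
		/* ip[] == {192, 168, 0, 2}, *end == ':' */
	}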
+
+/**
+ * in6_pton - convert an IPv6 address from literal to binary representation
+ * @src: the start of the IPv6 address string
+ * @srclen: the length of the string, -1 means strlen(src)
+ * @dst: the binary (u8[16] array) representation of the IPv6 address
+ * @delim: the delimiter of the IPv6 address in @src, -1 means no delimiter
+ * @end: A pointer to the end of the parsed string will be placed here
+ *
+ * Returns one on success and zero when any error occurs; in either case,
+ * if @end is non-NULL it will point to the end of the parsed string.
+ *
+ */
+int in6_pton(const char *src, int srclen,
+ u8 *dst,
+ int delim, const char **end)
+{
+ const char *s, *tok = NULL;
+ u8 *d, *dc = NULL;
+ u8 dbuf[16];
+ int ret = 0;
+ int i;
+ int state = IN6PTON_COLON_1_2 | IN6PTON_XDIGIT | IN6PTON_NULL;
+ int w = 0;
+
+ memset(dbuf, 0, sizeof(dbuf));
+
+ s = src;
+ d = dbuf;
+ if (srclen < 0)
+ srclen = strlen(src);
+
+ while (1) {
+ int c;
+
+ c = xdigit2bin(srclen > 0 ? *s : '\0', delim);
+ if (!(c & state))
+ goto out;
+ if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
+ /* process one 16-bit word */
+ if (!(state & IN6PTON_NULL)) {
+ *d++ = (w >> 8) & 0xff;
+ *d++ = w & 0xff;
+ }
+ w = 0;
+ if (c & IN6PTON_DELIM) {
+ /* We've processed last word */
+ break;
+ }
+ /*
+ * COLON_1 => XDIGIT
+ * COLON_2 => XDIGIT|DELIM
+ * COLON_1_2 => COLON_2
+ */
+ switch (state & IN6PTON_COLON_MASK) {
+ case IN6PTON_COLON_2:
+ dc = d;
+ state = IN6PTON_XDIGIT | IN6PTON_DELIM;
+ if (dc - dbuf >= sizeof(dbuf))
+ state |= IN6PTON_NULL;
+ break;
+ case IN6PTON_COLON_1|IN6PTON_COLON_1_2:
+ state = IN6PTON_XDIGIT | IN6PTON_COLON_2;
+ break;
+ case IN6PTON_COLON_1:
+ state = IN6PTON_XDIGIT;
+ break;
+ case IN6PTON_COLON_1_2:
+ state = IN6PTON_COLON_2;
+ break;
+ default:
+ state = 0;
+ }
+ tok = s + 1;
+ goto cont;
+ }
+
+ if (c & IN6PTON_DOT) {
+ ret = in4_pton(tok ? tok : s, srclen + (int)(s - tok), d, delim, &s);
+ if (ret > 0) {
+ d += 4;
+ break;
+ }
+ goto out;
+ }
+
+ w = (w << 4) | (0xff & c);
+ state = IN6PTON_COLON_1 | IN6PTON_DELIM;
+ if (!(w & 0xf000)) {
+ state |= IN6PTON_XDIGIT;
+ }
+ if (!dc && d + 2 < dbuf + sizeof(dbuf)) {
+ state |= IN6PTON_COLON_1_2;
+ state &= ~IN6PTON_DELIM;
+ }
+ if (d + 2 >= dbuf + sizeof(dbuf)) {
+ state &= ~(IN6PTON_COLON_1|IN6PTON_COLON_1_2);
+ }
+cont:
+ if ((dc && d + 4 < dbuf + sizeof(dbuf)) ||
+ d + 4 == dbuf + sizeof(dbuf)) {
+ state |= IN6PTON_DOT;
+ }
+ if (d >= dbuf + sizeof(dbuf)) {
+ state &= ~(IN6PTON_XDIGIT|IN6PTON_COLON_MASK);
+ }
+ s++;
+ srclen--;
+ }
+
+ i = 15; d--;
+
+ if (dc) {
+ while (d >= dc)
+ dst[i--] = *d--;
+ while (i >= dc - dbuf)
+ dst[i--] = 0;
+ while (i >= 0)
+ dst[i--] = *d--;
+ } else
+ memcpy(dst, dbuf, sizeof(dbuf));
+
+ ret = 1;
+out:
+ if (end)
+ *end = s;
+ return ret;
+}
+EXPORT_SYMBOL(in6_pton);
+
+static int inet4_pton(const char *src, u16 port_num,
+ struct sockaddr_storage *addr)
+{
+ struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
+ int srclen = strlen(src);
+
+ if (srclen > INET_ADDRSTRLEN)
+ return -EINVAL;
+
+ if (in4_pton(src, srclen, (u8 *)&addr4->sin_addr.s_addr,
+ '\n', NULL) == 0)
+ return -EINVAL;
+
+ addr4->sin_family = AF_INET;
+ addr4->sin_port = htons(port_num);
+
+ return 0;
+}
+
+static int inet6_pton(struct net *net, const char *src, u16 port_num,
+ struct sockaddr_storage *addr)
+{
+ struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
+ const char *scope_delim;
+ int srclen = strlen(src);
+
+ if (srclen > INET6_ADDRSTRLEN)
+ return -EINVAL;
+
+ if (in6_pton(src, srclen, (u8 *)&addr6->sin6_addr.s6_addr,
+ '%', &scope_delim) == 0)
+ return -EINVAL;
+
+ if (ipv6_addr_type(&addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL &&
+ src + srclen != scope_delim && *scope_delim == '%') {
+ struct net_device *dev;
+ char scope_id[16];
+ size_t scope_len = min_t(size_t, sizeof(scope_id) - 1,
+ src + srclen - scope_delim - 1);
+
+ memcpy(scope_id, scope_delim + 1, scope_len);
+ scope_id[scope_len] = '\0';
+
+ dev = dev_get_by_name(net, scope_id);
+ if (dev) {
+ addr6->sin6_scope_id = dev->ifindex;
+ dev_put(dev);
+ } else if (kstrtouint(scope_id, 0, &addr6->sin6_scope_id)) {
+ return -EINVAL;
+ }
+ }
+
+ addr6->sin6_family = AF_INET6;
+ addr6->sin6_port = htons(port_num);
+
+ return 0;
+}
+
+/**
+ * inet_pton_with_scope - convert an IPv4/IPv6 and port to socket address
+ * @net: net namespace (used for scope handling)
+ * @af: address family, AF_INET, AF_INET6 or AF_UNSPEC for either
+ * @src: the start of the address string
+ * @port: the start of the port string (or NULL for none)
+ * @addr: output socket address
+ *
+ * Return zero on success, return errno when any error occurs.
+ */
+int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af,
+ const char *src, const char *port, struct sockaddr_storage *addr)
+{
+ u16 port_num;
+ int ret = -EINVAL;
+
+ if (port) {
+ if (kstrtou16(port, 0, &port_num))
+ return -EINVAL;
+ } else {
+ port_num = 0;
+ }
+
+ switch (af) {
+ case AF_INET:
+ ret = inet4_pton(src, port_num, addr);
+ break;
+ case AF_INET6:
+ ret = inet6_pton(net, src, port_num, addr);
+ break;
+ case AF_UNSPEC:
+ ret = inet4_pton(src, port_num, addr);
+ if (ret)
+ ret = inet6_pton(net, src, port_num, addr);
+ break;
+ default:
+ pr_err("unexpected address family %d\n", af);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(inet_pton_with_scope);
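inet_pton_with_scope() is the wrapper callers use when the address family is not known up front: with AF_UNSPEC it tries IPv4 first and falls back to IPv6, resolving a trailing "%<scope>" on link-local addresses against @net. A hedged usage sketch; the address and port strings are illustrative, and init_net stands in for whatever namespace a real caller holds:

	struct sockaddr_storage ss;
	int err;

	err = inet_pton_with_scope(&init_net, AF_UNSPEC,
				   "fe80::1%eth0", "4420", &ss);
	if (!err) {
		/* ss.ss_family == AF_INET6, sin6_scope_id resolved via eth0 */
	}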
+
+bool inet_addr_is_any(struct sockaddr *addr)
+{
+ if (addr->sa_family == AF_INET6) {
+ struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;
+ const struct sockaddr_in6 in6_any =
+ { .sin6_addr = IN6ADDR_ANY_INIT };
+
+ if (!memcmp(in6->sin6_addr.s6_addr,
+ in6_any.sin6_addr.s6_addr, 16))
+ return true;
+ } else if (addr->sa_family == AF_INET) {
+ struct sockaddr_in *in = (struct sockaddr_in *)addr;
+
+ if (in->sin_addr.s_addr == htonl(INADDR_ANY))
+ return true;
+ } else {
+ pr_warn("unexpected address family %u\n", addr->sa_family);
+ }
+
+ return false;
+}
+EXPORT_SYMBOL(inet_addr_is_any);
+
+void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
+ __be32 from, __be32 to, bool pseudohdr)
+{
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ csum_replace4(sum, from, to);
+ if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
+ skb->csum = ~csum_add(csum_sub(~(skb->csum),
+ (__force __wsum)from),
+ (__force __wsum)to);
+ } else if (pseudohdr)
+ *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum),
+ (__force __wsum)from),
+ (__force __wsum)to));
+}
+EXPORT_SYMBOL(inet_proto_csum_replace4);
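All of the replace helpers are instances of RFC 1624 incremental update: the checksum field stores the complement of a 16-bit one's-complement sum, so swapping a covered value means adding the complement of the old value and then the new value back into that sum. Below is a small standalone userspace sketch of the same arithmetic, deliberately independent of the kernel csum primitives and ignoring the RFC 1624 plus/minus zero corner case; it is equivalent in spirit to what csum_replace4() does above.

#include <stdint.h>

/* One's-complement 16-bit add with end-around carry. */
static uint16_t csum16_add(uint16_t a, uint16_t b)
{
	uint32_t s = (uint32_t)a + b;

	return (uint16_t)((s & 0xffff) + (s >> 16));
}

/*
 * Recompute a checksum field after a covered 32-bit word changed,
 * e.g. an IPv4 address inside a TCP/UDP pseudo-header.
 */
static uint16_t csum_update32(uint16_t check, uint32_t from, uint32_t to)
{
	uint16_t sum = ~check;				/* undo the final complement */

	sum = csum16_add(sum, ~(uint16_t)(from >> 16));	/* "subtract" old halves */
	sum = csum16_add(sum, ~(uint16_t)from);
	sum = csum16_add(sum, (uint16_t)(to >> 16));	/* add new halves */
	sum = csum16_add(sum, (uint16_t)to);

	return ~sum;
}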
+
+/**
+ * inet_proto_csum_replace16 - update layer 4 header checksum field
+ * @sum: Layer 4 header checksum field
+ * @skb: sk_buff for the packet
+ * @from: old IPv6 address
+ * @to: new IPv6 address
+ * @pseudohdr: True if layer 4 header checksum includes pseudoheader
+ *
+ * Update layer 4 header as per the update in IPv6 src/dst address.
+ *
+ * There is no need to update skb->csum in this function, because update in two
+ * fields a.) IPv6 src/dst address and b.) L4 header checksum cancels each other
+ * for skb->csum calculation. Whereas inet_proto_csum_replace4 function needs to
+ * update skb->csum, because update in 3 fields a.) IPv4 src/dst address,
+ * b.) IPv4 Header checksum and c.) L4 header checksum results in same diff as
+ * L4 Header checksum for skb->csum calculation.
+ */
+void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
+ const __be32 *from, const __be32 *to,
+ bool pseudohdr)
+{
+ __be32 diff[] = {
+ ~from[0], ~from[1], ~from[2], ~from[3],
+ to[0], to[1], to[2], to[3],
+ };
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ *sum = csum_fold(csum_partial(diff, sizeof(diff),
+ ~csum_unfold(*sum)));
+ } else if (pseudohdr)
+ *sum = ~csum_fold(csum_partial(diff, sizeof(diff),
+ csum_unfold(*sum)));
+}
+EXPORT_SYMBOL(inet_proto_csum_replace16);
+
+void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb,
+ __wsum diff, bool pseudohdr)
+{
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum)));
+ if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
+ skb->csum = ~csum_add(diff, ~skb->csum);
+ } else if (pseudohdr) {
+ *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum)));
+ }
+}
+EXPORT_SYMBOL(inet_proto_csum_replace_by_diff);
diff --git a/net/core/xdp.c b/net/core/xdp.c
new file mode 100644
index 000000000..89b6785ce
--- /dev/null
+++ b/net/core/xdp.c
@@ -0,0 +1,400 @@
+/* net/core/xdp.c
+ *
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * Released under terms in GPL version 2. See COPYING.
+ */
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/rhashtable.h>
+#include <net/page_pool.h>
+
+#include <net/xdp.h>
+
+#define REG_STATE_NEW 0x0
+#define REG_STATE_REGISTERED 0x1
+#define REG_STATE_UNREGISTERED 0x2
+#define REG_STATE_UNUSED 0x3
+
+static DEFINE_IDA(mem_id_pool);
+static DEFINE_MUTEX(mem_id_lock);
+#define MEM_ID_MAX 0xFFFE
+#define MEM_ID_MIN 1
+static int mem_id_next = MEM_ID_MIN;
+
+static bool mem_id_init; /* false */
+static struct rhashtable *mem_id_ht;
+
+struct xdp_mem_allocator {
+ struct xdp_mem_info mem;
+ union {
+ void *allocator;
+ struct page_pool *page_pool;
+ struct zero_copy_allocator *zc_alloc;
+ };
+ struct rhash_head node;
+ struct rcu_head rcu;
+};
+
+static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
+{
+ const u32 *k = data;
+ const u32 key = *k;
+
+ BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id)
+ != sizeof(u32));
+
+ /* Use cyclic increasing ID as direct hash key */
+ return key;
+}
+
+static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct xdp_mem_allocator *xa = ptr;
+ u32 mem_id = *(u32 *)arg->key;
+
+ return xa->mem.id != mem_id;
+}
+
+static const struct rhashtable_params mem_id_rht_params = {
+ .nelem_hint = 64,
+ .head_offset = offsetof(struct xdp_mem_allocator, node),
+ .key_offset = offsetof(struct xdp_mem_allocator, mem.id),
+ .key_len = FIELD_SIZEOF(struct xdp_mem_allocator, mem.id),
+ .max_size = MEM_ID_MAX,
+ .min_size = 8,
+ .automatic_shrinking = true,
+ .hashfn = xdp_mem_id_hashfn,
+ .obj_cmpfn = xdp_mem_id_cmp,
+};
+
+static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
+{
+ struct xdp_mem_allocator *xa;
+
+ xa = container_of(rcu, struct xdp_mem_allocator, rcu);
+
+ /* Allow this ID to be reused */
+ ida_simple_remove(&mem_id_pool, xa->mem.id);
+
+	/* Note: the driver is expected to free the *allocator itself,
+	 * e.g. the page_pool, and MUST also defer that free via RCU.
+	 */
+
+ /* Poison memory */
+ xa->mem.id = 0xFFFF;
+ xa->mem.type = 0xF0F0;
+ xa->allocator = (void *)0xDEAD9001;
+
+ kfree(xa);
+}
+
+static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
+{
+ struct xdp_mem_allocator *xa;
+ int id = xdp_rxq->mem.id;
+
+ if (id == 0)
+ return;
+
+ mutex_lock(&mem_id_lock);
+
+ xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
+ if (xa && !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
+ call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+
+ mutex_unlock(&mem_id_lock);
+}
+
+void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
+{
+ /* Simplify driver cleanup code paths, allow unreg "unused" */
+ if (xdp_rxq->reg_state == REG_STATE_UNUSED)
+ return;
+
+ WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG");
+
+ __xdp_rxq_info_unreg_mem_model(xdp_rxq);
+
+ xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
+ xdp_rxq->dev = NULL;
+
+ /* Reset mem info to defaults */
+ xdp_rxq->mem.id = 0;
+ xdp_rxq->mem.type = 0;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
+
+static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq)
+{
+ memset(xdp_rxq, 0, sizeof(*xdp_rxq));
+}
+
+/* Returns 0 on success, negative on failure */
+int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+ struct net_device *dev, u32 queue_index)
+{
+ if (xdp_rxq->reg_state == REG_STATE_UNUSED) {
+ WARN(1, "Driver promised not to register this");
+ return -EINVAL;
+ }
+
+ if (xdp_rxq->reg_state == REG_STATE_REGISTERED) {
+ WARN(1, "Missing unregister, handled but fix driver");
+ xdp_rxq_info_unreg(xdp_rxq);
+ }
+
+ if (!dev) {
+ WARN(1, "Missing net_device from driver");
+ return -ENODEV;
+ }
+
+ /* State either UNREGISTERED or NEW */
+ xdp_rxq_info_init(xdp_rxq);
+ xdp_rxq->dev = dev;
+ xdp_rxq->queue_index = queue_index;
+
+ xdp_rxq->reg_state = REG_STATE_REGISTERED;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_reg);
+
+void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq)
+{
+ xdp_rxq->reg_state = REG_STATE_UNUSED;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_unused);
+
+bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)
+{
+ return (xdp_rxq->reg_state == REG_STATE_REGISTERED);
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg);
+
+static int __mem_id_init_hash_table(void)
+{
+ struct rhashtable *rht;
+ int ret;
+
+ if (unlikely(mem_id_init))
+ return 0;
+
+ rht = kzalloc(sizeof(*rht), GFP_KERNEL);
+ if (!rht)
+ return -ENOMEM;
+
+ ret = rhashtable_init(rht, &mem_id_rht_params);
+ if (ret < 0) {
+ kfree(rht);
+ return ret;
+ }
+ mem_id_ht = rht;
+ smp_mb(); /* mutex lock should provide enough pairing */
+ mem_id_init = true;
+
+ return 0;
+}
+
+/* Allocate a cyclic ID that maps to an allocator pointer.
+ * See: https://www.kernel.org/doc/html/latest/core-api/idr.html
+ *
+ * Caller must lock mem_id_lock.
+ */
+static int __mem_id_cyclic_get(gfp_t gfp)
+{
+ int retries = 1;
+ int id;
+
+again:
+ id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp);
+ if (id < 0) {
+ if (id == -ENOSPC) {
+ /* Cyclic allocator, reset next id */
+ if (retries--) {
+ mem_id_next = MEM_ID_MIN;
+ goto again;
+ }
+ }
+ return id; /* errno */
+ }
+ mem_id_next = id + 1;
+
+ return id;
+}
+
+static bool __is_supported_mem_type(enum xdp_mem_type type)
+{
+ if (type == MEM_TYPE_PAGE_POOL)
+ return is_page_pool_compiled_in();
+
+ if (type >= MEM_TYPE_MAX)
+ return false;
+
+ return true;
+}
+
+int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
+ enum xdp_mem_type type, void *allocator)
+{
+ struct xdp_mem_allocator *xdp_alloc;
+ gfp_t gfp = GFP_KERNEL;
+ int id, errno, ret;
+ void *ptr;
+
+ if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
+ WARN(1, "Missing register, driver bug");
+ return -EFAULT;
+ }
+
+ if (!__is_supported_mem_type(type))
+ return -EOPNOTSUPP;
+
+ xdp_rxq->mem.type = type;
+
+ if (!allocator) {
+ if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY)
+ return -EINVAL; /* Setup time check page_pool req */
+ return 0;
+ }
+
+ /* Delay init of rhashtable to save memory if feature isn't used */
+ if (!mem_id_init) {
+ mutex_lock(&mem_id_lock);
+ ret = __mem_id_init_hash_table();
+ mutex_unlock(&mem_id_lock);
+ if (ret < 0) {
+ WARN_ON(1);
+ return ret;
+ }
+ }
+
+ xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp);
+ if (!xdp_alloc)
+ return -ENOMEM;
+
+ mutex_lock(&mem_id_lock);
+ id = __mem_id_cyclic_get(gfp);
+ if (id < 0) {
+ errno = id;
+ goto err;
+ }
+ xdp_rxq->mem.id = id;
+ xdp_alloc->mem = xdp_rxq->mem;
+ xdp_alloc->allocator = allocator;
+
+ /* Insert allocator into ID lookup table */
+ ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);
+ if (IS_ERR(ptr)) {
+ errno = PTR_ERR(ptr);
+ goto err;
+ }
+
+ mutex_unlock(&mem_id_lock);
+
+ return 0;
+err:
+ mutex_unlock(&mem_id_lock);
+ kfree(xdp_alloc);
+ return errno;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
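The intended driver flow is to register the rxq info when a receive queue comes up, then register the memory model that tells __xdp_return() how frames from that queue are recycled, and to unregister on teardown. A hedged sketch of that sequence for a page_pool backed queue; struct my_rxq and its fields are hypothetical driver state.

/* Sketch only: rxq->xdp_rxq / rxq->page_pool are hypothetical driver state. */
static int my_rxq_setup_xdp(struct net_device *dev, struct my_rxq *rxq)
{
	int err;

	err = xdp_rxq_info_reg(&rxq->xdp_rxq, dev, rxq->index);
	if (err)
		return err;

	err = xdp_rxq_info_reg_mem_model(&rxq->xdp_rxq,
					 MEM_TYPE_PAGE_POOL, rxq->page_pool);
	if (err)
		xdp_rxq_info_unreg(&rxq->xdp_rxq);

	return err;
}

static void my_rxq_teardown_xdp(struct my_rxq *rxq)
{
	xdp_rxq_info_unreg(&rxq->xdp_rxq);	/* also drops the mem model */
}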
+
+/* XDP RX runs under NAPI protection, and in different delivery error
+ * scenarios (e.g. queue full), it is possible to return the xdp_frame
+ * while still leveraging this protection. The @napi_direct boolean
+ * is used for those call sites, allowing faster recycling of
+ * xdp_frames/pages in those cases.
+ */
+static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
+ unsigned long handle)
+{
+ struct xdp_mem_allocator *xa;
+ struct page *page;
+
+ switch (mem->type) {
+ case MEM_TYPE_PAGE_POOL:
+ rcu_read_lock();
+ /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
+ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+ page = virt_to_head_page(data);
+ if (xa) {
+ napi_direct &= !xdp_return_frame_no_direct();
+ page_pool_put_page(xa->page_pool, page, napi_direct);
+ } else {
+ put_page(page);
+ }
+ rcu_read_unlock();
+ break;
+ case MEM_TYPE_PAGE_SHARED:
+ page_frag_free(data);
+ break;
+ case MEM_TYPE_PAGE_ORDER0:
+		page = virt_to_page(data); /* Assumes order-0 page */
+ put_page(page);
+ break;
+ case MEM_TYPE_ZERO_COPY:
+ /* NB! Only valid from an xdp_buff! */
+ rcu_read_lock();
+ /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
+ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+ xa->zc_alloc->free(xa->zc_alloc, handle);
+		rcu_read_unlock();
+		break;
+ default:
+ /* Not possible, checked in xdp_rxq_info_reg_mem_model() */
+ break;
+ }
+}
+
+void xdp_return_frame(struct xdp_frame *xdpf)
+{
+ __xdp_return(xdpf->data, &xdpf->mem, false, 0);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame);
+
+void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
+{
+ __xdp_return(xdpf->data, &xdpf->mem, true, 0);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
+
+void xdp_return_buff(struct xdp_buff *xdp)
+{
+ __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle);
+}
+EXPORT_SYMBOL_GPL(xdp_return_buff);
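The three return variants differ only in where the memory reference comes from (xdp_frame vs. xdp_buff) and in the @napi_direct hint passed to __xdp_return(). As a rule of thumb (a sketch, not a rule from this file): recycle with xdp_return_frame_rx_napi() only while still running under the RX NAPI protection described above, and with plain xdp_return_frame() from any other context, e.g. TX completion of a redirected frame.

/* Illustrative only: xdpf was produced earlier, e.g. by convert_to_xdp_frame(). */
static void my_frame_done(struct xdp_frame *xdpf, bool in_rx_napi)
{
	if (in_rx_napi)
		xdp_return_frame_rx_napi(xdpf);	/* may recycle directly */
	else
		xdp_return_frame(xdpf);
}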
+
+int xdp_attachment_query(struct xdp_attachment_info *info,
+ struct netdev_bpf *bpf)
+{
+ bpf->prog_id = info->prog ? info->prog->aux->id : 0;
+ bpf->prog_flags = info->prog ? info->flags : 0;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xdp_attachment_query);
+
+bool xdp_attachment_flags_ok(struct xdp_attachment_info *info,
+ struct netdev_bpf *bpf)
+{
+ if (info->prog && (bpf->flags ^ info->flags) & XDP_FLAGS_MODES) {
+ NL_SET_ERR_MSG(bpf->extack,
+ "program loaded with different flags");
+ return false;
+ }
+ return true;
+}
+EXPORT_SYMBOL_GPL(xdp_attachment_flags_ok);
+
+void xdp_attachment_setup(struct xdp_attachment_info *info,
+ struct netdev_bpf *bpf)
+{
+ if (info->prog)
+ bpf_prog_put(info->prog);
+ info->prog = bpf->prog;
+ info->flags = bpf->flags;
+}
+EXPORT_SYMBOL_GPL(xdp_attachment_setup);
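Taken together, the xdp_attachment_* helpers factor out the bookkeeping a driver's ndo_bpf handler needs: refuse a replacement program loaded with different attach flags, record the new program while dropping the reference on the old one, and answer queries. A hedged sketch of such a handler; struct my_priv and its xdp field (a struct xdp_attachment_info) as well as the datapath update step are hypothetical, and XDP_QUERY_PROG is the netdev_bpf command of this kernel generation.

static int my_ndo_bpf(struct net_device *dev, struct netdev_bpf *bpf)
{
	struct my_priv *priv = netdev_priv(dev);	/* hypothetical */

	switch (bpf->command) {
	case XDP_SETUP_PROG:
		if (!xdp_attachment_flags_ok(&priv->xdp, bpf))
			return -EBUSY;
		/* swap the new program into the datapath here, then record it */
		xdp_attachment_setup(&priv->xdp, bpf);
		return 0;
	case XDP_QUERY_PROG:
		return xdp_attachment_query(&priv->xdp, bpf);
	default:
		return -EINVAL;
	}
}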